From 2c8b14c61832ed78c52bb623fd15fe50af03f42f Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Thu, 27 Feb 2020 08:53:48 +0100 Subject: [PATCH 01/21] inital commit for transitive closure clustering --- src/basic/config.cpp | 6 +- src/basic/config.h | 5 +- src/run/cluster.cpp | 129 +++++++++++++++++--- src/run/disjoint_set.h | 216 +++++++++++++++++++++++++++++++++ src/util/command_line_parser.h | 2 +- 5 files changed, 340 insertions(+), 18 deletions(-) create mode 100644 src/run/disjoint_set.h diff --git a/src/basic/config.cpp b/src/basic/config.cpp index f93ffa44..f3c93a0f 100644 --- a/src/basic/config.cpp +++ b/src/basic/config.cpp @@ -143,6 +143,10 @@ Config::Config(int argc, const char **argv, bool check_io) makedb.add() ("in", 0, "input reference file in FASTA format", input_ref_file); + Options_group cluster("Cluster options"); + cluster.add() + ("calgo", 0, "Clustering algorithm (0=multi-step/1=transitive-closure)", cluster_algo, -1); + Options_group aligner("Aligner options"); aligner.add() ("query", 'q', "input query file", query_file) @@ -306,7 +310,7 @@ Config::Config(int argc, const char **argv, bool check_io) ("cut-bar", 0, "", cut_bar) ("bootstrap", 0, "", bootstrap); - parser.add(general).add(makedb).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options); + parser.add(general).add(makedb).add(cluster).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options); parser.store(argc, argv, command); if (long_reads) { diff --git a/src/basic/config.h b/src/basic/config.h index a4f162eb..73cc9c8e 100644 --- a/src/basic/config.h +++ b/src/basic/config.h @@ -206,6 +206,9 @@ struct Config enum { double_indexed = 0, query_indexed = 1, subject_indexed = 2 }; int algo; + enum { multi_step = 0, transitive_closure = 1}; + int cluster_algo; + enum { query_parallel = 0, target_parallel = 1 }; unsigned load_balancing; @@ -275,4 +278,4 @@ struct Config extern Config config; -#endif \ No newline at end of file +#endif diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp index 0943990e..7f1f4f13 100644 --- a/src/run/cluster.cpp +++ b/src/run/cluster.cpp @@ -28,6 +28,7 @@ along with this program. If not, see . #include "../basic/config.h" #include "../data/reference.h" #include "workflow.h" +#include "disjoint_set.h" #include "../util/io/consumer.h" #include "../util/algo/algo.h" #include "../basic/statistics.h" @@ -60,6 +61,32 @@ struct Neighbors : public vector>, public Consumer { vector edges; }; +class NeighborStream : public Consumer { + LazyDisjointSet* disjointSet; + virtual void consume(const char *ptr, size_t n) override { + size_t query, subject, count; + float qcov, scov, bitscore; + const char *end = ptr + n; + while (ptr < end) { + //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) + if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) + throw runtime_error("Cluster format error."); + disjointSet->merge(query, subject); + ptr += count; + } + } +public: + NeighborStream(size_t n){ + disjointSet = new LazyDisjointIntegralSet(n); + } + ~NeighborStream(){ + delete disjointSet; + } + vector> getListOfSets(){ + return disjointSet->getListOfSets(); + } +}; + vector rep_bitset(const vector ¢roid, const vector *superset = nullptr) { vector r(centroid.size()); for (int c : centroid) @@ -93,22 +120,18 @@ vector cluster(DatabaseFile &db, const vector *filter) { return Util::Algo::greedy_vortex_cover(nb); } -void run() { - if (config.database == "") - throw runtime_error("Missing parameter: database file (--db/-d)"); - config.command = Config::makedb; - unique_ptr db(DatabaseFile::auto_create_from_fasta()); - const size_t seq_count = db->ref_header.sequences; +void run_two_step_clustering(DatabaseFile& db){ + const size_t seq_count = db.ref_header.sequences; config.min_id = 70; - vector centroid1(cluster(*db, nullptr)); + vector centroid1(cluster(db, nullptr)); vector rep1(rep_bitset(centroid1)); size_t n_rep1 = count_if(rep1.begin(), rep1.end(), [](bool x) { return x; }); message_stream << "Clustering step 1 complete. #Input sequences: " << centroid1.size() << " #Clusters: " << n_rep1 << endl; config.mode_more_sensitive = true; config.min_id = 0; - vector centroid2(cluster(*db, &rep1)); + vector centroid2(cluster(db, &rep1)); vector rep2(rep_bitset(centroid2, &rep1)); for (size_t i = 0; i < centroid2.size(); ++i) if (!rep1[i]) @@ -119,21 +142,21 @@ void run() { Sequence_set *rep_seqs; String_set<0> *rep_ids; vector rep_database_id, rep_block_id(seq_count); - db->rewind(); - db->load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2); + db.rewind(); + db.load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2); for (size_t i = 0; i < rep_database_id.size(); ++i) rep_block_id[rep_database_id[i]] = (unsigned)i; ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str()); vector seq; string id; - db->seek_direct(); + db.seek_direct(); Hsp hsp; size_t n; out->precision(3); - for (int i = 0; i < (int)db->ref_header.sequences; ++i) { - db->read_seq(id, seq); + for (int i = 0; i < (int)db.ref_header.sequences; ++i) { + db.read_seq(id, seq); const unsigned r = rep_block_id[centroid2[i]]; (*out) << blast_id(id) << '\t' @@ -151,10 +174,86 @@ void run() { } } - db->close(); if (out != &cout) delete out; delete rep_seqs; delete rep_ids; } -}} \ No newline at end of file + +void run_transitive_closure_clustering(DatabaseFile& db){ + // testing + // LazyDisjointIntegralSet test(5); + // test.merge(1,3); + // test.merge(3,4); + // test.merge(0,2); + // auto los = test.getListOfSets(); + // for(auto& set: los){ + // for(auto item:set){ + // cout<< " " << item; + // } + // cout< s ( {"red","green","blue","rain", "sun", "snow"} ); + // LazyDisjointTypeSet ts(&s); + + // ts.merge("red", "green"); + // ts.merge("snow", "rain"); + // ts.merge("sun", "rain"); + // auto lo = ts.getListOfSets(); + // for(auto& set: lo){ + // cout<< "set :"; + // for(auto item:set){ + // cout<< " " << item; + // } + // cout<::max(); + + Workflow::Search::Options opt; + opt.db = &db; + opt.self = true; + NeighborStream ns(db.ref_header.sequences); + opt.consumer = &ns; + opt.db_filter = nullptr; + + Workflow::Search::run(opt); + + auto clusterResult = ns.getListOfSets(); + cout << "Found "< db(DatabaseFile::auto_create_from_fasta()); + switch(config.cluster_algo){ + case Config::multi_step: + run_two_step_clustering(*db); + break; + case Config::transitive_closure: + run_transitive_closure_clustering(*db); + break; + } + db->close(); +} +}} diff --git a/src/run/disjoint_set.h b/src/run/disjoint_set.h new file mode 100644 index 00000000..4b7acc7b --- /dev/null +++ b/src/run/disjoint_set.h @@ -0,0 +1,216 @@ +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +template class Node { + Node* parent = this; + uint32_t r = 0; +public: + virtual ~Node(){}; + virtual Node* getParent(){ + return parent; + } + virtual void setParent(Node* p){ + parent = p; + } + + virtual const T* getValue() = 0; + + virtual uint32_t getCount(){ + return r; + } + virtual void incrementCount(){ + r++; + } +}; + +template class IntegralNode: public Node { +private: + T value; +public: + virtual ~IntegralNode(){}; + IntegralNode(T v){ + value = v; + } + virtual const T* getValue(){ + return &value; + } +}; + +template class TypeNode: public Node { +private: + const T* value; + uint32_t r; +public: + virtual ~TypeNode(){}; + TypeNode(const T* v){ + value = v; + } + virtual const T* getValue(){ + return value; + } +}; + +template class LazyDisjointSet { +private: + virtual Node* get(const T i) = 0; + virtual Node* get(const T* i) = 0; + virtual uint64_t size() = 0; + virtual vector*>* getNodes() = 0; +public: + virtual ~LazyDisjointSet(){}; + + virtual Node* getRoot(Node* x) { + if (x->getParent() != x) { + // flatten the tree while traversing + x->setParent(getRoot(x->getParent())); + } + return x->getParent(); + } + + virtual void merge(Node* x, Node* y) { + if (x != y) { + Node* rootX = getRoot(x); + Node* rootY = getRoot(y); + if (rootX != rootY) { + if (rootX->getCount() < rootY->getCount()) { + rootX->setParent(rootY); + } + else if (rootX->getCount() > rootY->getCount()) { + rootY->setParent(rootX); + } + else { + rootX->incrementCount(); + rootY->setParent(rootX); + } + } + } + } + + virtual void merge(const T* x, const T* y) { + merge(get(x), get(y)); + } + + void merge(T x, T y) { + merge(get(x), get(y)); + } + + virtual vector> getListOfSets() { + unordered_map*> map; + for (Node* n : *(getNodes())) { + if (n == nullptr) { + continue; + } + const T* r = getRoot(n)->getValue(); + auto it = map.find(r); + T v = *(n->getValue()); + if (it == map.end()) { + it = map.emplace(r, new unordered_set()).first; + } + it->second->emplace(v); + } + vector> listOfSets; + for(auto el=map.begin(); el != map.end(); el++){ + listOfSets.emplace_back(move(*(el->second))); + } + return listOfSets; + } +}; + +template class LazyDisjointIntegralSet : public LazyDisjointSet { +private: + vector*> nodes; + + virtual Node* get(T i){ + assert(i >= 0 && nodes.size() > i); + if(nodes[i] == nullptr){ + nodes[i] = new IntegralNode(i); + } + return nodes[i]; + } + + virtual Node* get(const T* i) { + return get(*i); + } + + virtual uint64_t size(){ + return nodes.size(); + } + virtual vector*>* getNodes(){ + return &nodes; + } + +public: + static_assert(std::is_integral::value, "T needs to be an integral type"); + + LazyDisjointIntegralSet(T size) { + nodes = vector*>(size, nullptr); + } + + virtual ~LazyDisjointIntegralSet() { + for(Node* n : nodes){ + if (n != nullptr){ + delete n; + n = nullptr; + } + } + nodes.clear(); + } +}; + +template class LazyDisjointTypeSet : public LazyDisjointSet { +private: + unordered_set* elements; + vector*> nodes; + unordered_map mapping; + bool isIntegralAndConsecutive; + + virtual Node* get(const T* i) { + auto found = mapping.find(i); + assert(found != mapping.end()); + return nodes[found->second]; + } + + virtual Node* get(T i) { + auto found = elements->find(i); + assert(found != elements->end()); + return get(&(*found)); + } + + virtual uint64_t size(){ + return nodes.size(); + } + virtual vector*>* getNodes(){ + return &nodes; + } + +public: + LazyDisjointTypeSet(unordered_set* s) { + // note that the structure is backed by the set, so it should not be modified + isIntegralAndConsecutive = false; + elements = s; + nodes = vector*>(elements->size()); + mapping = unordered_map(elements->size()); + for(auto e = elements->begin(); e != elements->end(); e++){ + mapping[&(*e)] = nodes.size(); + Node* n = new TypeNode(&(*e)); + nodes.push_back(n); + } + } + virtual ~LazyDisjointTypeSet() { + mapping.clear(); + for(Node* n : nodes){ + if (n != nullptr){ + delete n; + n = nullptr; + } + } + nodes.clear(); + } +}; diff --git a/src/util/command_line_parser.h b/src/util/command_line_parser.h index 1b33eee3..c08231b8 100644 --- a/src/util/command_line_parser.h +++ b/src/util/command_line_parser.h @@ -166,4 +166,4 @@ struct Command_line_parser std::vector > commands_; }; -#endif \ No newline at end of file +#endif From d59f33449fd056333dd5c4af9af53702f8c8fc9b Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sat, 11 Apr 2020 08:45:46 +0200 Subject: [PATCH 02/21] implemented markov clustering with sparse matrices --- src/run/cluster.cpp | 235 ++++++++++++++++++- src/run/sparse_matrix.h | 498 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 730 insertions(+), 3 deletions(-) create mode 100644 src/run/sparse_matrix.h diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp index 610b25f7..f3ab09c7 100644 --- a/src/run/cluster.cpp +++ b/src/run/cluster.cpp @@ -29,6 +29,7 @@ along with this program. If not, see . #include "../data/reference.h" #include "workflow.h" #include "disjoint_set.h" +#include "sparse_matrix.h" #include "../util/io/consumer.h" #include "../util/algo/algo.h" #include "../basic/statistics.h" @@ -87,6 +88,48 @@ class NeighborStream : public Consumer { } }; +template +class SparseMatrixStream : public Consumer { + SparseSimpleMatrix* m = nullptr; + LazyDisjointSet* disjointSet; + virtual void consume(const char *ptr, size_t n) override { + size_t query, subject, count; + float qcov, scov, bitscore; + const char *end = ptr + n; + while (ptr < end) { + //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) + if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) + throw runtime_error("Cluster format error."); + m->set_elm(query, subject, bitscore); + disjointSet->merge(query, subject); + ptr += count; + } + } +public: + SparseMatrixStream(size_t n, bool symmetric){ + disjointSet = new LazyDisjointIntegralSet(n); + if( symmetric ){ + m = new SparseSymmetricSimpleMatrix(n); + } + else{ + m = new SparseSimpleMatrixImpl(n, n); + } + } + ~SparseMatrixStream(){ + if(m) delete m; + delete disjointSet; + } + SparseSimpleMatrix* getMatrix(){ + cout << " getting matrix " << m->get_nrows() << " " << m->get_ncols() << " " << m->get_n_nonzero_elements() << " sparsity "<< (1.0 * m->get_n_nonzero_elements()) / (m->get_nrows()* m->get_ncols()) << endl; + SparseSimpleMatrix* h = m; + m = nullptr; + return h; + } + vector> getListOfSets(){ + return disjointSet->getListOfSets(); + } +}; + vector rep_bitset(const vector ¢roid, const vector *superset = nullptr) { vector r(centroid.size()); for (int c : centroid) @@ -116,7 +159,7 @@ vector cluster(DatabaseFile &db, const vector *filter) { opt.db_filter = filter; Workflow::Search::run(opt); - + return Util::Algo::greedy_vortex_cover(nb); } @@ -158,10 +201,8 @@ void run_two_step_clustering(DatabaseFile& db){ for (int i = 0; i < (int)db.ref_header.sequences; ++i) { db.read_seq(id, seq); const unsigned r = rep_block_id[centroid2[i]]; - (*out) << blast_id(id) << '\t' << blast_id((*rep_ids)[r]) << '\t'; - if ((int)i == centroid2[i]) (*out) << "100\t100\t100\t0" << endl; else { @@ -241,6 +282,191 @@ void run_transitive_closure_clustering(DatabaseFile& db){ // } } +SparseSimpleMatrix* get_gamma(SparseSimpleMatrix* m, uint32_t r){ + SparseSimpleMatrix* gamma = new SparseSimpleMatrixImpl(m->get_nrows(), m->get_ncols()); + float* column_sum = new float[m->get_ncols()]; + for(uint32_t icol = 0; icol< m->get_ncols(); icol++){ + column_sum[icol] = 0.0f; + } + for(auto it = m->begin(); it != m->end(); it++){ + uint32_t i,j; + m->get_indices(it->first, &i, &j); + float el = pow(it->second, r); + gamma->set_elm(i,j,el); + column_sum[j] += el; + } + for(auto it = gamma->begin(); it != gamma->end(); it++){ + uint32_t i,j; + gamma->get_indices(it->first, &i, &j); + gamma->set_elm(i,j, it->second/column_sum[j]); + } + return gamma; +} +SimpleMatrix* get_gamma(SimpleMatrix* m, uint32_t r){ + SimpleMatrix* gamma = new SparseSimpleMatrixImpl(m->get_nrows(), m->get_ncols()); + float* column_sum = new float[m->get_ncols()]; + for(uint32_t icol = 0; icol< m->get_ncols(); icol++){ + column_sum[icol] = 0.0f; + } + for(uint32_t irow = 0; irow < m->get_nrows(); irow++){ + for(uint32_t icol = 0; icol < m->get_ncols(); icol++){ + column_sum[icol]+=pow(m->get_elm(irow, icol), r); + } + } + for(uint32_t irow = 0; irow < m->get_nrows(); irow++){ + for(uint32_t icol = 0; icol < m->get_ncols(); icol++){ + gamma->set_elm(irow, icol, pow(m->get_elm(irow, icol),r) / column_sum[icol]); + } + } + return gamma; +} + +SimpleMatrix* get_exp(SimpleMatrix* m, uint32_t r){ + // TODO: make this more efficient + SimpleMatrix* tmp1 = m->multiply(m); + for(uint32_t i = 2; i* tmp2 = tmp1->multiply(tmp1); + delete tmp1; + tmp1 = tmp2; + } + return tmp1; +} +vector> get_list(SimpleMatrix* m){ + LazyDisjointIntegralSet disjointSet(m->get_ncols()); + for(uint32_t irow = 0; irow < m->get_nrows(); irow++){ + for(uint32_t icol = 0; icol < m->get_ncols(); icol++){ + if(abs(m->get_elm(irow, icol)) > 1e-13){ + disjointSet.merge(irow, icol); + } + } + } + return disjointSet.getListOfSets(); +} +vector> get_list(SparseSimpleMatrix* m){ + LazyDisjointIntegralSet disjointSet(m->get_ncols()); + for(auto it = m->begin(); it != m->end(); it++){ + uint32_t irow, icol; + m->get_indices(it->first, &irow, &icol); + disjointSet.merge(irow, icol); + } + return disjointSet.getListOfSets(); +} + + +vector> markov_process(SimpleMatrix* m){ + uint32_t iteration = 0; + double diff_norm = std::numeric_limits::max(); + while( iteration < 100 && diff_norm > 1e-3 ){ + SimpleMatrix* msquared = get_exp(m, 2); + SimpleMatrix* m_update = get_gamma(msquared, 2); + delete msquared; + SimpleMatrix* diff_m = m_update->minus(m); + diff_norm = diff_m->norm(2.0,2.0); + delete m; + delete diff_m; + m = m_update; + m_update = nullptr; + iteration++; + } + cout<< "Markov Process converged after " << iteration <<" iterations " << diff_norm << endl; + return get_list(m); +} + +void run_markov_clustering(DatabaseFile& db){ + // testing + // SparseSimpleMatrix* tmp = new SparseSimpleMatrixImpl(12, 12); + // tmp->set_elm(0, 0, 0.2f); + // tmp->set_elm(0, 1, 0.25f); + // tmp->set_elm(0, 5, 0.333f); + // tmp->set_elm(0, 6, 0.25f); + // tmp->set_elm(0, 9, 0.25f); + // tmp->set_elm(1, 0, 0.20f); + // tmp->set_elm(1, 1, 0.25f); + // tmp->set_elm(1, 2, 0.25f); + // tmp->set_elm(1, 4, 0.20f); + // tmp->set_elm(2, 1, 0.25f); + // tmp->set_elm(2, 2, 0.25f); + // tmp->set_elm(2, 3, 0.20f); + // tmp->set_elm(2, 4, 0.20f); + // tmp->set_elm(3, 2, 0.25f); + // tmp->set_elm(3, 3, 0.20f); + // tmp->set_elm(3, 7, 0.20f); + // tmp->set_elm(3, 8, 0.20f); + // tmp->set_elm(3, 10, 0.20f); + // tmp->set_elm(4, 1, 0.25f); + // tmp->set_elm(4, 2, 0.25f); + // tmp->set_elm(4, 4, 0.20f); + // tmp->set_elm(4, 6, 0.25f); + // tmp->set_elm(4, 7, 0.20f); + // tmp->set_elm(5, 0, 0.20f); + // tmp->set_elm(5, 5, 0.333f); + // tmp->set_elm(5, 9, 0.25f); + // tmp->set_elm(6, 0, 0.20f); + // tmp->set_elm(6, 4, 0.20f); + // tmp->set_elm(6, 6, 0.25f); + // tmp->set_elm(6, 9, 0.25f); + // tmp->set_elm(7, 3, 0.20f); + // tmp->set_elm(7, 4, 0.20f); + // tmp->set_elm(7, 7, 0.20f); + // tmp->set_elm(7, 8, 0.20f); + // tmp->set_elm(7, 10, 0.20f); + // tmp->set_elm(8, 3, 0.20f); + // tmp->set_elm(8, 7, 0.20f); + // tmp->set_elm(8, 8, 0.20f); + // tmp->set_elm(8, 10, 0.20f); + // tmp->set_elm(8, 11, 0.333f); + // tmp->set_elm(9, 0, 0.20f); + // tmp->set_elm(9, 5, 0.333f); + // tmp->set_elm(9, 6, 0.25f); + // tmp->set_elm(9, 9, 0.25f); + // tmp->set_elm(10, 3, 0.20f); + // tmp->set_elm(10, 7, 0.20f); + // tmp->set_elm(10, 8, 0.20f); + // tmp->set_elm(10, 10, 0.20f); + // tmp->set_elm(10, 11, 0.333f); + // tmp->set_elm(11, 8, 0.20f); + // tmp->set_elm(11, 10, 0.20f); + // tmp->set_elm(11, 11, 0.333f); + // auto cr = markov_process(tmp); + // cout << "Found "<::max(); + + Workflow::Search::Options opt; + opt.db = &db; + opt.self = true; + SparseMatrixStream ms(db.ref_header.sequences, false); + opt.consumer = &ms; + opt.db_filter = nullptr; + Workflow::Search::run(opt); + + SparseSimpleMatrix* m = ms.getMatrix(); + + auto clusterResult = markov_process(m); + cout << "Found "<close(); } diff --git a/src/run/sparse_matrix.h b/src/run/sparse_matrix.h new file mode 100644 index 00000000..d9502996 --- /dev/null +++ b/src/run/sparse_matrix.h @@ -0,0 +1,498 @@ +#pragma once +#include +#include + +template +class SimpleMatrix { +public: + virtual T get_max_elm() = 0; + virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx) = 0; + virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element) = 0; + virtual T& at(uint32_t i_idx, uint32_t j_idx) = 0; + virtual T& at(uint64_t c_idx) = 0; + virtual inline uint64_t get_nrows() = 0; + virtual inline uint64_t get_ncols() = 0; + virtual SimpleMatrix* get_new_instance(int nrows, int ncols) = 0; + virtual SimpleMatrix* get_new_instance() = 0; + virtual void print(){ + for( uint32_t irow=0; irow < get_nrows(); irow++){ + for( uint32_t icol=0; icol < get_ncols(); icol++){ + printf(" %.6f",get_elm(irow,icol)); + } + printf("\n"); + } + } + + virtual inline uint64_t get_idx(uint32_t i_idx, uint32_t j_idx){ + assert(i_idx < get_nrows()); + assert(j_idx < get_ncols()); + return i_idx * get_ncols() + j_idx; + } + + virtual inline void get_indices (uint64_t combined, uint32_t* i, uint32_t* j){ + *j = combined % get_ncols(); + *i = ( combined - *j) / get_ncols(); + } + + virtual SimpleMatrix* minus(SimpleMatrix* m) = 0; + virtual SimpleMatrix* base_minus(SimpleMatrix* m){ + assert(this->get_nrows() == m->get_nrows()); + assert(this->get_ncols() == m->get_ncols()); + SimpleMatrix* result = this->get_new_instance(this->get_nrows(),m->get_ncols()); + for(uint32_t irow = 0; irowget_nrows(); irow++){ + for(uint32_t icol = 0; icolget_ncols(); icol++){ + result->set_elm(irow, icol, this->get_elm(irow, icol) - m->get_elm(irow,icol)); + } + } + return result; + } + virtual SimpleMatrix* plus(SimpleMatrix* m) = 0; + virtual SimpleMatrix* base_plus(SimpleMatrix* m){ + assert(this->get_nrows() == m->get_nrows()); + assert(this->get_ncols() == m->get_ncols()); + SimpleMatrix* result = this->get_new_instance(this->get_nrows(),m->get_ncols()); + for(uint32_t irow = 0; irowget_nrows(); irow++){ + for(uint32_t icol = 0; icolget_ncols(); icol++){ + result->set_elm(irow, icol, this->get_elm(irow, icol) + m->get_elm(irow, icol)); + } + } + return result; + } + virtual SimpleMatrix* multiply(SimpleMatrix* m) = 0; + virtual SimpleMatrix* base_multiply(SimpleMatrix* m){ + printf("standard mult\n"); + SimpleMatrix* result = this->get_new_instance(this->get_nrows(),m->get_ncols()); + for(uint32_t irow = 0; irowget_nrows(); irow++){ + for(uint32_t icol = 0; icolget_ncols(); icol++){ + result->set_elm(irow, icol, 0); + } + } + for(uint32_t irow = 0; irowget_nrows(); irow++){ + for(uint32_t icol = 0; icolget_ncols(); icol++){ + for(uint32_t k = 0; kget_nrows(); k++){ + result->set_elm(irow, icol, result->get_elm(irow, icol) + this->get_elm(irow, k) * m->get_elm(k,icol)); + } + } + } + return result; + } + virtual double norm(double p, double q){ + double result = 0.0; + double exp1 = q/p; + for(uint32_t irow = 0; irowget_nrows(); irow++){ + double colnorm = 0.0; + for(uint32_t icol = 0; icolget_ncols(); icol++){ + colnorm += std::pow(std::abs(this->get_elm(irow, icol)), p); + } + result += std::pow(colnorm, exp1); + } + return std::pow(result, 1.0/q); + } + + virtual ~SimpleMatrix() {}; +}; + +template +class SymmetricSimpleMatrix : virtual public SimpleMatrix { +public: + static inline uint64_t get_lower_index (uint32_t i_idx, uint32_t j_idx){ + uint32_t min = std::min(i_idx,j_idx); + uint32_t max = std::max(i_idx,j_idx); + return max*((uint64_t) max+1)/2 + min; + } + static inline void get_symmetric_index (uint64_t combined, uint32_t* max, uint32_t* min){ + // solving max^2-max-2*combined = 0. -1 to convert to indexing varaible in C, avoiding overflow + // by removing an additional factor of 2 from under the square root + *max = (uint32_t) std::floor(0.5+2*sqrt(0.0625+0.5*combined)) - 1; + *min = combined - (*max*((uint64_t)*max+1))/2; + } +}; + +template +class SparseSimpleMatrix : virtual public SimpleMatrix { +public: + virtual uint64_t get_n_nonzero_elements() = 0; + virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx) = 0; + virtual typename std::unordered_map::iterator begin() = 0; + virtual typename std::unordered_map::iterator end() = 0; +}; + +template +class SparseSimpleMatrixImpl : public SparseSimpleMatrix { +private: + uint32_t nrows, ncols; + T tol; + std::unordered_map mat; + +public: + SparseSimpleMatrixImpl(int nrows, int ncols) : nrows(nrows), ncols(ncols), tol((T) 1e-12) {}; + SparseSimpleMatrixImpl(int nrows, int ncols, T tol) : nrows(nrows), ncols(ncols), tol(std::abs(tol)) {}; + + virtual SimpleMatrix* get_new_instance(int nrows, int ncols){ + return new SparseSimpleMatrixImpl(nrows, ncols); + } + virtual SimpleMatrix* get_new_instance(){ + return new SparseSimpleMatrixImpl(nrows, ncols); + } + + virtual ~SparseSimpleMatrixImpl(){ mat.clear(); }; + + virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){ + assert(i_idx < nrows); + assert(j_idx < ncols); + if(std::abs(element) > tol) { + mat[SimpleMatrix::get_idx(i_idx,j_idx)] = element; + } + } + virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){ + assert(i_idx < nrows); + assert(j_idx < ncols); + auto it = mat.find(SimpleMatrix::get_idx(i_idx,j_idx)); + return it != mat.end() ? it-> second : (T) 0.0; + } + virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx){ + assert(i_idx < nrows); + assert(j_idx < ncols); + auto it = mat.find(SimpleMatrix::get_idx(i_idx,j_idx)); + return it != mat.end(); + } + virtual inline uint64_t get_nrows(){ + return nrows; + }; + virtual inline uint64_t get_ncols(){ + return ncols; + }; + virtual inline uint64_t get_n_nonzero_elements(){ + return mat.size(); + }; + virtual typename std::unordered_map::iterator begin(){ + return mat.begin(); + }; + virtual typename std::unordered_map::iterator end(){ + return mat.end(); + }; + virtual T& at(uint32_t i_idx, uint32_t j_idx){ + return at(SimpleMatrix::get_idx(i_idx,j_idx)); + }; + virtual T& at(uint64_t c_idx){ + assert(c_idx < nrows*ncols); + return mat[c_idx]; + }; + + void purge(){ + auto it = mat.begin(); + while(it!=mat.end()){ + if(std::abs(it->second) < tol){ + it = mat.erase(it); + } + else{ + it++; + } + } + } + virtual T get_max_elm(){ + T m = 0.0; + for(auto& it : mat){ + m = std::max(m, it.second); + } + return m; + }; + virtual SimpleMatrix* minus(SimpleMatrix* m){ + if(SparseSimpleMatrix* n = dynamic_cast*> (m)){ + // TODO: Imporove this implementation + SparseSimpleMatrix* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols()); + + std::vector> k_to_j(n->get_nrows()); + for(auto it_m = n->begin(); it_m != n->end(); it_m++){ + uint32_t irow,icol; + n->get_indices(it_m->first, &irow, &icol); + result->set_elm(irow, icol, this->get_elm(irow,icol) - it_m->second); + } + for(auto it_this = this->begin(); it_this != this->end(); it_this++){ + uint32_t irow,icol; + this->get_indices(it_this->first, &irow, &icol); + if(!n->has_elm(irow,icol)) result->set_elm(irow, icol, it_this->second); + } + return result; + } + return this->base_minus(m); + } + virtual SimpleMatrix* plus(SimpleMatrix* m){ + if(SparseSimpleMatrix* n = dynamic_cast*> (m)){ + // TODO: Imporove this implementation + SparseSimpleMatrix* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols()); + + std::vector> k_to_j(n->get_nrows()); + for(auto it_m = n->begin(); it_m != n->end(); it_m++){ + uint32_t irow,icol; + n->get_indices(it_m->first, &irow, &icol); + result->set_elm(irow, icol, this->get_elm(irow,icol) + it_m->second); + } + for(auto it_this = this->begin(); it_this != this->end(); it_this++){ + uint32_t irow,icol; + this->get_indices(it_this->first, &irow, &icol); + if(!n->has_elm(irow,icol)) result->set_elm(irow, icol, it_this->second); + } + return result; + } + return this->base_plus(m); + } + virtual SimpleMatrix* multiply(SimpleMatrix* m){ + if(SparseSimpleMatrix* n = dynamic_cast*> (m)){ + SparseSimpleMatrix* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols()); + std::vector> k_to_j(n->get_nrows()); + for(uint32_t k = 0; kget_nrows(); k++){ + k_to_j[k] = std::vector(); + } + for(auto it_m = n->begin(); it_m != n->end(); it_m++){ + uint32_t k2,icol; + n->get_indices(it_m->first, &k2, &icol); + k_to_j[k2].push_back(icol); + } + + auto it_this = this->begin(); + uint32_t ithis = 0; + while(it_this != this->end()){ + uint32_t irow, k; + this->get_indices(it_this->first, &irow, &k); + for( uint32_t const & icol : k_to_j[k] ){ + T el = result->get_elm(irow, icol); + result->set_elm(irow, icol, el + it_this->second * n->get_elm(k, icol)); + } + it_this++; + } + return result; + } + return this->base_multiply(m); + } +}; + +template +class DenseSimpleMatrix : public SimpleMatrix { +private: + uint32_t nrows, ncols; + T* mat; +public: + DenseSimpleMatrix(int nrows, int ncols) : nrows(nrows), ncols(ncols) { + mat = new T[nrows * ncols]; + }; + DenseSimpleMatrix(int nrows, int ncols, T init) : nrows(nrows), ncols(ncols) { + mat = new T[nrows * ncols]; + for(uint64_t i = 0; i* get_new_instance(int nrows, int ncols){ + return new DenseSimpleMatrix(nrows, ncols); + } + virtual SimpleMatrix* get_new_instance(){ + return new DenseSimpleMatrix(nrows, ncols); + } + + virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){ + mat[SimpleMatrix::get_idx(i_idx,j_idx)] = element; + } + virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){ + return mat[SimpleMatrix::get_idx(i_idx,j_idx)]; + } + virtual inline uint64_t get_nrows(){ + return nrows; + }; + virtual inline uint64_t get_ncols(){ + return ncols; + }; + virtual T& at(uint32_t i_idx, uint32_t j_idx){ + return at(SimpleMatrix::get_idx(i_idx,j_idx)); + }; + virtual T& at(uint64_t c_idx){ + assert(c_idx < nrows * ncols); + return mat[c_idx]; + }; + virtual T get_max_elm(){ + // TODO: implement me + return 0;//*std::max_element(mat,mat+n_max); + }; + + SimpleMatrix* multiply(SimpleMatrix* m){ + if(SparseSimpleMatrix* n = dynamic_cast*> (m)){ + DenseSimpleMatrix* result = new DenseSimpleMatrix(this->get_nrows(),n->get_ncols()); + for(uint32_t irow = 0; irowget_nrows(); irow++){ + for(uint32_t icol = 0; icolget_ncols(); icol++){ + result->set_elm(irow, icol, 0); + } + } + for(uint32_t irow = 0; irowget_nrows(); irow++){ + auto it = n->begin(); + while(it!=n->end()){ + uint32_t k,icol; + n->get_indices (it->first, &k, &icol); + result->at(irow, icol) += this->get_elm(irow, k) * it->second; + it++; + } + } + return result; + } + return this->base_multiply(m); + } +}; + +template +class SparseSymmetricSimpleMatrix : public SparseSimpleMatrix, public SymmetricSimpleMatrix { +private: + uint32_t n_max; + T tol; + std::unordered_map mat; + +public: + SparseSymmetricSimpleMatrix(int n) : n_max(n), tol((T) 1e-12) {}; + SparseSymmetricSimpleMatrix(int n, T tol) : n_max(n), tol(std::abs(tol)) {}; + + virtual ~SparseSymmetricSimpleMatrix(){ mat.clear(); }; + virtual SimpleMatrix* get_new_instance(int nrows, int ncols){ + return new SparseSymmetricSimpleMatrix(nrows, tol); + } + virtual SimpleMatrix* get_new_instance(){ + return new SparseSymmetricSimpleMatrix(n_max, tol); + } + + virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){ + if( std::abs(element) > tol ) { + assert(i_idx < n_max); + assert(j_idx < n_max); + const uint64_t my_idx=this->get_lower_index(i_idx,j_idx); + mat[my_idx] = element; + } + } + virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){ + assert(i_idx < n_max); + assert(j_idx < n_max); + const uint64_t my_idx=this->get_lower_index(i_idx,j_idx); + auto it = mat.find(my_idx); + return it != mat.end() ? it-> second : (T) 0.0; + } + virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx){ + assert(i_idx < n_max); + assert(j_idx < n_max); + const uint64_t my_idx=this->get_lower_index(i_idx,j_idx); + auto it = mat.find(my_idx); + return it != mat.end(); + } + virtual inline uint64_t get_nrows(){ + // TODO: this can be determined from mat (since symmetric) then the matrix can be unbounded! + return n_max; + }; + virtual inline uint64_t get_ncols(){ + // TODO: this can be determined from mat (since symmetric) then the matrix can be unbounded! + return n_max; + }; + virtual inline uint64_t get_n_nonzero_elements(){ + return mat.size(); + }; + virtual typename std::unordered_map::iterator begin(){ + return mat.begin(); + }; + virtual typename std::unordered_map::iterator end(){ + return mat.end(); + }; + virtual T& at(uint32_t i_idx, uint32_t j_idx){ + assert(i_idx < n_max); + assert(j_idx < n_max); + const uint64_t c_idx=this->get_lower_index(i_idx,j_idx); + return at(c_idx); + }; + virtual T& at(uint64_t c_idx){ + assert(c_idx < n_max*(n_max+1)/2); + return mat[c_idx]; + }; + + void purge(){ + auto it = mat.begin(); + while(it!=mat.end()){ + if (std::abs(it->second) < tol ){ + it = mat.erase(it); + } + else{ + it++; + } + } + } + + virtual T get_max_elm(){ + T m = 0.0; + for ( auto& it : mat ) { + m = std::max(m,it.second); + } + return m; + }; + + void resize(uint32_t new_n_max){ + n_max = new_n_max; + } + // TODO: implement valid matrix ops + virtual SimpleMatrix* minus(SimpleMatrix* m){ + return this->base_minus(m); + } + virtual SimpleMatrix* plus(SimpleMatrix* m){ + return this->base_plus(m); + } + virtual SimpleMatrix* multiply(SimpleMatrix* m){ + return this->base_multiply(m); + } +}; + +template +class DenseSymmetricSimpleMatrix : public SymmetricSimpleMatrix { +private: + uint32_t n_max; + T* mat; +public: + DenseSymmetricSimpleMatrix(int n) : n_max(n) { + mat = new T[(n_max*(n_max+1))/2]; + }; + virtual ~DenseSymmetricSimpleMatrix(){ + delete [] mat; + mat = nullptr; + }; + virtual SimpleMatrix* get_new_instance(int nrows, int ncols){ + return new DenseSymmetricSimpleMatrix(nrows * ncols); + } + virtual SimpleMatrix* get_new_instance(){ + return new DenseSymmetricSimpleMatrix(n_max); + } + + virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){ + assert(i_idx < n_max); + assert(j_idx < n_max); + const uint64_t my_idx=SymmetricSimpleMatrix::get_lower_index(i_idx,j_idx); + mat[my_idx] = element; + } + virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){ + assert(i_idx < n_max); + assert(j_idx < n_max); + const uint64_t my_idx=SymmetricSimpleMatrix::get_lower_index(i_idx,j_idx); + return mat[my_idx]; + } + virtual inline uint64_t get_nrows(){ + return n_max; + }; + virtual inline uint64_t get_ncols(){ + return n_max; + }; + virtual T& at(uint32_t i_idx, uint32_t j_idx){ + assert(i_idx < n_max); + assert(j_idx < n_max); + const uint64_t c_idx=SymmetricSimpleMatrix::get_lower_index(i_idx,j_idx); + return at(c_idx); + }; + virtual T& at(uint64_t c_idx){ + assert(c_idx < n_max*(n_max+1)/2); + return mat[c_idx]; + }; + virtual T get_max_elm(){ + return *std::max_element(mat,mat+n_max); + }; +}; From 8e18435090f6353d8f24808e452f53cb48e20225 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sat, 18 Apr 2020 10:32:53 +0200 Subject: [PATCH 03/21] reimplementation of MCL using Eigen I have completely removed my own sparse matrix implementation and replaced it by the Eigen implementation which should also improve the speed in general --- src/basic/config.h | 2 +- src/run/cluster.cpp | 360 +++++++++++++++++------------ src/run/sparse_matrix.h | 498 ---------------------------------------- 3 files changed, 209 insertions(+), 651 deletions(-) delete mode 100644 src/run/sparse_matrix.h diff --git a/src/basic/config.h b/src/basic/config.h index 69023b07..b059a6a6 100644 --- a/src/basic/config.h +++ b/src/basic/config.h @@ -217,7 +217,7 @@ struct Config enum { double_indexed = 0, query_indexed = 1, subject_indexed = 2 }; int algo; - enum { multi_step = 0, transitive_closure = 1}; + enum { multi_step = 0, transitive_closure = 1, markov_clustering = 2}; int cluster_algo; enum { query_parallel = 0, target_parallel = 1 }; diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp index f3ab09c7..49c2cd24 100644 --- a/src/run/cluster.cpp +++ b/src/run/cluster.cpp @@ -16,6 +16,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . ****/ +#include #include #include #include @@ -29,7 +30,6 @@ along with this program. If not, see . #include "../data/reference.h" #include "workflow.h" #include "disjoint_set.h" -#include "sparse_matrix.h" #include "../util/io/consumer.h" #include "../util/algo/algo.h" #include "../basic/statistics.h" @@ -90,43 +90,72 @@ class NeighborStream : public Consumer { template class SparseMatrixStream : public Consumer { - SparseSimpleMatrix* m = nullptr; + size_t n; + vector> data; LazyDisjointSet* disjointSet; virtual void consume(const char *ptr, size_t n) override { size_t query, subject, count; - float qcov, scov, bitscore; + float qcov, scov, bitscore, id; const char *end = ptr + n; while (ptr < end) { //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) - if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) + if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) throw runtime_error("Cluster format error."); - m->set_elm(query, subject, bitscore); + data.emplace_back(query, subject, (qcov/100.0f) * (scov/100.0f) * (id/100.0f)); disjointSet->merge(query, subject); ptr += count; } } public: - SparseMatrixStream(size_t n, bool symmetric){ + SparseMatrixStream(size_t n){ + this->n = n; disjointSet = new LazyDisjointIntegralSet(n); - if( symmetric ){ - m = new SparseSymmetricSimpleMatrix(n); - } - else{ - m = new SparseSimpleMatrixImpl(n, n); + // Note: this makes sure the self hits are always included, the value needs to be aligned with the measure used in consume! + for(uint32_t idiag=0; idiag* getMatrix(){ - cout << " getting matrix " << m->get_nrows() << " " << m->get_ncols() << " " << m->get_n_nonzero_elements() << " sparsity "<< (1.0 * m->get_n_nonzero_elements()) / (m->get_nrows()* m->get_ncols()) << endl; - SparseSimpleMatrix* h = m; - m = nullptr; - return h; + pair>, vector>> getComponents(){ + vector> sets = disjointSet->getListOfSets(); + vector>> split(sets.size()); + for(uint32_t iset = 0; iset < sets.size(); iset++){ + split.push_back(vector>()); + } + for(Eigen::Triplet d : data){ + for(uint32_t iset = 0; iset < sets.size(); iset++){ + if(sets[iset].count(d.col()) != 0){ + split[iset].emplace_back(d.row(), d.col(), d.value()); + break; + } + } + } + vector> indices(sets.size()); + vector> components(sets.size()); + for(uint32_t iset = 0; iset < sets.size(); iset++){ + Eigen::SparseMatrix component(sets[iset].size(), sets[iset].size()); + vector order(sets[iset].begin(), sets[iset].end()); + map index_map; + uint32_t iel = 0; + for(uint32_t el: order){ + index_map.emplace(el, iel++); + } + vector> remapped(split[iset].size()); + for(Eigen::Triplet t : split[iset]){ + remapped.emplace_back(index_map[t.row()], index_map[t.col()], t.value()); + } + component.setFromTriplets(remapped.begin(), remapped.end()); + components.emplace_back(move(component)); + indices.emplace_back(move(order)); + } + return std::make_pair(indices, components); } - vector> getListOfSets(){ - return disjointSet->getListOfSets(); + Eigen::SparseMatrix getMatrix(){ + Eigen::SparseMatrix m(n,n); + m.setFromTriplets(data.begin(), data.end()); + return m; } }; @@ -282,152 +311,132 @@ void run_transitive_closure_clustering(DatabaseFile& db){ // } } -SparseSimpleMatrix* get_gamma(SparseSimpleMatrix* m, uint32_t r){ - SparseSimpleMatrix* gamma = new SparseSimpleMatrixImpl(m->get_nrows(), m->get_ncols()); - float* column_sum = new float[m->get_ncols()]; - for(uint32_t icol = 0; icol< m->get_ncols(); icol++){ - column_sum[icol] = 0.0f; - } - for(auto it = m->begin(); it != m->end(); it++){ - uint32_t i,j; - m->get_indices(it->first, &i, &j); - float el = pow(it->second, r); - gamma->set_elm(i,j,el); - column_sum[j] += el; - } - for(auto it = gamma->begin(); it != gamma->end(); it++){ - uint32_t i,j; - gamma->get_indices(it->first, &i, &j); - gamma->set_elm(i,j, it->second/column_sum[j]); - } - return gamma; -} -SimpleMatrix* get_gamma(SimpleMatrix* m, uint32_t r){ - SimpleMatrix* gamma = new SparseSimpleMatrixImpl(m->get_nrows(), m->get_ncols()); - float* column_sum = new float[m->get_ncols()]; - for(uint32_t icol = 0; icol< m->get_ncols(); icol++){ - column_sum[icol] = 0.0f; - } - for(uint32_t irow = 0; irow < m->get_nrows(); irow++){ - for(uint32_t icol = 0; icol < m->get_ncols(); icol++){ - column_sum[icol]+=pow(m->get_elm(irow, icol), r); +Eigen::SparseMatrix get_gamma(Eigen::SparseMatrix* m, float r){ + vector> data; + for (uint32_t k=0; kouterSize(); ++k){ + float colSum = 0.0f; + for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + colSum += pow(it.value(),r); } - } - for(uint32_t irow = 0; irow < m->get_nrows(); irow++){ - for(uint32_t icol = 0; icol < m->get_ncols(); icol++){ - gamma->set_elm(irow, icol, pow(m->get_elm(irow, icol),r) / column_sum[icol]); + for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + data.emplace_back(it.row(), it.col(), pow(it.value(),r) / colSum); } } - return gamma; + Eigen::SparseMatrix gamma(m->rows(), m->cols()); + gamma.setFromTriplets(data.begin(), data.end()); + return gamma.pruned(1.0, std::numeric_limits::epsilon()); } -SimpleMatrix* get_exp(SimpleMatrix* m, uint32_t r){ - // TODO: make this more efficient - SimpleMatrix* tmp1 = m->multiply(m); - for(uint32_t i = 2; i* tmp2 = tmp1->multiply(tmp1); - delete tmp1; - tmp1 = tmp2; +vector> get_list(Eigen::SparseMatrix* m){ + LazyDisjointIntegralSet disjointSet(m->cols()); + for (uint32_t k=0; kouterSize(); ++k){ + for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + disjointSet.merge(it.row(), it.col()); + } } - return tmp1; + return disjointSet.getListOfSets(); } -vector> get_list(SimpleMatrix* m){ - LazyDisjointIntegralSet disjointSet(m->get_ncols()); - for(uint32_t irow = 0; irow < m->get_nrows(); irow++){ - for(uint32_t icol = 0; icol < m->get_ncols(); icol++){ - if(abs(m->get_elm(irow, icol)) > 1e-13){ - disjointSet.merge(irow, icol); + +void print(Eigen::SparseMatrix* m, bool sparse){ + if(sparse){ + for (uint32_t k=0; kouterSize(); ++k){ + for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + printf("%lu %lu %16.14f\n", it.row(), it.col(), it.value()); } } } - return disjointSet.getListOfSets(); -} -vector> get_list(SparseSimpleMatrix* m){ - LazyDisjointIntegralSet disjointSet(m->get_ncols()); - for(auto it = m->begin(); it != m->end(); it++){ - uint32_t irow, icol; - m->get_indices(it->first, &irow, &icol); - disjointSet.merge(irow, icol); + else{ + for(uint32_t irow = 0; irow < m->rows(); irow++){ + for(uint32_t icol = 0; icol < m->cols(); icol++){ + printf(" %8.4f", m->coeffRef(irow, icol)); + } + printf("\n"); + } } - return disjointSet.getListOfSets(); + printf("\n"); } - -vector> markov_process(SimpleMatrix* m){ +vector> markov_process(Eigen::SparseMatrix* m){ uint32_t iteration = 0; double diff_norm = std::numeric_limits::max(); - while( iteration < 100 && diff_norm > 1e-3 ){ - SimpleMatrix* msquared = get_exp(m, 2); - SimpleMatrix* m_update = get_gamma(msquared, 2); - delete msquared; - SimpleMatrix* diff_m = m_update->minus(m); - diff_norm = diff_m->norm(2.0,2.0); - delete m; - delete diff_m; - m = m_update; - m_update = nullptr; + // TODO: Add loops TODO: make sure self-hits are found and have the same unit as the off diagonal elements (bit score or other measure used) before calling this routine, then only check that diagonal elements exist + // for(uint32_t idiag = 0; idiagget_nrows(); idiag++){ + // float e = m->get_elm(idiag, idiag); + // if( abs(e) < 1e-13 ){ + // //cout << "WARNING: did not find a self-hit for "<< idiag << endl; + // m->set_elm(idiag, idiag, 1.0); + // } + // } + //*m = get_gamma(m,1); + while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ + Eigen::SparseMatrix msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits::epsilon()); + Eigen::SparseMatrix m_update = get_gamma(&msquared, 2); + diff_norm = (*m - m_update).norm(); + *m = m_update.pruned(1.0, std::numeric_limits::epsilon()); iteration++; } - cout<< "Markov Process converged after " << iteration <<" iterations " << diff_norm << endl; + //cout<< "Markov Process converged after " << iteration <<" iterations " << diff_norm << endl; return get_list(m); } void run_markov_clustering(DatabaseFile& db){ // testing - // SparseSimpleMatrix* tmp = new SparseSimpleMatrixImpl(12, 12); - // tmp->set_elm(0, 0, 0.2f); - // tmp->set_elm(0, 1, 0.25f); - // tmp->set_elm(0, 5, 0.333f); - // tmp->set_elm(0, 6, 0.25f); - // tmp->set_elm(0, 9, 0.25f); - // tmp->set_elm(1, 0, 0.20f); - // tmp->set_elm(1, 1, 0.25f); - // tmp->set_elm(1, 2, 0.25f); - // tmp->set_elm(1, 4, 0.20f); - // tmp->set_elm(2, 1, 0.25f); - // tmp->set_elm(2, 2, 0.25f); - // tmp->set_elm(2, 3, 0.20f); - // tmp->set_elm(2, 4, 0.20f); - // tmp->set_elm(3, 2, 0.25f); - // tmp->set_elm(3, 3, 0.20f); - // tmp->set_elm(3, 7, 0.20f); - // tmp->set_elm(3, 8, 0.20f); - // tmp->set_elm(3, 10, 0.20f); - // tmp->set_elm(4, 1, 0.25f); - // tmp->set_elm(4, 2, 0.25f); - // tmp->set_elm(4, 4, 0.20f); - // tmp->set_elm(4, 6, 0.25f); - // tmp->set_elm(4, 7, 0.20f); - // tmp->set_elm(5, 0, 0.20f); - // tmp->set_elm(5, 5, 0.333f); - // tmp->set_elm(5, 9, 0.25f); - // tmp->set_elm(6, 0, 0.20f); - // tmp->set_elm(6, 4, 0.20f); - // tmp->set_elm(6, 6, 0.25f); - // tmp->set_elm(6, 9, 0.25f); - // tmp->set_elm(7, 3, 0.20f); - // tmp->set_elm(7, 4, 0.20f); - // tmp->set_elm(7, 7, 0.20f); - // tmp->set_elm(7, 8, 0.20f); - // tmp->set_elm(7, 10, 0.20f); - // tmp->set_elm(8, 3, 0.20f); - // tmp->set_elm(8, 7, 0.20f); - // tmp->set_elm(8, 8, 0.20f); - // tmp->set_elm(8, 10, 0.20f); - // tmp->set_elm(8, 11, 0.333f); - // tmp->set_elm(9, 0, 0.20f); - // tmp->set_elm(9, 5, 0.333f); - // tmp->set_elm(9, 6, 0.25f); - // tmp->set_elm(9, 9, 0.25f); - // tmp->set_elm(10, 3, 0.20f); - // tmp->set_elm(10, 7, 0.20f); - // tmp->set_elm(10, 8, 0.20f); - // tmp->set_elm(10, 10, 0.20f); - // tmp->set_elm(10, 11, 0.333f); - // tmp->set_elm(11, 8, 0.20f); - // tmp->set_elm(11, 10, 0.20f); - // tmp->set_elm(11, 11, 0.333f); - // auto cr = markov_process(tmp); + // vector> data; + // data.emplace_back(0, 0, 0.2f); + // data.emplace_back(0, 1, 0.25f); + // data.emplace_back(0, 5, 0.333f); + // data.emplace_back(0, 6, 0.25f); + // data.emplace_back(0, 9, 0.25f); + // data.emplace_back(1, 0, 0.20f); + // data.emplace_back(1, 1, 0.25f); + // data.emplace_back(1, 2, 0.25f); + // data.emplace_back(1, 4, 0.20f); + // data.emplace_back(2, 1, 0.25f); + // data.emplace_back(2, 2, 0.25f); + // data.emplace_back(2, 3, 0.20f); + // data.emplace_back(2, 4, 0.20f); + // data.emplace_back(3, 2, 0.25f); + // data.emplace_back(3, 3, 0.20f); + // data.emplace_back(3, 7, 0.20f); + // data.emplace_back(3, 8, 0.20f); + // data.emplace_back(3, 10, 0.20f); + // data.emplace_back(4, 1, 0.25f); + // data.emplace_back(4, 2, 0.25f); + // data.emplace_back(4, 4, 0.20f); + // data.emplace_back(4, 6, 0.25f); + // data.emplace_back(4, 7, 0.20f); + // data.emplace_back(5, 0, 0.20f); + // data.emplace_back(5, 5, 0.333f); + // data.emplace_back(5, 9, 0.25f); + // data.emplace_back(6, 0, 0.20f); + // data.emplace_back(6, 4, 0.20f); + // data.emplace_back(6, 6, 0.25f); + // data.emplace_back(6, 9, 0.25f); + // data.emplace_back(7, 3, 0.20f); + // data.emplace_back(7, 4, 0.20f); + // data.emplace_back(7, 7, 0.20f); + // data.emplace_back(7, 8, 0.20f); + // data.emplace_back(7, 10, 0.20f); + // data.emplace_back(8, 3, 0.20f); + // data.emplace_back(8, 7, 0.20f); + // data.emplace_back(8, 8, 0.20f); + // data.emplace_back(8, 10, 0.20f); + // data.emplace_back(8, 11, 0.333f); + // data.emplace_back(9, 0, 0.20f); + // data.emplace_back(9, 5, 0.333f); + // data.emplace_back(9, 6, 0.25f); + // data.emplace_back(9, 9, 0.25f); + // data.emplace_back(10, 3, 0.20f); + // data.emplace_back(10, 7, 0.20f); + // data.emplace_back(10, 8, 0.20f); + // data.emplace_back(10, 10, 0.20f); + // data.emplace_back(10, 11, 0.333f); + // data.emplace_back(11, 8, 0.20f); + // data.emplace_back(11, 10, 0.20f); + // data.emplace_back(11, 11, 0.333f); + // Eigen::SparseMatrix tmp(12,12); + // tmp.setFromTriplets(data.begin(), data.end()); + // auto cr = markov_process(&tmp); // cout << "Found "< ms(db.ref_header.sequences, false); + SparseMatrixStream ms(db.ref_header.sequences); opt.consumer = &ms; opt.db_filter = nullptr; Workflow::Search::run(opt); - SparseSimpleMatrix* m = ms.getMatrix(); - - auto clusterResult = markov_process(m); - cout << "Found "<> indices = get<0>(componentInfo); + vector> components = get<1>(componentInfo); + uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 0;}); + uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 1;}); + + cout << "DIAMOND done" << endl; + cout << "************" << endl; + cout << "Found " << nComponentsLt1 <<" ("<> clustering_result; + float max_sparsity = 0.0f; + float min_sparsity = 1.0f; + bool full = false; + if( full ){ + // TODO: According to the SIAM publication this is not valid, just for debugging + Eigen::SparseMatrix m = ms.getMatrix(); + for(unordered_set subset : markov_process(&m)){ + clustering_result.emplace_back(subset); + } + } + else { + // TODO: check if using dense matrices for non-sparse components improves performance + for(uint32_t iComponent = 0; iComponent order = indices[iComponent]; + if(order.size() > 1){ + Eigen::SparseMatrix m = components[iComponent]; + float sparsity = 1.0-(1.0 * m.nonZeros()) / (m.rows()*m.cols()); + max_sparsity = max(max_sparsity, sparsity); + min_sparsity = min(min_sparsity, sparsity); + // map back to original ids + for(unordered_set subset : markov_process(&m)){ + unordered_set s; + for(uint32_t el : subset){ + s.emplace(order[el]); + } + clustering_result.emplace_back(s); + } + } + else if (order.size() == 1){ + clustering_result.emplace_back(order.begin(), order.end()); + } + } + } + uint32_t nClusters = clustering_result.size(); + uint32_t nClustersLt1 = count_if(clustering_result.begin(), clustering_result.end(), [](unordered_set v){ return v.size() > 1;}); + cout << "Found "< -#include - -template -class SimpleMatrix { -public: - virtual T get_max_elm() = 0; - virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx) = 0; - virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element) = 0; - virtual T& at(uint32_t i_idx, uint32_t j_idx) = 0; - virtual T& at(uint64_t c_idx) = 0; - virtual inline uint64_t get_nrows() = 0; - virtual inline uint64_t get_ncols() = 0; - virtual SimpleMatrix* get_new_instance(int nrows, int ncols) = 0; - virtual SimpleMatrix* get_new_instance() = 0; - virtual void print(){ - for( uint32_t irow=0; irow < get_nrows(); irow++){ - for( uint32_t icol=0; icol < get_ncols(); icol++){ - printf(" %.6f",get_elm(irow,icol)); - } - printf("\n"); - } - } - - virtual inline uint64_t get_idx(uint32_t i_idx, uint32_t j_idx){ - assert(i_idx < get_nrows()); - assert(j_idx < get_ncols()); - return i_idx * get_ncols() + j_idx; - } - - virtual inline void get_indices (uint64_t combined, uint32_t* i, uint32_t* j){ - *j = combined % get_ncols(); - *i = ( combined - *j) / get_ncols(); - } - - virtual SimpleMatrix* minus(SimpleMatrix* m) = 0; - virtual SimpleMatrix* base_minus(SimpleMatrix* m){ - assert(this->get_nrows() == m->get_nrows()); - assert(this->get_ncols() == m->get_ncols()); - SimpleMatrix* result = this->get_new_instance(this->get_nrows(),m->get_ncols()); - for(uint32_t irow = 0; irowget_nrows(); irow++){ - for(uint32_t icol = 0; icolget_ncols(); icol++){ - result->set_elm(irow, icol, this->get_elm(irow, icol) - m->get_elm(irow,icol)); - } - } - return result; - } - virtual SimpleMatrix* plus(SimpleMatrix* m) = 0; - virtual SimpleMatrix* base_plus(SimpleMatrix* m){ - assert(this->get_nrows() == m->get_nrows()); - assert(this->get_ncols() == m->get_ncols()); - SimpleMatrix* result = this->get_new_instance(this->get_nrows(),m->get_ncols()); - for(uint32_t irow = 0; irowget_nrows(); irow++){ - for(uint32_t icol = 0; icolget_ncols(); icol++){ - result->set_elm(irow, icol, this->get_elm(irow, icol) + m->get_elm(irow, icol)); - } - } - return result; - } - virtual SimpleMatrix* multiply(SimpleMatrix* m) = 0; - virtual SimpleMatrix* base_multiply(SimpleMatrix* m){ - printf("standard mult\n"); - SimpleMatrix* result = this->get_new_instance(this->get_nrows(),m->get_ncols()); - for(uint32_t irow = 0; irowget_nrows(); irow++){ - for(uint32_t icol = 0; icolget_ncols(); icol++){ - result->set_elm(irow, icol, 0); - } - } - for(uint32_t irow = 0; irowget_nrows(); irow++){ - for(uint32_t icol = 0; icolget_ncols(); icol++){ - for(uint32_t k = 0; kget_nrows(); k++){ - result->set_elm(irow, icol, result->get_elm(irow, icol) + this->get_elm(irow, k) * m->get_elm(k,icol)); - } - } - } - return result; - } - virtual double norm(double p, double q){ - double result = 0.0; - double exp1 = q/p; - for(uint32_t irow = 0; irowget_nrows(); irow++){ - double colnorm = 0.0; - for(uint32_t icol = 0; icolget_ncols(); icol++){ - colnorm += std::pow(std::abs(this->get_elm(irow, icol)), p); - } - result += std::pow(colnorm, exp1); - } - return std::pow(result, 1.0/q); - } - - virtual ~SimpleMatrix() {}; -}; - -template -class SymmetricSimpleMatrix : virtual public SimpleMatrix { -public: - static inline uint64_t get_lower_index (uint32_t i_idx, uint32_t j_idx){ - uint32_t min = std::min(i_idx,j_idx); - uint32_t max = std::max(i_idx,j_idx); - return max*((uint64_t) max+1)/2 + min; - } - static inline void get_symmetric_index (uint64_t combined, uint32_t* max, uint32_t* min){ - // solving max^2-max-2*combined = 0. -1 to convert to indexing varaible in C, avoiding overflow - // by removing an additional factor of 2 from under the square root - *max = (uint32_t) std::floor(0.5+2*sqrt(0.0625+0.5*combined)) - 1; - *min = combined - (*max*((uint64_t)*max+1))/2; - } -}; - -template -class SparseSimpleMatrix : virtual public SimpleMatrix { -public: - virtual uint64_t get_n_nonzero_elements() = 0; - virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx) = 0; - virtual typename std::unordered_map::iterator begin() = 0; - virtual typename std::unordered_map::iterator end() = 0; -}; - -template -class SparseSimpleMatrixImpl : public SparseSimpleMatrix { -private: - uint32_t nrows, ncols; - T tol; - std::unordered_map mat; - -public: - SparseSimpleMatrixImpl(int nrows, int ncols) : nrows(nrows), ncols(ncols), tol((T) 1e-12) {}; - SparseSimpleMatrixImpl(int nrows, int ncols, T tol) : nrows(nrows), ncols(ncols), tol(std::abs(tol)) {}; - - virtual SimpleMatrix* get_new_instance(int nrows, int ncols){ - return new SparseSimpleMatrixImpl(nrows, ncols); - } - virtual SimpleMatrix* get_new_instance(){ - return new SparseSimpleMatrixImpl(nrows, ncols); - } - - virtual ~SparseSimpleMatrixImpl(){ mat.clear(); }; - - virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){ - assert(i_idx < nrows); - assert(j_idx < ncols); - if(std::abs(element) > tol) { - mat[SimpleMatrix::get_idx(i_idx,j_idx)] = element; - } - } - virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){ - assert(i_idx < nrows); - assert(j_idx < ncols); - auto it = mat.find(SimpleMatrix::get_idx(i_idx,j_idx)); - return it != mat.end() ? it-> second : (T) 0.0; - } - virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx){ - assert(i_idx < nrows); - assert(j_idx < ncols); - auto it = mat.find(SimpleMatrix::get_idx(i_idx,j_idx)); - return it != mat.end(); - } - virtual inline uint64_t get_nrows(){ - return nrows; - }; - virtual inline uint64_t get_ncols(){ - return ncols; - }; - virtual inline uint64_t get_n_nonzero_elements(){ - return mat.size(); - }; - virtual typename std::unordered_map::iterator begin(){ - return mat.begin(); - }; - virtual typename std::unordered_map::iterator end(){ - return mat.end(); - }; - virtual T& at(uint32_t i_idx, uint32_t j_idx){ - return at(SimpleMatrix::get_idx(i_idx,j_idx)); - }; - virtual T& at(uint64_t c_idx){ - assert(c_idx < nrows*ncols); - return mat[c_idx]; - }; - - void purge(){ - auto it = mat.begin(); - while(it!=mat.end()){ - if(std::abs(it->second) < tol){ - it = mat.erase(it); - } - else{ - it++; - } - } - } - virtual T get_max_elm(){ - T m = 0.0; - for(auto& it : mat){ - m = std::max(m, it.second); - } - return m; - }; - virtual SimpleMatrix* minus(SimpleMatrix* m){ - if(SparseSimpleMatrix* n = dynamic_cast*> (m)){ - // TODO: Imporove this implementation - SparseSimpleMatrix* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols()); - - std::vector> k_to_j(n->get_nrows()); - for(auto it_m = n->begin(); it_m != n->end(); it_m++){ - uint32_t irow,icol; - n->get_indices(it_m->first, &irow, &icol); - result->set_elm(irow, icol, this->get_elm(irow,icol) - it_m->second); - } - for(auto it_this = this->begin(); it_this != this->end(); it_this++){ - uint32_t irow,icol; - this->get_indices(it_this->first, &irow, &icol); - if(!n->has_elm(irow,icol)) result->set_elm(irow, icol, it_this->second); - } - return result; - } - return this->base_minus(m); - } - virtual SimpleMatrix* plus(SimpleMatrix* m){ - if(SparseSimpleMatrix* n = dynamic_cast*> (m)){ - // TODO: Imporove this implementation - SparseSimpleMatrix* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols()); - - std::vector> k_to_j(n->get_nrows()); - for(auto it_m = n->begin(); it_m != n->end(); it_m++){ - uint32_t irow,icol; - n->get_indices(it_m->first, &irow, &icol); - result->set_elm(irow, icol, this->get_elm(irow,icol) + it_m->second); - } - for(auto it_this = this->begin(); it_this != this->end(); it_this++){ - uint32_t irow,icol; - this->get_indices(it_this->first, &irow, &icol); - if(!n->has_elm(irow,icol)) result->set_elm(irow, icol, it_this->second); - } - return result; - } - return this->base_plus(m); - } - virtual SimpleMatrix* multiply(SimpleMatrix* m){ - if(SparseSimpleMatrix* n = dynamic_cast*> (m)){ - SparseSimpleMatrix* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols()); - std::vector> k_to_j(n->get_nrows()); - for(uint32_t k = 0; kget_nrows(); k++){ - k_to_j[k] = std::vector(); - } - for(auto it_m = n->begin(); it_m != n->end(); it_m++){ - uint32_t k2,icol; - n->get_indices(it_m->first, &k2, &icol); - k_to_j[k2].push_back(icol); - } - - auto it_this = this->begin(); - uint32_t ithis = 0; - while(it_this != this->end()){ - uint32_t irow, k; - this->get_indices(it_this->first, &irow, &k); - for( uint32_t const & icol : k_to_j[k] ){ - T el = result->get_elm(irow, icol); - result->set_elm(irow, icol, el + it_this->second * n->get_elm(k, icol)); - } - it_this++; - } - return result; - } - return this->base_multiply(m); - } -}; - -template -class DenseSimpleMatrix : public SimpleMatrix { -private: - uint32_t nrows, ncols; - T* mat; -public: - DenseSimpleMatrix(int nrows, int ncols) : nrows(nrows), ncols(ncols) { - mat = new T[nrows * ncols]; - }; - DenseSimpleMatrix(int nrows, int ncols, T init) : nrows(nrows), ncols(ncols) { - mat = new T[nrows * ncols]; - for(uint64_t i = 0; i* get_new_instance(int nrows, int ncols){ - return new DenseSimpleMatrix(nrows, ncols); - } - virtual SimpleMatrix* get_new_instance(){ - return new DenseSimpleMatrix(nrows, ncols); - } - - virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){ - mat[SimpleMatrix::get_idx(i_idx,j_idx)] = element; - } - virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){ - return mat[SimpleMatrix::get_idx(i_idx,j_idx)]; - } - virtual inline uint64_t get_nrows(){ - return nrows; - }; - virtual inline uint64_t get_ncols(){ - return ncols; - }; - virtual T& at(uint32_t i_idx, uint32_t j_idx){ - return at(SimpleMatrix::get_idx(i_idx,j_idx)); - }; - virtual T& at(uint64_t c_idx){ - assert(c_idx < nrows * ncols); - return mat[c_idx]; - }; - virtual T get_max_elm(){ - // TODO: implement me - return 0;//*std::max_element(mat,mat+n_max); - }; - - SimpleMatrix* multiply(SimpleMatrix* m){ - if(SparseSimpleMatrix* n = dynamic_cast*> (m)){ - DenseSimpleMatrix* result = new DenseSimpleMatrix(this->get_nrows(),n->get_ncols()); - for(uint32_t irow = 0; irowget_nrows(); irow++){ - for(uint32_t icol = 0; icolget_ncols(); icol++){ - result->set_elm(irow, icol, 0); - } - } - for(uint32_t irow = 0; irowget_nrows(); irow++){ - auto it = n->begin(); - while(it!=n->end()){ - uint32_t k,icol; - n->get_indices (it->first, &k, &icol); - result->at(irow, icol) += this->get_elm(irow, k) * it->second; - it++; - } - } - return result; - } - return this->base_multiply(m); - } -}; - -template -class SparseSymmetricSimpleMatrix : public SparseSimpleMatrix, public SymmetricSimpleMatrix { -private: - uint32_t n_max; - T tol; - std::unordered_map mat; - -public: - SparseSymmetricSimpleMatrix(int n) : n_max(n), tol((T) 1e-12) {}; - SparseSymmetricSimpleMatrix(int n, T tol) : n_max(n), tol(std::abs(tol)) {}; - - virtual ~SparseSymmetricSimpleMatrix(){ mat.clear(); }; - virtual SimpleMatrix* get_new_instance(int nrows, int ncols){ - return new SparseSymmetricSimpleMatrix(nrows, tol); - } - virtual SimpleMatrix* get_new_instance(){ - return new SparseSymmetricSimpleMatrix(n_max, tol); - } - - virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){ - if( std::abs(element) > tol ) { - assert(i_idx < n_max); - assert(j_idx < n_max); - const uint64_t my_idx=this->get_lower_index(i_idx,j_idx); - mat[my_idx] = element; - } - } - virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){ - assert(i_idx < n_max); - assert(j_idx < n_max); - const uint64_t my_idx=this->get_lower_index(i_idx,j_idx); - auto it = mat.find(my_idx); - return it != mat.end() ? it-> second : (T) 0.0; - } - virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx){ - assert(i_idx < n_max); - assert(j_idx < n_max); - const uint64_t my_idx=this->get_lower_index(i_idx,j_idx); - auto it = mat.find(my_idx); - return it != mat.end(); - } - virtual inline uint64_t get_nrows(){ - // TODO: this can be determined from mat (since symmetric) then the matrix can be unbounded! - return n_max; - }; - virtual inline uint64_t get_ncols(){ - // TODO: this can be determined from mat (since symmetric) then the matrix can be unbounded! - return n_max; - }; - virtual inline uint64_t get_n_nonzero_elements(){ - return mat.size(); - }; - virtual typename std::unordered_map::iterator begin(){ - return mat.begin(); - }; - virtual typename std::unordered_map::iterator end(){ - return mat.end(); - }; - virtual T& at(uint32_t i_idx, uint32_t j_idx){ - assert(i_idx < n_max); - assert(j_idx < n_max); - const uint64_t c_idx=this->get_lower_index(i_idx,j_idx); - return at(c_idx); - }; - virtual T& at(uint64_t c_idx){ - assert(c_idx < n_max*(n_max+1)/2); - return mat[c_idx]; - }; - - void purge(){ - auto it = mat.begin(); - while(it!=mat.end()){ - if (std::abs(it->second) < tol ){ - it = mat.erase(it); - } - else{ - it++; - } - } - } - - virtual T get_max_elm(){ - T m = 0.0; - for ( auto& it : mat ) { - m = std::max(m,it.second); - } - return m; - }; - - void resize(uint32_t new_n_max){ - n_max = new_n_max; - } - // TODO: implement valid matrix ops - virtual SimpleMatrix* minus(SimpleMatrix* m){ - return this->base_minus(m); - } - virtual SimpleMatrix* plus(SimpleMatrix* m){ - return this->base_plus(m); - } - virtual SimpleMatrix* multiply(SimpleMatrix* m){ - return this->base_multiply(m); - } -}; - -template -class DenseSymmetricSimpleMatrix : public SymmetricSimpleMatrix { -private: - uint32_t n_max; - T* mat; -public: - DenseSymmetricSimpleMatrix(int n) : n_max(n) { - mat = new T[(n_max*(n_max+1))/2]; - }; - virtual ~DenseSymmetricSimpleMatrix(){ - delete [] mat; - mat = nullptr; - }; - virtual SimpleMatrix* get_new_instance(int nrows, int ncols){ - return new DenseSymmetricSimpleMatrix(nrows * ncols); - } - virtual SimpleMatrix* get_new_instance(){ - return new DenseSymmetricSimpleMatrix(n_max); - } - - virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){ - assert(i_idx < n_max); - assert(j_idx < n_max); - const uint64_t my_idx=SymmetricSimpleMatrix::get_lower_index(i_idx,j_idx); - mat[my_idx] = element; - } - virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){ - assert(i_idx < n_max); - assert(j_idx < n_max); - const uint64_t my_idx=SymmetricSimpleMatrix::get_lower_index(i_idx,j_idx); - return mat[my_idx]; - } - virtual inline uint64_t get_nrows(){ - return n_max; - }; - virtual inline uint64_t get_ncols(){ - return n_max; - }; - virtual T& at(uint32_t i_idx, uint32_t j_idx){ - assert(i_idx < n_max); - assert(j_idx < n_max); - const uint64_t c_idx=SymmetricSimpleMatrix::get_lower_index(i_idx,j_idx); - return at(c_idx); - }; - virtual T& at(uint64_t c_idx){ - assert(c_idx < n_max*(n_max+1)/2); - return mat[c_idx]; - }; - virtual T get_max_elm(){ - return *std::max_element(mat,mat+n_max); - }; -}; From 07fdac0e478210c5f4ae50b9f0ef754bb04440a7 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sat, 18 Apr 2020 10:58:19 +0200 Subject: [PATCH 04/21] Two minor code changes with big effect - create a matrix of random walks from the orignal matrix before calling markov_process - improve speed in result collection --- src/run/cluster.cpp | 42 +++++++----------------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp index 49c2cd24..c88944df 100644 --- a/src/run/cluster.cpp +++ b/src/run/cluster.cpp @@ -121,16 +121,16 @@ class SparseMatrixStream : public Consumer { pair>, vector>> getComponents(){ vector> sets = disjointSet->getListOfSets(); vector>> split(sets.size()); + unordered_map indexToSetId; for(uint32_t iset = 0; iset < sets.size(); iset++){ split.push_back(vector>()); + for(size_t index : sets[iset]){ + indexToSetId.emplace(index, iset); + } } for(Eigen::Triplet d : data){ - for(uint32_t iset = 0; iset < sets.size(); iset++){ - if(sets[iset].count(d.col()) != 0){ - split[iset].emplace_back(d.row(), d.col(), d.value()); - break; - } - } + assert(indexToSetId[d.row()] == indexToSetId[d.row()]); + split[indexToSetId[d.row()]].emplace_back(d.row(), d.col(), d.value()); } vector> indices(sets.size()); vector> components(sets.size()); @@ -337,37 +337,10 @@ vector> get_list(Eigen::SparseMatrix* m){ return disjointSet.getListOfSets(); } -void print(Eigen::SparseMatrix* m, bool sparse){ - if(sparse){ - for (uint32_t k=0; kouterSize(); ++k){ - for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ - printf("%lu %lu %16.14f\n", it.row(), it.col(), it.value()); - } - } - } - else{ - for(uint32_t irow = 0; irow < m->rows(); irow++){ - for(uint32_t icol = 0; icol < m->cols(); icol++){ - printf(" %8.4f", m->coeffRef(irow, icol)); - } - printf("\n"); - } - } - printf("\n"); -} - vector> markov_process(Eigen::SparseMatrix* m){ uint32_t iteration = 0; double diff_norm = std::numeric_limits::max(); - // TODO: Add loops TODO: make sure self-hits are found and have the same unit as the off diagonal elements (bit score or other measure used) before calling this routine, then only check that diagonal elements exist - // for(uint32_t idiag = 0; idiagget_nrows(); idiag++){ - // float e = m->get_elm(idiag, idiag); - // if( abs(e) < 1e-13 ){ - // //cout << "WARNING: did not find a self-hit for "<< idiag << endl; - // m->set_elm(idiag, idiag, 1.0); - // } - // } - //*m = get_gamma(m,1); + *m = get_gamma(m,1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ Eigen::SparseMatrix msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits::epsilon()); Eigen::SparseMatrix m_update = get_gamma(&msquared, 2); @@ -375,7 +348,6 @@ vector> markov_process(Eigen::SparseMatrix* m){ *m = m_update.pruned(1.0, std::numeric_limits::epsilon()); iteration++; } - //cout<< "Markov Process converged after " << iteration <<" iterations " << diff_norm << endl; return get_list(m); } From 94dec7877586a6e44a1c53b3dd56de8b15f23cfd Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Wed, 22 Apr 2020 21:33:20 +0200 Subject: [PATCH 05/21] addressing some of the code-review suggestions --- CMakeLists.txt | 3 +- src/basic/config.cpp | 2 +- src/basic/config.h | 3 +- src/{run => cluster}/disjoint_set.h | 0 src/cluster/multi_step_cluster.cpp | 161 +++++++++ src/run/cluster.cpp | 516 ---------------------------- src/run/main.cpp | 11 +- src/run/workflow.h | 9 +- 8 files changed, 180 insertions(+), 525 deletions(-) rename src/{run => cluster}/disjoint_set.h (100%) create mode 100644 src/cluster/multi_step_cluster.cpp delete mode 100644 src/run/cluster.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8eb8f16b..91b00ab7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,7 +151,6 @@ set(OBJECTS src/data/seed_array.cpp src/output/paf_format.cpp src/util/system/system.cpp - src/run/cluster.cpp src/util/algo/greedy_vortex_cover.cpp src/util/sequence/sequence.cpp src/tools/tsv_record.cpp @@ -172,6 +171,8 @@ set(OBJECTS src/align/gapped.cpp src/align/culling.cpp src/cluster/medoid.cpp + src/cluster/multi_step_cluster.cpp + src/cluster/mcl.cpp src/align/output.cpp src/tools/roc.cpp src/test/data.cpp diff --git a/src/basic/config.cpp b/src/basic/config.cpp index e67fa5d8..dbde2e8c 100644 --- a/src/basic/config.cpp +++ b/src/basic/config.cpp @@ -145,7 +145,7 @@ Config::Config(int argc, const char **argv, bool check_io) Options_group cluster("Cluster options"); cluster.add() - ("calgo", 0, "Clustering algorithm (0=multi-step/1=transitive-closure)", cluster_algo, -1); + ("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo); Options_group aligner("Aligner options"); aligner.add() diff --git a/src/basic/config.h b/src/basic/config.h index 46bbff7a..7d5e9243 100644 --- a/src/basic/config.h +++ b/src/basic/config.h @@ -222,8 +222,7 @@ struct Config enum { double_indexed = 0, query_indexed = 1, subject_indexed = 2 }; int algo; - enum { multi_step = 0, transitive_closure = 1, markov_clustering = 2}; - int cluster_algo; + string cluster_algo; enum { query_parallel = 0, target_parallel = 1 }; unsigned load_balancing; diff --git a/src/run/disjoint_set.h b/src/cluster/disjoint_set.h similarity index 100% rename from src/run/disjoint_set.h rename to src/cluster/disjoint_set.h diff --git a/src/cluster/multi_step_cluster.cpp b/src/cluster/multi_step_cluster.cpp new file mode 100644 index 00000000..6cb2f704 --- /dev/null +++ b/src/cluster/multi_step_cluster.cpp @@ -0,0 +1,161 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2018 Benjamin Buchfink + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + +#include +#include +#include +#include +#include +#include +#include +#include "../util/system/system.h" +#include "../util/util.h" +#include "../basic/config.h" +#include "../data/reference.h" +#include "../run/workflow.h" +#include "../util/io/consumer.h" +#include "../util/algo/algo.h" +#include "../basic/statistics.h" +#include "../util/log_stream.h" +#include "../dp/dp.h" +#include "../basic/masking.h" + +using namespace std; + +namespace Workflow { namespace Cluster{ namespace MultiStep { + +struct Neighbors : public vector>, public Consumer { + Neighbors(size_t n): + vector>(n) + {} + virtual void consume(const char *ptr, size_t n) override { + int query, subject, count; + float qcov, scov, bitscore; + const char *end = ptr + n; + while (ptr < end) { + //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) + if (sscanf(ptr, "%i\t%i\t%f\t%f\t%f\n%n", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) + throw runtime_error("Cluster format error."); + ptr += count; + //cout << query << '\t' << subject << '\t' << qcov << '\t' << scov << '\t' << endl; + (*this)[query].push_back(subject); + edges.push_back({ query, subject, (int)bitscore }); + } + } + vector edges; +}; + +vector rep_bitset(const vector ¢roid, const vector *superset = nullptr) { + vector r(centroid.size()); + for (int c : centroid) + if(!superset || (*superset)[c]) + r[c] = true; + return r; +} + +vector cluster(DatabaseFile &db, const vector *filter) { + statistics.reset(); + config.command = Config::blastp; + config.no_self_hits = true; + //config.output_format = { "6", "qnum", "snum" }; + config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore" }; + config.query_cover = 80; + config.subject_cover = 80; + config.algo = 0; + config.index_mode = 0; + config.freq_sd = 0; + config.max_alignments = numeric_limits::max(); + + Workflow::Search::Options opt; + opt.db = &db; + opt.self = true; + Neighbors nb(db.ref_header.sequences); + opt.consumer = &nb; + opt.db_filter = filter; + + Workflow::Search::run(opt); + + return Util::Algo::greedy_vortex_cover(nb); +} + +void run_two_step_clustering(DatabaseFile& db){ + const size_t seq_count = db.ref_header.sequences; + + config.min_id = 70; + vector centroid1(cluster(db, nullptr)); + vector rep1(rep_bitset(centroid1)); + size_t n_rep1 = count_if(rep1.begin(), rep1.end(), [](bool x) { return x; }); + message_stream << "Clustering step 1 complete. #Input sequences: " << centroid1.size() << " #Clusters: " << n_rep1 << endl; + + config.mode_more_sensitive = true; + config.min_id = 0; + vector centroid2(cluster(db, &rep1)); + vector rep2(rep_bitset(centroid2, &rep1)); + for (size_t i = 0; i < centroid2.size(); ++i) + if (!rep1[i]) + centroid2[i] = centroid2[centroid1[i]]; + message_stream << "Clustering step 2 complete. #Input sequences: " << n_rep1 << " #Clusters: " << count_if(rep2.begin(), rep2.end(), [](bool x) { return x; }) << endl; + + task_timer timer("Generating output"); + Sequence_set *rep_seqs; + String_set *rep_ids; + vector rep_database_id, rep_block_id(seq_count); + db.rewind(); + db.load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2); + for (size_t i = 0; i < rep_database_id.size(); ++i) + rep_block_id[rep_database_id[i]] = (unsigned)i; + + ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str()); + vector seq; + string id; + db.seek_direct(); + Hsp hsp; + size_t n; + out->precision(3); + + for (int i = 0; i < (int)db.ref_header.sequences; ++i) { + db.read_seq(id, seq); + const unsigned r = rep_block_id[centroid2[i]]; + (*out) << blast_id(id) << '\t' + << blast_id((*rep_ids)[r]) << '\t'; + if ((int)i == centroid2[i]) + (*out) << "100\t100\t100\t0" << endl; + else { + Masking::get().bit_to_hard_mask(seq.data(), seq.size(), n); + smith_waterman(sequence(seq), (*rep_seqs)[r], hsp); + (*out) << hsp.id_percent() << '\t' + << hsp.query_cover_percent((unsigned)seq.size()) << '\t' + << hsp.subject_cover_percent((unsigned)(*rep_seqs)[r].length()) << '\t' + << score_matrix.bitscore(hsp.score) << endl; + } + } + + if (out != &cout) delete out; + delete rep_seqs; + delete rep_ids; +} + +void run() { + if (config.database == "") + throw runtime_error("Missing parameter: database file (--db/-d)"); + config.command = Config::makedb; + unique_ptr db(DatabaseFile::auto_create_from_fasta()); + run_two_step_clustering(*db); + db->close(); +} +}}} diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp deleted file mode 100644 index c88944df..00000000 --- a/src/run/cluster.cpp +++ /dev/null @@ -1,516 +0,0 @@ -/**** -DIAMOND protein aligner -Copyright (C) 2013-2018 Benjamin Buchfink - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -****/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../util/system/system.h" -#include "../util/util.h" -#include "../basic/config.h" -#include "../data/reference.h" -#include "workflow.h" -#include "disjoint_set.h" -#include "../util/io/consumer.h" -#include "../util/algo/algo.h" -#include "../basic/statistics.h" -#include "../util/log_stream.h" -#include "../dp/dp.h" -#include "../basic/masking.h" - -using namespace std; - -namespace Workflow { namespace Cluster { - -struct Neighbors : public vector>, public Consumer { - Neighbors(size_t n): - vector>(n) - {} - virtual void consume(const char *ptr, size_t n) override { - int query, subject, count; - float qcov, scov, bitscore; - const char *end = ptr + n; - while (ptr < end) { - //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) - if (sscanf(ptr, "%i\t%i\t%f\t%f\t%f\n%n", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) - throw runtime_error("Cluster format error."); - ptr += count; - //cout << query << '\t' << subject << '\t' << qcov << '\t' << scov << '\t' << endl; - (*this)[query].push_back(subject); - edges.push_back({ query, subject, (int)bitscore }); - } - } - vector edges; -}; - -class NeighborStream : public Consumer { - LazyDisjointSet* disjointSet; - virtual void consume(const char *ptr, size_t n) override { - size_t query, subject, count; - float qcov, scov, bitscore; - const char *end = ptr + n; - while (ptr < end) { - //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) - if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) - throw runtime_error("Cluster format error."); - disjointSet->merge(query, subject); - ptr += count; - } - } -public: - NeighborStream(size_t n){ - disjointSet = new LazyDisjointIntegralSet(n); - } - ~NeighborStream(){ - delete disjointSet; - } - vector> getListOfSets(){ - return disjointSet->getListOfSets(); - } -}; - -template -class SparseMatrixStream : public Consumer { - size_t n; - vector> data; - LazyDisjointSet* disjointSet; - virtual void consume(const char *ptr, size_t n) override { - size_t query, subject, count; - float qcov, scov, bitscore, id; - const char *end = ptr + n; - while (ptr < end) { - //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) - if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) - throw runtime_error("Cluster format error."); - data.emplace_back(query, subject, (qcov/100.0f) * (scov/100.0f) * (id/100.0f)); - disjointSet->merge(query, subject); - ptr += count; - } - } -public: - SparseMatrixStream(size_t n){ - this->n = n; - disjointSet = new LazyDisjointIntegralSet(n); - // Note: this makes sure the self hits are always included, the value needs to be aligned with the measure used in consume! - for(uint32_t idiag=0; idiag>, vector>> getComponents(){ - vector> sets = disjointSet->getListOfSets(); - vector>> split(sets.size()); - unordered_map indexToSetId; - for(uint32_t iset = 0; iset < sets.size(); iset++){ - split.push_back(vector>()); - for(size_t index : sets[iset]){ - indexToSetId.emplace(index, iset); - } - } - for(Eigen::Triplet d : data){ - assert(indexToSetId[d.row()] == indexToSetId[d.row()]); - split[indexToSetId[d.row()]].emplace_back(d.row(), d.col(), d.value()); - } - vector> indices(sets.size()); - vector> components(sets.size()); - for(uint32_t iset = 0; iset < sets.size(); iset++){ - Eigen::SparseMatrix component(sets[iset].size(), sets[iset].size()); - vector order(sets[iset].begin(), sets[iset].end()); - map index_map; - uint32_t iel = 0; - for(uint32_t el: order){ - index_map.emplace(el, iel++); - } - vector> remapped(split[iset].size()); - for(Eigen::Triplet t : split[iset]){ - remapped.emplace_back(index_map[t.row()], index_map[t.col()], t.value()); - } - component.setFromTriplets(remapped.begin(), remapped.end()); - components.emplace_back(move(component)); - indices.emplace_back(move(order)); - } - return std::make_pair(indices, components); - } - Eigen::SparseMatrix getMatrix(){ - Eigen::SparseMatrix m(n,n); - m.setFromTriplets(data.begin(), data.end()); - return m; - } -}; - -vector rep_bitset(const vector ¢roid, const vector *superset = nullptr) { - vector r(centroid.size()); - for (int c : centroid) - if(!superset || (*superset)[c]) - r[c] = true; - return r; -} - -vector cluster(DatabaseFile &db, const vector *filter) { - statistics.reset(); - config.command = Config::blastp; - config.no_self_hits = true; - //config.output_format = { "6", "qnum", "snum" }; - config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore" }; - config.query_cover = 80; - config.subject_cover = 80; - config.algo = 0; - config.index_mode = 0; - config.freq_sd = 0; - config.max_alignments = numeric_limits::max(); - - Workflow::Search::Options opt; - opt.db = &db; - opt.self = true; - Neighbors nb(db.ref_header.sequences); - opt.consumer = &nb; - opt.db_filter = filter; - - Workflow::Search::run(opt); - - return Util::Algo::greedy_vortex_cover(nb); -} - -void run_two_step_clustering(DatabaseFile& db){ - const size_t seq_count = db.ref_header.sequences; - - config.min_id = 70; - vector centroid1(cluster(db, nullptr)); - vector rep1(rep_bitset(centroid1)); - size_t n_rep1 = count_if(rep1.begin(), rep1.end(), [](bool x) { return x; }); - message_stream << "Clustering step 1 complete. #Input sequences: " << centroid1.size() << " #Clusters: " << n_rep1 << endl; - - config.mode_more_sensitive = true; - config.min_id = 0; - vector centroid2(cluster(db, &rep1)); - vector rep2(rep_bitset(centroid2, &rep1)); - for (size_t i = 0; i < centroid2.size(); ++i) - if (!rep1[i]) - centroid2[i] = centroid2[centroid1[i]]; - message_stream << "Clustering step 2 complete. #Input sequences: " << n_rep1 << " #Clusters: " << count_if(rep2.begin(), rep2.end(), [](bool x) { return x; }) << endl; - - task_timer timer("Generating output"); - Sequence_set *rep_seqs; - String_set *rep_ids; - vector rep_database_id, rep_block_id(seq_count); - db.rewind(); - db.load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2); - for (size_t i = 0; i < rep_database_id.size(); ++i) - rep_block_id[rep_database_id[i]] = (unsigned)i; - - ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str()); - vector seq; - string id; - db.seek_direct(); - Hsp hsp; - size_t n; - out->precision(3); - - for (int i = 0; i < (int)db.ref_header.sequences; ++i) { - db.read_seq(id, seq); - const unsigned r = rep_block_id[centroid2[i]]; - (*out) << blast_id(id) << '\t' - << blast_id((*rep_ids)[r]) << '\t'; - if ((int)i == centroid2[i]) - (*out) << "100\t100\t100\t0" << endl; - else { - Masking::get().bit_to_hard_mask(seq.data(), seq.size(), n); - smith_waterman(sequence(seq), (*rep_seqs)[r], hsp); - (*out) << hsp.id_percent() << '\t' - << hsp.query_cover_percent((unsigned)seq.size()) << '\t' - << hsp.subject_cover_percent((unsigned)(*rep_seqs)[r].length()) << '\t' - << score_matrix.bitscore(hsp.score) << endl; - } - } - - if (out != &cout) delete out; - delete rep_seqs; - delete rep_ids; -} - - -void run_transitive_closure_clustering(DatabaseFile& db){ - // testing - // LazyDisjointIntegralSet test(5); - // test.merge(1,3); - // test.merge(3,4); - // test.merge(0,2); - // auto los = test.getListOfSets(); - // for(auto& set: los){ - // for(auto item:set){ - // cout<< " " << item; - // } - // cout< s ( {"red","green","blue","rain", "sun", "snow"} ); - // LazyDisjointTypeSet ts(&s); - - // ts.merge("red", "green"); - // ts.merge("snow", "rain"); - // ts.merge("sun", "rain"); - // auto lo = ts.getListOfSets(); - // for(auto& set: lo){ - // cout<< "set :"; - // for(auto item:set){ - // cout<< " " << item; - // } - // cout<::max(); - - Workflow::Search::Options opt; - opt.db = &db; - opt.self = true; - NeighborStream ns(db.ref_header.sequences); - opt.consumer = &ns; - opt.db_filter = nullptr; - - Workflow::Search::run(opt); - - auto clusterResult = ns.getListOfSets(); - cout << "Found "< get_gamma(Eigen::SparseMatrix* m, float r){ - vector> data; - for (uint32_t k=0; kouterSize(); ++k){ - float colSum = 0.0f; - for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ - colSum += pow(it.value(),r); - } - for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ - data.emplace_back(it.row(), it.col(), pow(it.value(),r) / colSum); - } - } - Eigen::SparseMatrix gamma(m->rows(), m->cols()); - gamma.setFromTriplets(data.begin(), data.end()); - return gamma.pruned(1.0, std::numeric_limits::epsilon()); -} - -vector> get_list(Eigen::SparseMatrix* m){ - LazyDisjointIntegralSet disjointSet(m->cols()); - for (uint32_t k=0; kouterSize(); ++k){ - for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ - disjointSet.merge(it.row(), it.col()); - } - } - return disjointSet.getListOfSets(); -} - -vector> markov_process(Eigen::SparseMatrix* m){ - uint32_t iteration = 0; - double diff_norm = std::numeric_limits::max(); - *m = get_gamma(m,1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable - while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ - Eigen::SparseMatrix msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits::epsilon()); - Eigen::SparseMatrix m_update = get_gamma(&msquared, 2); - diff_norm = (*m - m_update).norm(); - *m = m_update.pruned(1.0, std::numeric_limits::epsilon()); - iteration++; - } - return get_list(m); -} - -void run_markov_clustering(DatabaseFile& db){ - // testing - // vector> data; - // data.emplace_back(0, 0, 0.2f); - // data.emplace_back(0, 1, 0.25f); - // data.emplace_back(0, 5, 0.333f); - // data.emplace_back(0, 6, 0.25f); - // data.emplace_back(0, 9, 0.25f); - // data.emplace_back(1, 0, 0.20f); - // data.emplace_back(1, 1, 0.25f); - // data.emplace_back(1, 2, 0.25f); - // data.emplace_back(1, 4, 0.20f); - // data.emplace_back(2, 1, 0.25f); - // data.emplace_back(2, 2, 0.25f); - // data.emplace_back(2, 3, 0.20f); - // data.emplace_back(2, 4, 0.20f); - // data.emplace_back(3, 2, 0.25f); - // data.emplace_back(3, 3, 0.20f); - // data.emplace_back(3, 7, 0.20f); - // data.emplace_back(3, 8, 0.20f); - // data.emplace_back(3, 10, 0.20f); - // data.emplace_back(4, 1, 0.25f); - // data.emplace_back(4, 2, 0.25f); - // data.emplace_back(4, 4, 0.20f); - // data.emplace_back(4, 6, 0.25f); - // data.emplace_back(4, 7, 0.20f); - // data.emplace_back(5, 0, 0.20f); - // data.emplace_back(5, 5, 0.333f); - // data.emplace_back(5, 9, 0.25f); - // data.emplace_back(6, 0, 0.20f); - // data.emplace_back(6, 4, 0.20f); - // data.emplace_back(6, 6, 0.25f); - // data.emplace_back(6, 9, 0.25f); - // data.emplace_back(7, 3, 0.20f); - // data.emplace_back(7, 4, 0.20f); - // data.emplace_back(7, 7, 0.20f); - // data.emplace_back(7, 8, 0.20f); - // data.emplace_back(7, 10, 0.20f); - // data.emplace_back(8, 3, 0.20f); - // data.emplace_back(8, 7, 0.20f); - // data.emplace_back(8, 8, 0.20f); - // data.emplace_back(8, 10, 0.20f); - // data.emplace_back(8, 11, 0.333f); - // data.emplace_back(9, 0, 0.20f); - // data.emplace_back(9, 5, 0.333f); - // data.emplace_back(9, 6, 0.25f); - // data.emplace_back(9, 9, 0.25f); - // data.emplace_back(10, 3, 0.20f); - // data.emplace_back(10, 7, 0.20f); - // data.emplace_back(10, 8, 0.20f); - // data.emplace_back(10, 10, 0.20f); - // data.emplace_back(10, 11, 0.333f); - // data.emplace_back(11, 8, 0.20f); - // data.emplace_back(11, 10, 0.20f); - // data.emplace_back(11, 11, 0.333f); - // Eigen::SparseMatrix tmp(12,12); - // tmp.setFromTriplets(data.begin(), data.end()); - // auto cr = markov_process(&tmp); - // cout << "Found "<::max(); - - Workflow::Search::Options opt; - opt.db = &db; - opt.self = true; - SparseMatrixStream ms(db.ref_header.sequences); - opt.consumer = &ms; - opt.db_filter = nullptr; - Workflow::Search::run(opt); - - auto componentInfo = ms.getComponents(); - vector> indices = get<0>(componentInfo); - vector> components = get<1>(componentInfo); - uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 0;}); - uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 1;}); - - cout << "DIAMOND done" << endl; - cout << "************" << endl; - cout << "Found " << nComponentsLt1 <<" ("<> clustering_result; - float max_sparsity = 0.0f; - float min_sparsity = 1.0f; - bool full = false; - if( full ){ - // TODO: According to the SIAM publication this is not valid, just for debugging - Eigen::SparseMatrix m = ms.getMatrix(); - for(unordered_set subset : markov_process(&m)){ - clustering_result.emplace_back(subset); - } - } - else { - // TODO: check if using dense matrices for non-sparse components improves performance - for(uint32_t iComponent = 0; iComponent order = indices[iComponent]; - if(order.size() > 1){ - Eigen::SparseMatrix m = components[iComponent]; - float sparsity = 1.0-(1.0 * m.nonZeros()) / (m.rows()*m.cols()); - max_sparsity = max(max_sparsity, sparsity); - min_sparsity = min(min_sparsity, sparsity); - // map back to original ids - for(unordered_set subset : markov_process(&m)){ - unordered_set s; - for(uint32_t el : subset){ - s.emplace(order[el]); - } - clustering_result.emplace_back(s); - } - } - else if (order.size() == 1){ - clustering_result.emplace_back(order.begin(), order.end()); - } - } - } - uint32_t nClusters = clustering_result.size(); - uint32_t nClustersLt1 = count_if(clustering_result.begin(), clustering_result.end(), [](unordered_set v){ return v.size() > 1;}); - cout << "Found "< db(DatabaseFile::auto_create_from_fasta()); - switch(config.cluster_algo){ - case Config::multi_step: - run_two_step_clustering(*db); - break; - case Config::transitive_closure: - run_transitive_closure_clustering(*db); - break; - case Config::markov_clustering: - run_markov_clustering(*db); - break; - } - db->close(); -} -}} diff --git a/src/run/main.cpp b/src/run/main.cpp index b597e41c..01df35a2 100644 --- a/src/run/main.cpp +++ b/src/run/main.cpp @@ -116,8 +116,15 @@ int main(int ac, const char* av[]) pairwise(); break; case Config::cluster: - Workflow::Cluster::run(); - break; + if(config.cluster_algo == "multi-step"){ + Workflow::Cluster::MultiStep::run(); + } + else if(config.cluster_algo == "mcl"){ + Workflow::Cluster::MCL::run(); + } + else{ + throw std::runtime_error("The --cluster-algo option currently only accepts \"multi-step\" and \"mcl\""); + } case Config::translate: translate(); break; diff --git a/src/run/workflow.h b/src/run/workflow.h index ce85b5a4..7e9c35ab 100644 --- a/src/run/workflow.h +++ b/src/run/workflow.h @@ -31,11 +31,14 @@ void run(const Options &options); } namespace Cluster { - +namespace MultiStep { void run(); - +} +namespace MCL { +void run(); +} } } -#endif \ No newline at end of file +#endif From 2383f40692749aeee00541a6f6d58d610983ad58 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sat, 25 Apr 2020 10:56:41 +0200 Subject: [PATCH 06/21] refactoring of clustering algos made it simpler to include further clustering algorithms in the future. --- CMakeLists.txt | 1 + src/basic/config.cpp | 10 ++++ src/basic/masking.h | 3 +- src/cluster/multi_step_cluster.cpp | 80 +++++++----------------------- src/run/main.cpp | 12 ++--- src/run/workflow.h | 11 ---- 6 files changed, 34 insertions(+), 83 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 91b00ab7..c1284de7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,6 +171,7 @@ set(OBJECTS src/align/gapped.cpp src/align/culling.cpp src/cluster/medoid.cpp + src/cluster/cluster_registry.cpp src/cluster/multi_step_cluster.cpp src/cluster/mcl.cpp src/align/output.cpp diff --git a/src/basic/config.cpp b/src/basic/config.cpp index b097858a..fb43885c 100644 --- a/src/basic/config.cpp +++ b/src/basic/config.cpp @@ -28,6 +28,7 @@ along with this program. If not, see . #include "shape_config.h" #include "../util/io/temp_file.h" #include "../basic/match.h" +#include "../cluster/cluster_registry.h" #include "../basic/translate.h" #include "../dp/dp.h" #include "masking.h" @@ -469,6 +470,15 @@ Config::Config(int argc, const char **argv, bool check_io) throw std::runtime_error("Custom scoring matrices require setting the --gapopen and --gapextend options."); score_matrix = Score_matrix(matrix_file, lambda, K, gap_open, gap_extend); } + if(command == Config::cluster && !Workflow::Cluster::ClusterRegistry::has(cluster_algo)){ + ostream &header_out = command == Config::help ? cout : cerr; + header_out << "Unkown clustering algorithm: " << cluster_algo << endl; + header_out << "Available options are: " << endl; + for(string c_algo : Workflow::Cluster::ClusterRegistry::getKeys()){ + header_out << "\t" << c_algo << endl; + } + throw std::runtime_error("Clustering algorithm not found."); + } message_stream << "Scoring parameters: " << score_matrix << endl; if (masking == 1) Masking::instance = unique_ptr(new Masking(score_matrix)); diff --git a/src/basic/masking.h b/src/basic/masking.h index 24e872d9..28f12c6f 100644 --- a/src/basic/masking.h +++ b/src/basic/masking.h @@ -16,6 +16,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . ****/ +#pragma once #include #include "value.h" #include "score_matrix.h" @@ -41,4 +42,4 @@ struct Masking char mask_table_x_[size], mask_table_bit_[size]; }; -size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true); \ No newline at end of file +size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true); diff --git a/src/cluster/multi_step_cluster.cpp b/src/cluster/multi_step_cluster.cpp index 6cb2f704..85f90abf 100644 --- a/src/cluster/multi_step_cluster.cpp +++ b/src/cluster/multi_step_cluster.cpp @@ -16,51 +16,12 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . ****/ -#include -#include -#include -#include -#include -#include -#include -#include "../util/system/system.h" -#include "../util/util.h" -#include "../basic/config.h" -#include "../data/reference.h" -#include "../run/workflow.h" -#include "../util/io/consumer.h" -#include "../util/algo/algo.h" -#include "../basic/statistics.h" -#include "../util/log_stream.h" -#include "../dp/dp.h" -#include "../basic/masking.h" +#include "multi_step_cluster.h" using namespace std; -namespace Workflow { namespace Cluster{ namespace MultiStep { - -struct Neighbors : public vector>, public Consumer { - Neighbors(size_t n): - vector>(n) - {} - virtual void consume(const char *ptr, size_t n) override { - int query, subject, count; - float qcov, scov, bitscore; - const char *end = ptr + n; - while (ptr < end) { - //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) - if (sscanf(ptr, "%i\t%i\t%f\t%f\t%f\n%n", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) - throw runtime_error("Cluster format error."); - ptr += count; - //cout << query << '\t' << subject << '\t' << qcov << '\t' << scov << '\t' << endl; - (*this)[query].push_back(subject); - edges.push_back({ query, subject, (int)bitscore }); - } - } - vector edges; -}; - -vector rep_bitset(const vector ¢roid, const vector *superset = nullptr) { +namespace Workflow { namespace Cluster{ +vector MultiStep::rep_bitset(const vector ¢roid, const vector *superset) { vector r(centroid.size()); for (int c : centroid) if(!superset || (*superset)[c]) @@ -68,7 +29,7 @@ vector rep_bitset(const vector ¢roid, const vector *superse return r; } -vector cluster(DatabaseFile &db, const vector *filter) { +vector MultiStep::cluster(DatabaseFile &db, const vector *filter) { statistics.reset(); config.command = Config::blastp; config.no_self_hits = true; @@ -93,18 +54,22 @@ vector cluster(DatabaseFile &db, const vector *filter) { return Util::Algo::greedy_vortex_cover(nb); } -void run_two_step_clustering(DatabaseFile& db){ - const size_t seq_count = db.ref_header.sequences; +void MultiStep::run_clustering() { + if (config.database == "") + throw runtime_error("Missing parameter: database file (--db/-d)"); + config.command = Config::makedb; + unique_ptr db(DatabaseFile::auto_create_from_fasta()); + const size_t seq_count = db->ref_header.sequences; config.min_id = 70; - vector centroid1(cluster(db, nullptr)); + vector centroid1(cluster(*db, nullptr)); vector rep1(rep_bitset(centroid1)); size_t n_rep1 = count_if(rep1.begin(), rep1.end(), [](bool x) { return x; }); message_stream << "Clustering step 1 complete. #Input sequences: " << centroid1.size() << " #Clusters: " << n_rep1 << endl; config.mode_more_sensitive = true; config.min_id = 0; - vector centroid2(cluster(db, &rep1)); + vector centroid2(cluster(*db, &rep1)); vector rep2(rep_bitset(centroid2, &rep1)); for (size_t i = 0; i < centroid2.size(); ++i) if (!rep1[i]) @@ -115,21 +80,21 @@ void run_two_step_clustering(DatabaseFile& db){ Sequence_set *rep_seqs; String_set *rep_ids; vector rep_database_id, rep_block_id(seq_count); - db.rewind(); - db.load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2); + db->rewind(); + db->load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2); for (size_t i = 0; i < rep_database_id.size(); ++i) rep_block_id[rep_database_id[i]] = (unsigned)i; ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str()); vector seq; string id; - db.seek_direct(); + db->seek_direct(); Hsp hsp; size_t n; out->precision(3); - for (int i = 0; i < (int)db.ref_header.sequences; ++i) { - db.read_seq(id, seq); + for (int i = 0; i < (int)db->ref_header.sequences; ++i) { + db->read_seq(id, seq); const unsigned r = rep_block_id[centroid2[i]]; (*out) << blast_id(id) << '\t' << blast_id((*rep_ids)[r]) << '\t'; @@ -144,18 +109,9 @@ void run_two_step_clustering(DatabaseFile& db){ << score_matrix.bitscore(hsp.score) << endl; } } - if (out != &cout) delete out; delete rep_seqs; delete rep_ids; -} - -void run() { - if (config.database == "") - throw runtime_error("Missing parameter: database file (--db/-d)"); - config.command = Config::makedb; - unique_ptr db(DatabaseFile::auto_create_from_fasta()); - run_two_step_clustering(*db); db->close(); } -}}} +}} diff --git a/src/run/main.cpp b/src/run/main.cpp index d5b73201..65a77c79 100644 --- a/src/run/main.cpp +++ b/src/run/main.cpp @@ -21,6 +21,7 @@ along with this program. If not, see . #include "tools.h" #include "../data/reference.h" #include "workflow.h" +#include "../cluster/cluster_registry.h" #include "../util/simd.h" #ifdef EXTRA #include "../extra/compare.h" @@ -117,15 +118,8 @@ int main(int ac, const char* av[]) pairwise(); break; case Config::cluster: - if(config.cluster_algo == "multi-step"){ - Workflow::Cluster::MultiStep::run(); - } - else if(config.cluster_algo == "mcl"){ - Workflow::Cluster::MCL::run(); - } - else{ - throw std::runtime_error("The --cluster-algo option currently only accepts \"multi-step\" and \"mcl\""); - } + Workflow::Cluster::ClusterRegistry::get(config.cluster_algo)->run(); + break; case Config::translate: translate(); break; diff --git a/src/run/workflow.h b/src/run/workflow.h index 7e9c35ab..babccf95 100644 --- a/src/run/workflow.h +++ b/src/run/workflow.h @@ -8,7 +8,6 @@ struct Consumer; struct TextInputFile; namespace Workflow { - namespace Search { struct Options { @@ -29,16 +28,6 @@ struct Options { void run(const Options &options); } - -namespace Cluster { -namespace MultiStep { -void run(); -} -namespace MCL { -void run(); -} -} - } #endif From 495001cd2fe7453b1f2a9767da0ef1866c684ffe Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sat, 25 Apr 2020 10:58:04 +0200 Subject: [PATCH 07/21] add forgotten files --- src/cluster/cluster.h | 7 + src/cluster/cluster_registry.cpp | 10 ++ src/cluster/cluster_registry.h | 43 +++++ src/cluster/clustering_algorithms.cpp | 30 ++++ src/cluster/mcl.cpp | 224 ++++++++++++++++++++++++++ src/cluster/mcl.h | 129 +++++++++++++++ src/cluster/multi_step_cluster.h | 78 +++++++++ 7 files changed, 521 insertions(+) create mode 100644 src/cluster/cluster.h create mode 100644 src/cluster/cluster_registry.cpp create mode 100644 src/cluster/cluster_registry.h create mode 100644 src/cluster/clustering_algorithms.cpp create mode 100644 src/cluster/mcl.cpp create mode 100644 src/cluster/mcl.h create mode 100644 src/cluster/multi_step_cluster.h diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h new file mode 100644 index 00000000..2fdff00a --- /dev/null +++ b/src/cluster/cluster.h @@ -0,0 +1,7 @@ +#pragma once +#include +class ClusteringAlgorithm { +public: + virtual void run() = 0; + static std::string get_key(); +}; diff --git a/src/cluster/cluster_registry.cpp b/src/cluster/cluster_registry.cpp new file mode 100644 index 00000000..ddf883da --- /dev/null +++ b/src/cluster/cluster_registry.cpp @@ -0,0 +1,10 @@ +#include "cluster_registry.h" + +using namespace std; + +namespace Workflow { namespace Cluster{ + map ClusterRegistry::regMap; + MCL ClusterRegistry::mcl; + MultiStep ClusterRegistry::multiStep; + ClusterRegistry::StaticConstructor ClusterRegistry::_staticConstructor; +}} diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h new file mode 100644 index 00000000..53653931 --- /dev/null +++ b/src/cluster/cluster_registry.h @@ -0,0 +1,43 @@ +#pragma once +#include "cluster.h" +#include "mcl.h" +#include "multi_step_cluster.h" + +using namespace std; + +namespace Workflow { namespace Cluster{ +class ClusterRegistry{ +private: + ClusterRegistry(){}; + static MCL mcl; + static MultiStep multiStep; +public: + static map regMap; + static ClusteringAlgorithm* get(string key){ + auto ca = ClusterRegistry::regMap.find(key); + if(ca == ClusterRegistry::regMap.end()){ + throw std::runtime_error("Clustering algorithm not found."); + } + return ca->second; + } + static bool has(string key){ + return ClusterRegistry::regMap.find(key) != ClusterRegistry::regMap.end(); + } + static vector getKeys(){ + auto it = regMap.begin(); + vector keys; + while(it != regMap.end()){ + keys.push_back(it->first); + it++; + } + return keys; + } + static struct StaticConstructor { + StaticConstructor() { + regMap.emplace(multiStep.get_key(), &multiStep); + regMap.emplace(mcl.get_key(), &mcl); + } + } _staticConstructor; +}; + +}} diff --git a/src/cluster/clustering_algorithms.cpp b/src/cluster/clustering_algorithms.cpp new file mode 100644 index 00000000..ad423373 --- /dev/null +++ b/src/cluster/clustering_algorithms.cpp @@ -0,0 +1,30 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2018 Benjamin Buchfink + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + +#include "cluster.h" +#include "cluster.h" + +using namespace std; + +namespace Workflow { namespace Cluster{ + static map clustering_algorithms; + static{ + clustering_algorithms.add(C + } + +}} diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp new file mode 100644 index 00000000..b2cc8eee --- /dev/null +++ b/src/cluster/mcl.cpp @@ -0,0 +1,224 @@ +#include "mcl.h" + +using namespace std; + +namespace Workflow { namespace Cluster{ + +Eigen::SparseMatrix MCL::get_gamma(Eigen::SparseMatrix* m, float r){ + vector> data; + for (uint32_t k=0; kouterSize(); ++k){ + float colSum = 0.0f; + for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + colSum += pow(it.value(),r); + } + for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + data.emplace_back(it.row(), it.col(), pow(it.value(),r) / colSum); + } + } + Eigen::SparseMatrix gamma(m->rows(), m->cols()); + gamma.setFromTriplets(data.begin(), data.end()); + return gamma.pruned(1.0, std::numeric_limits::epsilon()); +} + +vector> MCL::get_list(Eigen::SparseMatrix* m){ + LazyDisjointIntegralSet disjointSet(m->cols()); + for (uint32_t k=0; kouterSize(); ++k){ + for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + disjointSet.merge(it.row(), it.col()); + } + } + return disjointSet.getListOfSets(); +} + +vector> MCL::markov_process(Eigen::SparseMatrix* m){ + uint32_t iteration = 0; + double diff_norm = std::numeric_limits::max(); + *m = get_gamma(m,1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable + while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ + Eigen::SparseMatrix msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits::epsilon()); + Eigen::SparseMatrix m_update = get_gamma(&msquared, 2); + diff_norm = (*m - m_update).norm(); + *m = m_update.pruned(1.0, std::numeric_limits::epsilon()); + iteration++; + } + return get_list(m); +} + +void MCL::run_clustering() { + if (config.database == "") + throw runtime_error("Missing parameter: database file (--db/-d)"); + config.command = Config::makedb; + unique_ptr db(DatabaseFile::auto_create_from_fasta()); + + // testing + // LazyDisjointIntegralSet test(5); + // test.merge(1,3); + // test.merge(3,4); + // test.merge(0,2); + // auto los = test.getListOfSets(); + // for(auto& set: los){ + // for(auto item:set){ + // cout<< " " << item; + // } + // cout< s ( {"red","green","blue","rain", "sun", "snow"} ); + // LazyDisjointTypeSet ts(&s); + + // ts.merge("red", "green"); + // ts.merge("snow", "rain"); + // ts.merge("sun", "rain"); + // auto lo = ts.getListOfSets(); + // for(auto& set: lo){ + // cout<< "set :"; + // for(auto item:set){ + // cout<< " " << item; + // } + // cout<> data; + // data.emplace_back(0, 0, 0.2f); + // data.emplace_back(0, 1, 0.25f); + // data.emplace_back(0, 5, 0.333f); + // data.emplace_back(0, 6, 0.25f); + // data.emplace_back(0, 9, 0.25f); + // data.emplace_back(1, 0, 0.20f); + // data.emplace_back(1, 1, 0.25f); + // data.emplace_back(1, 2, 0.25f); + // data.emplace_back(1, 4, 0.20f); + // data.emplace_back(2, 1, 0.25f); + // data.emplace_back(2, 2, 0.25f); + // data.emplace_back(2, 3, 0.20f); + // data.emplace_back(2, 4, 0.20f); + // data.emplace_back(3, 2, 0.25f); + // data.emplace_back(3, 3, 0.20f); + // data.emplace_back(3, 7, 0.20f); + // data.emplace_back(3, 8, 0.20f); + // data.emplace_back(3, 10, 0.20f); + // data.emplace_back(4, 1, 0.25f); + // data.emplace_back(4, 2, 0.25f); + // data.emplace_back(4, 4, 0.20f); + // data.emplace_back(4, 6, 0.25f); + // data.emplace_back(4, 7, 0.20f); + // data.emplace_back(5, 0, 0.20f); + // data.emplace_back(5, 5, 0.333f); + // data.emplace_back(5, 9, 0.25f); + // data.emplace_back(6, 0, 0.20f); + // data.emplace_back(6, 4, 0.20f); + // data.emplace_back(6, 6, 0.25f); + // data.emplace_back(6, 9, 0.25f); + // data.emplace_back(7, 3, 0.20f); + // data.emplace_back(7, 4, 0.20f); + // data.emplace_back(7, 7, 0.20f); + // data.emplace_back(7, 8, 0.20f); + // data.emplace_back(7, 10, 0.20f); + // data.emplace_back(8, 3, 0.20f); + // data.emplace_back(8, 7, 0.20f); + // data.emplace_back(8, 8, 0.20f); + // data.emplace_back(8, 10, 0.20f); + // data.emplace_back(8, 11, 0.333f); + // data.emplace_back(9, 0, 0.20f); + // data.emplace_back(9, 5, 0.333f); + // data.emplace_back(9, 6, 0.25f); + // data.emplace_back(9, 9, 0.25f); + // data.emplace_back(10, 3, 0.20f); + // data.emplace_back(10, 7, 0.20f); + // data.emplace_back(10, 8, 0.20f); + // data.emplace_back(10, 10, 0.20f); + // data.emplace_back(10, 11, 0.333f); + // data.emplace_back(11, 8, 0.20f); + // data.emplace_back(11, 10, 0.20f); + // data.emplace_back(11, 11, 0.333f); + // Eigen::SparseMatrix tmp(12,12); + // tmp.setFromTriplets(data.begin(), data.end()); + // auto cr = markov_process(&tmp); + // cout << "Found "<ref_header.sequences; + statistics.reset(); + config.command = Config::blastp; + config.no_self_hits = true; + config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore", "pident" }; + config.query_cover = 80; + config.subject_cover = 80; + config.algo = 0; + config.index_mode = 0; + config.freq_sd = 0; + config.max_alignments = numeric_limits::max(); + + Workflow::Search::Options opt; + opt.db = &(*db); + opt.self = true; + SparseMatrixStream ms(db->ref_header.sequences); + opt.consumer = &ms; + opt.db_filter = nullptr; + Workflow::Search::run(opt); + + auto componentInfo = ms.getComponents(); + vector> indices = get<0>(componentInfo); + vector> components = get<1>(componentInfo); + uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 0;}); + uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 1;}); + + cout << "DIAMOND done" << endl; + cout << "************" << endl; + cout << "Found " << nComponentsLt1 <<" ("<> clustering_result; + float max_sparsity = 0.0f; + float min_sparsity = 1.0f; + bool full = false; + if( full ){ + // TODO: According to the SIAM publication this is not valid, just for debugging + Eigen::SparseMatrix m = ms.getMatrix(); + for(unordered_set subset : markov_process(&m)){ + clustering_result.emplace_back(subset); + } + } + else { + // TODO: check if using dense matrices for non-sparse components improves performance + for(uint32_t iComponent = 0; iComponent order = indices[iComponent]; + if(order.size() > 1){ + Eigen::SparseMatrix m = components[iComponent]; + float sparsity = 1.0-(1.0 * m.nonZeros()) / (m.rows()*m.cols()); + max_sparsity = max(max_sparsity, sparsity); + min_sparsity = min(min_sparsity, sparsity); + // map back to original ids + for(unordered_set subset : markov_process(&m)){ + unordered_set s; + for(uint32_t el : subset){ + s.emplace(order[el]); + } + clustering_result.emplace_back(s); + } + } + else if (order.size() == 1){ + clustering_result.emplace_back(order.begin(), order.end()); + } + } + } + uint32_t nClusters = clustering_result.size(); + uint32_t nClustersLt1 = count_if(clustering_result.begin(), clustering_result.end(), [](unordered_set v){ return v.size() > 1;}); + cout << "Found "<close(); +} +}} diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h new file mode 100644 index 00000000..4e5c1226 --- /dev/null +++ b/src/cluster/mcl.h @@ -0,0 +1,129 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2018 Benjamin Buchfink + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include "../util/system/system.h" +#include "../util/util.h" +#include "../basic/config.h" +#include "../data/reference.h" +#include "../run/workflow.h" +#include "disjoint_set.h" +#include "../util/io/consumer.h" +#include "../util/algo/algo.h" +#include "../basic/statistics.h" +#include "../util/log_stream.h" +#include "../dp/dp.h" +#include "cluster.h" + +using namespace std; + +namespace Workflow { namespace Cluster{ +class MCL: public ClusteringAlgorithm { +private: + Eigen::SparseMatrix get_gamma(Eigen::SparseMatrix* m, float r); + vector> get_list(Eigen::SparseMatrix* m); + vector> markov_process(Eigen::SparseMatrix* m); + void run_clustering(); +public: + void run(){ + run_clustering(); + } + static string get_key(){ + return "mcl"; + } +}; + +template +class SparseMatrixStream : public Consumer { + size_t n; + vector> data; + LazyDisjointSet* disjointSet; + virtual void consume(const char *ptr, size_t n) override { + size_t query, subject, count; + float qcov, scov, bitscore, id; + const char *end = ptr + n; + while (ptr < end) { + //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) + if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) + throw runtime_error("Cluster format error."); + data.emplace_back(query, subject, (qcov/100.0f) * (scov/100.0f) * (id/100.0f)); + disjointSet->merge(query, subject); + ptr += count; + } + } +public: + SparseMatrixStream(size_t n){ + this->n = n; + disjointSet = new LazyDisjointIntegralSet(n); + // Note: this makes sure the self hits are always included, the value needs to be aligned with the measure used in consume! + for(uint32_t idiag=0; idiag>, vector>> getComponents(){ + vector> sets = disjointSet->getListOfSets(); + vector>> split(sets.size()); + unordered_map indexToSetId; + for(uint32_t iset = 0; iset < sets.size(); iset++){ + split.push_back(vector>()); + for(size_t index : sets[iset]){ + indexToSetId.emplace(index, iset); + } + } + for(Eigen::Triplet d : data){ + assert(indexToSetId[d.row()] == indexToSetId[d.row()]); + split[indexToSetId[d.row()]].emplace_back(d.row(), d.col(), d.value()); + } + vector> indices(sets.size()); + vector> components(sets.size()); + for(uint32_t iset = 0; iset < sets.size(); iset++){ + Eigen::SparseMatrix component(sets[iset].size(), sets[iset].size()); + vector order(sets[iset].begin(), sets[iset].end()); + map index_map; + uint32_t iel = 0; + for(uint32_t el: order){ + index_map.emplace(el, iel++); + } + vector> remapped(split[iset].size()); + for(Eigen::Triplet t : split[iset]){ + remapped.emplace_back(index_map[t.row()], index_map[t.col()], t.value()); + } + component.setFromTriplets(remapped.begin(), remapped.end()); + components.emplace_back(move(component)); + indices.emplace_back(move(order)); + } + return make_pair(move(indices), move(components)); + } + Eigen::SparseMatrix getMatrix(){ + Eigen::SparseMatrix m(n,n); + m.setFromTriplets(data.begin(), data.end()); + return m; + } +}; +}} diff --git a/src/cluster/multi_step_cluster.h b/src/cluster/multi_step_cluster.h new file mode 100644 index 00000000..b4cc2258 --- /dev/null +++ b/src/cluster/multi_step_cluster.h @@ -0,0 +1,78 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2018 Benjamin Buchfink + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include "../util/system/system.h" +#include "../util/util.h" +#include "../basic/config.h" +#include "../data/reference.h" +#include "../run/workflow.h" +#include "../util/io/consumer.h" +#include "../util/algo/algo.h" +#include "../basic/statistics.h" +#include "../util/log_stream.h" +#include "../dp/dp.h" +#include "../basic/masking.h" +#include "cluster.h" + +using namespace std; + +namespace Workflow { namespace Cluster{ + +class MultiStep : public ClusteringAlgorithm { +private: + vector rep_bitset(const vector ¢roid, const vector *superset = nullptr); + vector cluster(DatabaseFile &db, const vector *filter); + void run_clustering(); +public: + void run(){ + run_clustering(); + } + static string get_key(){ + return "multi-step"; + } +}; + +struct Neighbors : public vector>, public Consumer { + Neighbors(size_t n): + vector>(n) + {} + virtual void consume(const char *ptr, size_t n) override { + int query, subject, count; + float qcov, scov, bitscore; + const char *end = ptr + n; + while (ptr < end) { + //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) + if (sscanf(ptr, "%i\t%i\t%f\t%f\t%f\n%n", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) + throw runtime_error("Cluster format error."); + ptr += count; + //cout << query << '\t' << subject << '\t' << qcov << '\t' << scov << '\t' << endl; + (*this)[query].push_back(subject); + edges.push_back({ query, subject, (int)bitscore }); + } + } + vector edges; +}; + +}} From d49c9a4f0318f4c7bb22df311a89c33f1b544249 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sat, 25 Apr 2020 11:01:39 +0200 Subject: [PATCH 08/21] adding headers and descriptions --- src/cluster/cluster.h | 18 ++++++++++++++++ src/cluster/cluster_registry.cpp | 18 ++++++++++++++++ src/cluster/cluster_registry.h | 20 ++++++++++++++++++ src/cluster/clustering_algorithms.cpp | 30 --------------------------- src/cluster/mcl.cpp | 18 ++++++++++++++++ 5 files changed, 74 insertions(+), 30 deletions(-) delete mode 100644 src/cluster/clustering_algorithms.cpp diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h index 2fdff00a..fcbbb875 100644 --- a/src/cluster/cluster.h +++ b/src/cluster/cluster.h @@ -1,3 +1,21 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2018 Benjamin Buchfink + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + #pragma once #include class ClusteringAlgorithm { diff --git a/src/cluster/cluster_registry.cpp b/src/cluster/cluster_registry.cpp index ddf883da..b3345f5c 100644 --- a/src/cluster/cluster_registry.cpp +++ b/src/cluster/cluster_registry.cpp @@ -1,3 +1,21 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2018 Benjamin Buchfink + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + #include "cluster_registry.h" using namespace std; diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h index 53653931..7f89d20f 100644 --- a/src/cluster/cluster_registry.h +++ b/src/cluster/cluster_registry.h @@ -1,3 +1,21 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2018 Benjamin Buchfink + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + #pragma once #include "cluster.h" #include "mcl.h" @@ -9,6 +27,7 @@ namespace Workflow { namespace Cluster{ class ClusterRegistry{ private: ClusterRegistry(){}; + // To include new clustering algorithms add the instantiation here and in the cluster_registry.cpp file. Then add it to the StaticConstructor below static MCL mcl; static MultiStep multiStep; public: @@ -34,6 +53,7 @@ class ClusterRegistry{ } static struct StaticConstructor { StaticConstructor() { + // Add any new clustering algorithm here regMap.emplace(multiStep.get_key(), &multiStep); regMap.emplace(mcl.get_key(), &mcl); } diff --git a/src/cluster/clustering_algorithms.cpp b/src/cluster/clustering_algorithms.cpp deleted file mode 100644 index ad423373..00000000 --- a/src/cluster/clustering_algorithms.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/**** -DIAMOND protein aligner -Copyright (C) 2013-2018 Benjamin Buchfink - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -****/ - -#include "cluster.h" -#include "cluster.h" - -using namespace std; - -namespace Workflow { namespace Cluster{ - static map clustering_algorithms; - static{ - clustering_algorithms.add(C - } - -}} diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index b2cc8eee..d47a0d44 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -1,3 +1,21 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2013-2018 Benjamin Buchfink + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + #include "mcl.h" using namespace std; From c31e615c536511c8b2762fd6672e01a796fb66e3 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Mon, 4 May 2020 19:51:43 +0200 Subject: [PATCH 09/21] several improvements - updated the cluster registry to also print a bit of info for the available clustering algos - implemented a dense matrix mcl clustering and an automatic switch at 80% sparsity - fixed a (previously silent) bug related to putting entries into the matrices --- src/basic/config.cpp | 2 +- src/cluster/cluster.h | 3 +- src/cluster/cluster_registry.h | 2 +- src/cluster/mcl.cpp | 196 +++++++++++++++++++++++------ src/cluster/mcl.h | 53 ++++---- src/cluster/multi_step_cluster.cpp | 9 +- src/cluster/multi_step_cluster.h | 10 +- 7 files changed, 197 insertions(+), 78 deletions(-) diff --git a/src/basic/config.cpp b/src/basic/config.cpp index fb43885c..ac4b235e 100644 --- a/src/basic/config.cpp +++ b/src/basic/config.cpp @@ -475,7 +475,7 @@ Config::Config(int argc, const char **argv, bool check_io) header_out << "Unkown clustering algorithm: " << cluster_algo << endl; header_out << "Available options are: " << endl; for(string c_algo : Workflow::Cluster::ClusterRegistry::getKeys()){ - header_out << "\t" << c_algo << endl; + header_out << "\t" << c_algo << "\t"<< Workflow::Cluster::ClusterRegistry::get(c_algo)->get_description() << endl; } throw std::runtime_error("Clustering algorithm not found."); } diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h index fcbbb875..864cd79b 100644 --- a/src/cluster/cluster.h +++ b/src/cluster/cluster.h @@ -21,5 +21,6 @@ along with this program. If not, see . class ClusteringAlgorithm { public: virtual void run() = 0; - static std::string get_key(); + virtual std::string get_key() = 0; + virtual std::string get_description() = 0; }; diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h index 7f89d20f..d6c872da 100644 --- a/src/cluster/cluster_registry.h +++ b/src/cluster/cluster_registry.h @@ -33,7 +33,7 @@ class ClusterRegistry{ public: static map regMap; static ClusteringAlgorithm* get(string key){ - auto ca = ClusterRegistry::regMap.find(key); + map::iterator ca = ClusterRegistry::regMap.find(key); if(ca == ClusterRegistry::regMap.end()){ throw std::runtime_error("Clustering algorithm not found."); } diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index d47a0d44..e25bf696 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -15,27 +15,76 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ****/ - #include "mcl.h" using namespace std; namespace Workflow { namespace Cluster{ +string MCL::get_key() { + return "mcl"; +} +string MCL::get_description() { + return "Markov clustering according to doi:10.1137/040608635"; +} + +void MCL::get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r){ + if( r - (int) r == 0 ){ + // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues + *out = *in; + for(uint32_t i=1; ipruned(); +} + +void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ + if( r - (int) r == 0 ){ + // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues + *out = *in; + for(uint32_t i=1; i solver(*in); + Eigen::MatrixXcf d = solver.eigenvalues().asDiagonal(); + for(uint32_t idiag = 0; idiag MCL::get_gamma(Eigen::SparseMatrix* m, float r){ +void MCL::get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r){ vector> data; - for (uint32_t k=0; kouterSize(); ++k){ + for (uint32_t k=0; kouterSize(); ++k){ float colSum = 0.0f; - for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + for (Eigen::SparseMatrix::InnerIterator it(*in, k); it; ++it){ colSum += pow(it.value(),r); } - for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ + for (Eigen::SparseMatrix::InnerIterator it(*in, k); it; ++it){ data.emplace_back(it.row(), it.col(), pow(it.value(),r) / colSum); } } - Eigen::SparseMatrix gamma(m->rows(), m->cols()); - gamma.setFromTriplets(data.begin(), data.end()); - return gamma.pruned(1.0, std::numeric_limits::epsilon()); + out->setFromTriplets(data.begin(), data.end(), [] (const float&, const float &b) { return b; }); + *out = out->pruned(1.0, std::numeric_limits::epsilon()); +} + +void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ + for (uint32_t icol=0; icolcols(); ++icol){ + float colSum = 0.0f; + for (uint32_t irow=0; irowrows(); ++irow){ + colSum += pow((*in)(irow, icol) , r); + } + for (uint32_t irow=0; irowrows(); ++irow){ + (*out)(irow, icol) = pow((*in)(irow, icol) , r)/colSum; + } + } } vector> MCL::get_list(Eigen::SparseMatrix* m){ @@ -48,21 +97,53 @@ vector> MCL::get_list(Eigen::SparseMatrix* m){ return disjointSet.getListOfSets(); } +vector> MCL::get_list(Eigen::MatrixXf* m){ + LazyDisjointIntegralSet disjointSet(m->cols()); + for (uint32_t icol=0; icolcols(); ++icol){ + for (uint32_t irow=0; irowrows(); ++irow){ + if( abs((*m)(irow, icol)) > 1e-7 ){ + disjointSet.merge(irow, icol); + } + } + } + return disjointSet.getListOfSets(); +} + vector> MCL::markov_process(Eigen::SparseMatrix* m){ uint32_t iteration = 0; double diff_norm = std::numeric_limits::max(); - *m = get_gamma(m,1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable + Eigen::SparseMatrix msquared(m->rows(), m->cols()); + Eigen::SparseMatrix m_update(m->rows(), m->cols()); + get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable + while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ + get_exp(m, &msquared, 2); + get_gamma(&msquared, &m_update, 2); + *m -= m_update; + diff_norm = m->norm(); + *m = m_update; + iteration++; + } + return get_list(m); +} + +vector> MCL::markov_process(Eigen::MatrixXf* m){ + uint32_t iteration = 0; + double diff_norm = std::numeric_limits::max(); + Eigen::MatrixXf msquared(m->rows(), m->cols()); + Eigen::MatrixXf m_update(m->rows(), m->cols()); + get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ - Eigen::SparseMatrix msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits::epsilon()); - Eigen::SparseMatrix m_update = get_gamma(&msquared, 2); - diff_norm = (*m - m_update).norm(); - *m = m_update.pruned(1.0, std::numeric_limits::epsilon()); + get_exp(m, &msquared, 2); + get_gamma(&msquared, &m_update, 2); + *m -= m_update; + diff_norm = m->norm(); + *m = m_update; iteration++; } return get_list(m); } -void MCL::run_clustering() { +void MCL::run(){ if (config.database == "") throw runtime_error("Missing parameter: database file (--db/-d)"); config.command = Config::makedb; @@ -165,7 +246,7 @@ void MCL::run_clustering() { const size_t seq_count = db->ref_header.sequences; statistics.reset(); config.command = Config::blastp; - config.no_self_hits = true; + config.no_self_hits = false; config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore", "pident" }; config.query_cover = 80; config.subject_cover = 80; @@ -182,25 +263,35 @@ void MCL::run_clustering() { opt.db_filter = nullptr; Workflow::Search::run(opt); + task_timer timer; + timer.go("Computing independent components"); auto componentInfo = ms.getComponents(); vector> indices = get<0>(componentInfo); - vector> components = get<1>(componentInfo); + vector>> components = get<1>(componentInfo); uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 0;}); uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 1;}); - - cout << "DIAMOND done" << endl; - cout << "************" << endl; + timer.finish(); cout << "Found " << nComponentsLt1 <<" ("<> clustering_result; + timer.go("Clustering components"); + bool full = false; + vector clustering_result(db->ref_header.sequences, std::numeric_limits::max()); float max_sparsity = 0.0f; float min_sparsity = 1.0f; - bool full = false; + uint32_t cluster_id = 0; + uint32_t nClustersEq1 = 0; + uint32_t n_dense_calculations = 0; + uint32_t n_sparse_calculations = 0; + if( full ){ // TODO: According to the SIAM publication this is not valid, just for debugging Eigen::SparseMatrix m = ms.getMatrix(); for(unordered_set subset : markov_process(&m)){ - clustering_result.emplace_back(subset); + for(uint32_t el : subset){ + clustering_result[el] = cluster_id; + } + cluster_id++; + if(subset.size() == 1) nClustersEq1 ++; } } else { @@ -208,35 +299,60 @@ void MCL::run_clustering() { for(uint32_t iComponent = 0; iComponent order = indices[iComponent]; if(order.size() > 1){ - Eigen::SparseMatrix m = components[iComponent]; - float sparsity = 1.0-(1.0 * m.nonZeros()) / (m.rows()*m.cols()); + vector> m = components[iComponent]; + assert(m.size() <= order.size() * order.size()); + float sparsity = 1.0-(1.0 * m.size()) / (order.size()*order.size()); max_sparsity = max(max_sparsity, sparsity); min_sparsity = min(min_sparsity, sparsity); // map back to original ids - for(unordered_set subset : markov_process(&m)){ - unordered_set s; + vector> list_of_sets; + if(sparsity >= 0.8){ //TODO: a size limit should be given as well + n_sparse_calculations++; + Eigen::SparseMatrix m_sparse(order.size(), order.size()); + m_sparse.setFromTriplets(m.begin(), m.end()); + list_of_sets = markov_process(&m_sparse); + } + else{ + n_dense_calculations++; + Eigen::MatrixXf m_dense(order.size(), order.size()); + for(Eigen::Triplet const & t : m){ + m_dense(t.row(), t.col()) = t.value(); + } + list_of_sets = markov_process(&m_dense); + } + for(unordered_set subset : list_of_sets){ for(uint32_t el : subset){ - s.emplace(order[el]); + clustering_result[order[el]] = cluster_id; } - clustering_result.emplace_back(s); + cluster_id++; + if(subset.size() == 1) nClustersEq1 ++; } } else if (order.size() == 1){ - clustering_result.emplace_back(order.begin(), order.end()); + clustering_result[order[0]] = cluster_id++; + nClustersEq1++; } } } - uint32_t nClusters = clustering_result.size(); - uint32_t nClustersLt1 = count_if(clustering_result.begin(), clustering_result.end(), [](unordered_set v){ return v.size() > 1;}); - cout << "Found "< seq; + string id; + db->seek_direct(); + Hsp hsp; + size_t n; + out->precision(3); + for (int i = 0; i < (int)db->ref_header.sequences; ++i) { + db->read_seq(id, seq); + (*out) << blast_id(id) << '\t' << clustering_result[i] << endl; + id.clear(); + seq.clear(); + } + if (out != &cout) delete out; db->close(); + timer.finish(); } }} diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h index 4e5c1226..511433d6 100644 --- a/src/cluster/mcl.h +++ b/src/cluster/mcl.h @@ -18,6 +18,7 @@ along with this program. If not, see . #pragma once #include +#include #include #include #include @@ -43,17 +44,18 @@ using namespace std; namespace Workflow { namespace Cluster{ class MCL: public ClusteringAlgorithm { private: - Eigen::SparseMatrix get_gamma(Eigen::SparseMatrix* m, float r); + void get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r); + void get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r); + void get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r); + void get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r); vector> get_list(Eigen::SparseMatrix* m); + vector> get_list(Eigen::MatrixXf* m); vector> markov_process(Eigen::SparseMatrix* m); - void run_clustering(); + vector> markov_process(Eigen::MatrixXf* m); public: - void run(){ - run_clustering(); - } - static string get_key(){ - return "mcl"; - } + void run(); + string get_key(); + string get_description(); }; template @@ -66,10 +68,10 @@ class SparseMatrixStream : public Consumer { float qcov, scov, bitscore, id; const char *end = ptr + n; while (ptr < end) { - //if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2) if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) throw runtime_error("Cluster format error."); - data.emplace_back(query, subject, (qcov/100.0f) * (scov/100.0f) * (id/100.0f)); + const float value = (qcov/100.0f) * (scov/100.0f) * (id/100.0f); + data.emplace_back(query, subject, value); disjointSet->merge(query, subject); ptr += count; } @@ -78,44 +80,41 @@ class SparseMatrixStream : public Consumer { SparseMatrixStream(size_t n){ this->n = n; disjointSet = new LazyDisjointIntegralSet(n); - // Note: this makes sure the self hits are always included, the value needs to be aligned with the measure used in consume! - for(uint32_t idiag=0; idiag>, vector>> getComponents(){ + pair>, vector>>> getComponents(){ vector> sets = disjointSet->getListOfSets(); vector>> split(sets.size()); - unordered_map indexToSetId; + vector indexToSetId(this->n, sets.size()); for(uint32_t iset = 0; iset < sets.size(); iset++){ split.push_back(vector>()); for(size_t index : sets[iset]){ - indexToSetId.emplace(index, iset); + indexToSetId[index] = iset; } } - for(Eigen::Triplet d : data){ - assert(indexToSetId[d.row()] == indexToSetId[d.row()]); - split[indexToSetId[d.row()]].emplace_back(d.row(), d.col(), d.value()); + for(Eigen::Triplet t : data){ + uint32_t iset = indexToSetId[t.row()]; + assert( iset == indexToSetId[t.col()]); + split[iset].emplace_back(t.row(), t.col(), t.value()); } vector> indices(sets.size()); - vector> components(sets.size()); + vector>> components(sets.size()); for(uint32_t iset = 0; iset < sets.size(); iset++){ - Eigen::SparseMatrix component(sets[iset].size(), sets[iset].size()); vector order(sets[iset].begin(), sets[iset].end()); map index_map; uint32_t iel = 0; - for(uint32_t el: order){ + for(uint32_t const & el: order){ index_map.emplace(el, iel++); } - vector> remapped(split[iset].size()); - for(Eigen::Triplet t : split[iset]){ + vector> remapped; + for(Eigen::Triplet const & t : split[iset]){ remapped.emplace_back(index_map[t.row()], index_map[t.col()], t.value()); } - component.setFromTriplets(remapped.begin(), remapped.end()); - components.emplace_back(move(component)); + remapped.shrink_to_fit(); + components.emplace_back(move(remapped)); indices.emplace_back(move(order)); } return make_pair(move(indices), move(components)); diff --git a/src/cluster/multi_step_cluster.cpp b/src/cluster/multi_step_cluster.cpp index 85f90abf..95e6ad43 100644 --- a/src/cluster/multi_step_cluster.cpp +++ b/src/cluster/multi_step_cluster.cpp @@ -21,6 +21,13 @@ along with this program. If not, see . using namespace std; namespace Workflow { namespace Cluster{ +string MultiStep::get_key() { + return "multi-step"; +} +string MultiStep::get_description(){ + return "A greedy stepwise vortex cover algorithm"; +} + vector MultiStep::rep_bitset(const vector ¢roid, const vector *superset) { vector r(centroid.size()); for (int c : centroid) @@ -54,7 +61,7 @@ vector MultiStep::cluster(DatabaseFile &db, const vector *filter) { return Util::Algo::greedy_vortex_cover(nb); } -void MultiStep::run_clustering() { +void MultiStep::run() { if (config.database == "") throw runtime_error("Missing parameter: database file (--db/-d)"); config.command = Config::makedb; diff --git a/src/cluster/multi_step_cluster.h b/src/cluster/multi_step_cluster.h index b4cc2258..5f5bae69 100644 --- a/src/cluster/multi_step_cluster.h +++ b/src/cluster/multi_step_cluster.h @@ -44,14 +44,10 @@ class MultiStep : public ClusteringAlgorithm { private: vector rep_bitset(const vector ¢roid, const vector *superset = nullptr); vector cluster(DatabaseFile &db, const vector *filter); - void run_clustering(); public: - void run(){ - run_clustering(); - } - static string get_key(){ - return "multi-step"; - } + void run(); + string get_key(); + string get_description(); }; struct Neighbors : public vector>, public Consumer { From 33217d6bfdeef09212bfbd888b9e14b0eefa92fe Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Tue, 5 May 2020 11:05:07 +0200 Subject: [PATCH 10/21] added inflation and expansion parameters it is now possible to control the inflation and expansion parameters from the command line --- src/basic/config.cpp | 5 +- src/basic/config.h | 3 ++ src/cluster/mcl.cpp | 110 +++++++++++++++++++++++++++++++++---------- src/cluster/mcl.h | 4 +- 4 files changed, 95 insertions(+), 27 deletions(-) diff --git a/src/basic/config.cpp b/src/basic/config.cpp index ac4b235e..e350f9d2 100644 --- a/src/basic/config.cpp +++ b/src/basic/config.cpp @@ -147,7 +147,10 @@ Config::Config(int argc, const char **argv, bool check_io) Options_group cluster("Cluster options"); cluster.add() - ("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo); + ("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo) + ("mcl-expansion", 0, "MCL expansion coefficient (default=2.0)", cluster_mcl_expansion, 2.0) + ("mcl-inflation", 0, "MCL inflation coefficient (default=2.0)", cluster_mcl_inflation, 2.0) + ("mcl-sparsity-switch", 0, "MCL switch to sparse matrix computation (default=0.8) ", cluster_mcl_sparsity_switch, 0.8); Options_group aligner("Aligner options"); aligner.add() diff --git a/src/basic/config.h b/src/basic/config.h index dfc631b4..647bcecf 100644 --- a/src/basic/config.h +++ b/src/basic/config.h @@ -223,6 +223,9 @@ struct Config int algo; string cluster_algo; + double cluster_mcl_inflation; + double cluster_mcl_expansion; + double cluster_mcl_sparsity_switch; enum { query_parallel = 0, target_parallel = 1 }; unsigned load_balancing; diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index e25bf696..4a567f9e 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -42,21 +42,22 @@ void MCL::get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* ou } void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ - if( r - (int) r == 0 ){ - // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues + // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues + if( r - (int) r == 0){ *out = *in; for(uint32_t i=1; i solver(*in); Eigen::MatrixXcf d = solver.eigenvalues().asDiagonal(); for(uint32_t idiag = 0; idiagnoalias() = (V * d.real() * V.inverse()).real(); } } @@ -76,13 +77,14 @@ void MCL::get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* } void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ + // Note that Eigen matrices are column-major, so this is the most efficient way for (uint32_t icol=0; icolcols(); ++icol){ float colSum = 0.0f; for (uint32_t irow=0; irowrows(); ++irow){ - colSum += pow((*in)(irow, icol) , r); + colSum += pow(in->coeffRef(irow, icol) , r); } for (uint32_t irow=0; irowrows(); ++irow){ - (*out)(irow, icol) = pow((*in)(irow, icol) , r)/colSum; + out->coeffRef(irow, icol) = pow(in->coeffRef(irow, icol) , r) / colSum; } } } @@ -101,7 +103,7 @@ vector> MCL::get_list(Eigen::MatrixXf* m){ LazyDisjointIntegralSet disjointSet(m->cols()); for (uint32_t icol=0; icolcols(); ++icol){ for (uint32_t irow=0; irowrows(); ++irow){ - if( abs((*m)(irow, icol)) > 1e-7 ){ + if( abs((*m)(irow, icol)) > std::numeric_limits::epsilon()){ disjointSet.merge(irow, icol); } } @@ -109,15 +111,51 @@ vector> MCL::get_list(Eigen::MatrixXf* m){ return disjointSet.getListOfSets(); } -vector> MCL::markov_process(Eigen::SparseMatrix* m){ +// void printMatrix(Eigen::SparseMatrix* m){ +// printf("["); +// for (uint32_t irow=0; irowrows(); ++irow){ +// printf("["); +// for (uint32_t icol=0; icolcols(); ++icol){ +// printf("%12.7f",m->coeffRef(irow, icol)); +// } +// if(irowrows() - 1){ +// printf("];\n"); +// } +// else{ +// printf("]"); +// } +// } +// printf("]\n\n"); +// } +// void printMatrix(Eigen::MatrixXf* m){ +// printf("["); +// for (uint32_t irow=0; irowrows(); ++irow){ +// printf("["); +// for (uint32_t icol=0; icolcols(); ++icol){ +// printf("%12.7f",m->coeffRef(irow, icol)); +// } +// if(irowrows() - 1){ +// printf("];\n"); +// } +// else{ +// printf("]"); +// } +// } +// printf("]\n\n"); +// } + +vector> MCL::markov_process(Eigen::SparseMatrix* m, float inflation, float expansion){ + for (uint32_t idiag=0; idiagcols(); ++idiag){ + assert(abs(m->coeffRef(idiag, idiag)) > std::numeric_limits::epsilon()); + } uint32_t iteration = 0; - double diff_norm = std::numeric_limits::max(); + float diff_norm = std::numeric_limits::max(); Eigen::SparseMatrix msquared(m->rows(), m->cols()); Eigen::SparseMatrix m_update(m->rows(), m->cols()); get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ - get_exp(m, &msquared, 2); - get_gamma(&msquared, &m_update, 2); + get_exp(m, &msquared, expansion); + get_gamma(&msquared, &m_update, inflation); *m -= m_update; diff_norm = m->norm(); *m = m_update; @@ -126,15 +164,19 @@ vector> MCL::markov_process(Eigen::SparseMatrix* return get_list(m); } -vector> MCL::markov_process(Eigen::MatrixXf* m){ +vector> MCL::markov_process(Eigen::MatrixXf* m, float inflation, float expansion){ + for (uint32_t idiag=0; idiagcols(); ++idiag){ + assert(abs(m->coeffRef(idiag, idiag)) > std::numeric_limits::epsilon()); + } uint32_t iteration = 0; - double diff_norm = std::numeric_limits::max(); + float diff_norm = std::numeric_limits::max(); Eigen::MatrixXf msquared(m->rows(), m->cols()); Eigen::MatrixXf m_update(m->rows(), m->cols()); get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ - get_exp(m, &msquared, 2); - get_gamma(&msquared, &m_update, 2); + get_exp(m, &msquared, expansion); + if(isnan(msquared.norm())) break; // converged if expansion is not integral + get_gamma(&msquared, &m_update, inflation); *m -= m_update; diff_norm = m->norm(); *m = m_update; @@ -230,9 +272,25 @@ void MCL::run(){ // data.emplace_back(11, 8, 0.20f); // data.emplace_back(11, 10, 0.20f); // data.emplace_back(11, 11, 0.333f); - // Eigen::SparseMatrix tmp(12,12); - // tmp.setFromTriplets(data.begin(), data.end()); - // auto cr = markov_process(&tmp); + // vector> cr; + // float inf = 2.0; + // float exp = 2.0; + // if(false){ + // Eigen::SparseMatrix tmp(12,12); + // tmp.setFromTriplets(data.begin(), data.end()); + // printMatrix(&tmp); + // cr = markov_process(&tmp, inf, exp); + // printMatrix(&tmp); + // } + // else{ + // Eigen::MatrixXf tmp = Eigen::MatrixXf::Zero(12,12); + // for(Eigen::Triplet const & t : data){ + // tmp(t.row(), t.col()) = t.value(); + // } + // printMatrix(&tmp); + // cr = markov_process(&tmp, inf, exp); + // printMatrix(&tmp); + // } // cout << "Found "< m = ms.getMatrix(); - for(unordered_set subset : markov_process(&m)){ + for(unordered_set subset : markov_process(&m, inflation, expansion)){ for(uint32_t el : subset){ clustering_result[el] = cluster_id; } @@ -295,7 +355,7 @@ void MCL::run(){ } } else { - // TODO: check if using dense matrices for non-sparse components improves performance + // TODO: check when/if using dense matrices for non-sparse components improves performance for(uint32_t iComponent = 0; iComponent order = indices[iComponent]; if(order.size() > 1){ @@ -306,19 +366,21 @@ void MCL::run(){ min_sparsity = min(min_sparsity, sparsity); // map back to original ids vector> list_of_sets; - if(sparsity >= 0.8){ //TODO: a size limit should be given as well + + //TODO: a size limit for the dense matrix should control this as well + if(sparsity >= config.cluster_mcl_sparsity_switch && expansion - (int) expansion == 0){ n_sparse_calculations++; Eigen::SparseMatrix m_sparse(order.size(), order.size()); m_sparse.setFromTriplets(m.begin(), m.end()); - list_of_sets = markov_process(&m_sparse); + list_of_sets = markov_process(&m_sparse, inflation, expansion); } else{ n_dense_calculations++; - Eigen::MatrixXf m_dense(order.size(), order.size()); + Eigen::MatrixXf m_dense = Eigen::MatrixXf::Zero(order.size(), order.size()); for(Eigen::Triplet const & t : m){ m_dense(t.row(), t.col()) = t.value(); } - list_of_sets = markov_process(&m_dense); + list_of_sets = markov_process(&m_dense, inflation, expansion); } for(unordered_set subset : list_of_sets){ for(uint32_t el : subset){ diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h index 511433d6..60f06c2e 100644 --- a/src/cluster/mcl.h +++ b/src/cluster/mcl.h @@ -50,8 +50,8 @@ class MCL: public ClusteringAlgorithm { void get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r); vector> get_list(Eigen::SparseMatrix* m); vector> get_list(Eigen::MatrixXf* m); - vector> markov_process(Eigen::SparseMatrix* m); - vector> markov_process(Eigen::MatrixXf* m); + vector> markov_process(Eigen::SparseMatrix* m, float inflation, float expansion); + vector> markov_process(Eigen::MatrixXf* m, float inflation, float expansion); public: void run(); string get_key(); From 00756f13f6897c9b910b95b0336ebfe4fa19c213 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Thu, 7 May 2020 12:49:30 +0200 Subject: [PATCH 11/21] switching to binary format --- src/cluster/mcl.cpp | 35 +++++++++++++++++++++++++++++------ src/cluster/mcl.h | 24 ++++++++++++++---------- src/output/output_format.cpp | 19 ++++++++++++++++++- src/output/output_format.h | 15 +++++++++++++++ 4 files changed, 76 insertions(+), 17 deletions(-) diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index 4a567f9e..0f334ad1 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -28,6 +28,7 @@ string MCL::get_description() { } void MCL::get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r){ + std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); if( r - (int) r == 0 ){ // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues *out = *in; @@ -39,29 +40,38 @@ void MCL::get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* ou throw runtime_error(" Eigen does not provide an eigenvalue solver for sparse matrices"); } *out = out->pruned(); + sparse_exp_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; } void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues - if( r - (int) r == 0){ + std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); + if(r - (int) r == 0){ *out = *in; for(uint32_t i=1; i(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; } else{ // TODO: check whether the matrix is self-adjoint and use SelfAdjointEigenSolver instead + // TODO: try and get out->noalias() = in->pow(r); to work (unsupported! http://eigen.tuxfamily.org/dox/unsupported/group__MatrixFunctions__Module.html). Eigen::EigenSolver solver(*in); Eigen::MatrixXcf d = solver.eigenvalues().asDiagonal(); for(uint32_t idiag = 0; idiagnoalias() = (V * d.real() * V.inverse()).real(); + double thr = 0.5 * abs(V.determinant()); + if( thr > std::numeric_limits::epsilon() ){ + out->noalias() = (V * d.real() * V.inverse()).real(); + } + dense_gen_exp_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; } } void MCL::get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r){ + std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); vector> data; for (uint32_t k=0; kouterSize(); ++k){ float colSum = 0.0f; @@ -74,9 +84,11 @@ void MCL::get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* } out->setFromTriplets(data.begin(), data.end(), [] (const float&, const float &b) { return b; }); *out = out->pruned(1.0, std::numeric_limits::epsilon()); + sparse_gamma_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; } void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ + std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); // Note that Eigen matrices are column-major, so this is the most efficient way for (uint32_t icol=0; icolcols(); ++icol){ float colSum = 0.0f; @@ -87,19 +99,23 @@ void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ out->coeffRef(irow, icol) = pow(in->coeffRef(irow, icol) , r) / colSum; } } + dense_gamma_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; } vector> MCL::get_list(Eigen::SparseMatrix* m){ + std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); LazyDisjointIntegralSet disjointSet(m->cols()); for (uint32_t k=0; kouterSize(); ++k){ for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ disjointSet.merge(it.row(), it.col()); } } + sparse_list_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; return disjointSet.getListOfSets(); } vector> MCL::get_list(Eigen::MatrixXf* m){ + std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); LazyDisjointIntegralSet disjointSet(m->cols()); for (uint32_t icol=0; icolcols(); ++icol){ for (uint32_t irow=0; irowrows(); ++irow){ @@ -108,6 +124,7 @@ vector> MCL::get_list(Eigen::MatrixXf* m){ } } } + dense_list_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; return disjointSet.getListOfSets(); } @@ -173,9 +190,8 @@ vector> MCL::markov_process(Eigen::MatrixXf* m, float in Eigen::MatrixXf msquared(m->rows(), m->cols()); Eigen::MatrixXf m_update(m->rows(), m->cols()); get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable - while( iteration < 100 && diff_norm > 1e-6*m->rows() ){ + while( iteration < 100 && diff_norm > std::numeric_limits::epsilon()*m->rows() ){ get_exp(m, &msquared, expansion); - if(isnan(msquared.norm())) break; // converged if expansion is not integral get_gamma(&msquared, &m_update, inflation); *m -= m_update; diff_norm = m->norm(); @@ -305,7 +321,7 @@ void MCL::run(){ statistics.reset(); config.command = Config::blastp; config.no_self_hits = false; - config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore", "pident" }; + config.output_format = { "clus" }; config.query_cover = 80; config.subject_cover = 80; config.algo = 0; @@ -368,10 +384,12 @@ void MCL::run(){ vector> list_of_sets; //TODO: a size limit for the dense matrix should control this as well + std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); if(sparsity >= config.cluster_mcl_sparsity_switch && expansion - (int) expansion == 0){ n_sparse_calculations++; Eigen::SparseMatrix m_sparse(order.size(), order.size()); m_sparse.setFromTriplets(m.begin(), m.end()); + sparse_create_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; list_of_sets = markov_process(&m_sparse, inflation, expansion); } else{ @@ -380,6 +398,7 @@ void MCL::run(){ for(Eigen::Triplet const & t : m){ m_dense(t.row(), t.col()) = t.value(); } + dense_create_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; list_of_sets = markov_process(&m_dense, inflation, expansion); } for(unordered_set subset : list_of_sets){ @@ -399,6 +418,10 @@ void MCL::run(){ timer.finish(); cout << "Found "< seq; diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h index 60f06c2e..6bb40b3d 100644 --- a/src/cluster/mcl.h +++ b/src/cluster/mcl.h @@ -52,6 +52,10 @@ class MCL: public ClusteringAlgorithm { vector> get_list(Eigen::MatrixXf* m); vector> markov_process(Eigen::SparseMatrix* m, float inflation, float expansion); vector> markov_process(Eigen::MatrixXf* m, float inflation, float expansion); + double sparse_create_time, dense_create_time = 0.0; + double sparse_exp_time, dense_int_exp_time, dense_gen_exp_time = 0.0; + double sparse_gamma_time, dense_gamma_time = 0.0; + double sparse_list_time, dense_list_time = 0.0; public: void run(); string get_key(); @@ -62,36 +66,36 @@ template class SparseMatrixStream : public Consumer { size_t n; vector> data; - LazyDisjointSet* disjointSet; + LazyDisjointSet* disjointSet; virtual void consume(const char *ptr, size_t n) override { - size_t query, subject, count; - float qcov, scov, bitscore, id; const char *end = ptr + n; + uint32_t query = *(uint32_t*) ptr; + ptr += sizeof(uint32_t); while (ptr < end) { - if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) - throw runtime_error("Cluster format error."); - const float value = (qcov/100.0f) * (scov/100.0f) * (id/100.0f); + const uint32_t subject = *(uint32_t*) ptr; + ptr += sizeof(uint32_t); + const double value = *(double*) ptr; + ptr += sizeof(double); data.emplace_back(query, subject, value); disjointSet->merge(query, subject); - ptr += count; } } public: SparseMatrixStream(size_t n){ this->n = n; - disjointSet = new LazyDisjointIntegralSet(n); + disjointSet = new LazyDisjointIntegralSet(n); } ~SparseMatrixStream(){ delete disjointSet; } pair>, vector>>> getComponents(){ - vector> sets = disjointSet->getListOfSets(); + vector> sets = disjointSet->getListOfSets(); vector>> split(sets.size()); vector indexToSetId(this->n, sets.size()); for(uint32_t iset = 0; iset < sets.size(); iset++){ split.push_back(vector>()); - for(size_t index : sets[iset]){ + for(uint32_t index : sets[iset]){ indexToSetId[index] = iset; } } diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp index 7b8f8602..f34f4be2 100644 --- a/src/output/output_format.cpp +++ b/src/output/output_format.cpp @@ -91,6 +91,8 @@ Output_format* get_output_format() return new PAF_format; else if (f[0] == "bin1") return new Bin1_format; + else if (f[0] == "clus") + return new Clustering_format; else throw std::runtime_error("Invalid output format. Allowed values: 0,5,6,100,101,102"); } @@ -131,4 +133,19 @@ void Bin1_format::print_match(const Hsp_context& r, const Metadata &metadata, Te out.write((uint32_t)r.subject_id); out.write(r.bit_score() / std::max((unsigned)r.query.source().length(), r.subject_len)); } -} \ No newline at end of file +} + +void Clustering_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const { + out.write((uint32_t)query_num); +} + +void Clustering_format::print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) { + if (r.query_id <= r.subject_id) { + out.write((uint32_t)r.subject_id); + // TODO: enable other clustering measures + double m = (double) r.query_source_range().length()*1.0 / r.query.source().length(); + m *= (double) r.subject_range().length() * 1.0 / r.subject_len; + m *= (double)r.identities() * 1.0 / r.length(); + out.write(m); + } +} diff --git a/src/output/output_format.h b/src/output/output_format.h index 126a0997..a9291bb7 100644 --- a/src/output/output_format.h +++ b/src/output/output_format.h @@ -210,6 +210,21 @@ struct Bin1_format : public Output_format } }; +struct Clustering_format : public Output_format +{ + Clustering_format(): + Output_format(bin1) + {} + virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const override; + virtual void print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) override; + virtual ~Clustering_format() + { } + virtual Output_format* clone() const override + { + return new Clustering_format(*this); + } +}; + Output_format* get_output_format(); void init_output(bool have_taxon_id_lists, bool have_taxon_nodes, bool have_taxon_scientific_names); void print_hsp(Hsp &hsp, const TranslatedSequence &query); From 027aeeebcb8594d9eb3b644cb03ab5333deeba29 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Tue, 19 May 2020 14:07:15 +0200 Subject: [PATCH 12/21] CR comment: enabling other similarity measures --- CMakeLists.txt | 2 ++ src/basic/config.cpp | 3 ++- src/basic/config.h | 3 ++- src/cluster/cluster.h | 2 +- src/cluster/cluster_registry.cpp | 2 +- src/cluster/cluster_registry.h | 2 +- src/cluster/disjoint_set.h | 18 ++++++++++++++++++ src/cluster/mcl.cpp | 16 +++++++++++----- src/cluster/mcl.h | 2 +- src/output/output_format.cpp | 16 +--------------- src/output/output_format.h | 6 +++--- 11 files changed, 43 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7887ee0..d91db285 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,6 +122,8 @@ set(OBJECTS src/run/tools.cpp src/chaining/greedy_align.cpp src/output/output_format.cpp + src/output/clustering_variables.cpp + src/output/clustering_format.cpp src/output/join_blocks.cpp src/data/frequent_seeds.cpp src/align/legacy/query_mapper.cpp diff --git a/src/basic/config.cpp b/src/basic/config.cpp index e350f9d2..f693ef2f 100644 --- a/src/basic/config.cpp +++ b/src/basic/config.cpp @@ -148,7 +148,8 @@ Config::Config(int argc, const char **argv, bool check_io) Options_group cluster("Cluster options"); cluster.add() ("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo) - ("mcl-expansion", 0, "MCL expansion coefficient (default=2.0)", cluster_mcl_expansion, 2.0) + ("cluster-similarity", 0, "Clustering similarity measure", cluster_similarity) + ("mcl-expansion", 0, "MCL expansion coefficient (default=2)", cluster_mcl_expansion, (uint32_t) 2) ("mcl-inflation", 0, "MCL inflation coefficient (default=2.0)", cluster_mcl_inflation, 2.0) ("mcl-sparsity-switch", 0, "MCL switch to sparse matrix computation (default=0.8) ", cluster_mcl_sparsity_switch, 0.8); diff --git a/src/basic/config.h b/src/basic/config.h index 647bcecf..c086252b 100644 --- a/src/basic/config.h +++ b/src/basic/config.h @@ -224,8 +224,9 @@ struct Config string cluster_algo; double cluster_mcl_inflation; - double cluster_mcl_expansion; + uint32_t cluster_mcl_expansion; double cluster_mcl_sparsity_switch; + string cluster_similarity; enum { query_parallel = 0, target_parallel = 1 }; unsigned load_balancing; diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h index 864cd79b..62323f94 100644 --- a/src/cluster/cluster.h +++ b/src/cluster/cluster.h @@ -1,6 +1,6 @@ /**** DIAMOND protein aligner -Copyright (C) 2013-2018 Benjamin Buchfink +Copyright (C) 2020 Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/cluster/cluster_registry.cpp b/src/cluster/cluster_registry.cpp index b3345f5c..4c59d2bd 100644 --- a/src/cluster/cluster_registry.cpp +++ b/src/cluster/cluster_registry.cpp @@ -1,6 +1,6 @@ /**** DIAMOND protein aligner -Copyright (C) 2013-2018 Benjamin Buchfink +Copyright (C) 2020 Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h index d6c872da..a2aaa2f3 100644 --- a/src/cluster/cluster_registry.h +++ b/src/cluster/cluster_registry.h @@ -1,6 +1,6 @@ /**** DIAMOND protein aligner -Copyright (C) 2013-2018 Benjamin Buchfink +Copyright (C) 2020 Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/cluster/disjoint_set.h b/src/cluster/disjoint_set.h index 4b7acc7b..df5e5f96 100644 --- a/src/cluster/disjoint_set.h +++ b/src/cluster/disjoint_set.h @@ -1,3 +1,21 @@ +/**** +DIAMOND protein aligner - Markov Clustering Module +Copyright (C) 2019 Patrick Ettenhuber + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + #include #include #include diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index 0f334ad1..225a8368 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -1,6 +1,6 @@ /**** DIAMOND protein aligner -Copyright (C) 2013-2018 Benjamin Buchfink +Copyright (C) 2020 Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -321,13 +321,14 @@ void MCL::run(){ statistics.reset(); config.command = Config::blastp; config.no_self_hits = false; - config.output_format = { "clus" }; - config.query_cover = 80; - config.subject_cover = 80; + string format = config.cluster_similarity; + if(format.empty()){ + format = "qcovhsp/100*scovhsp/100*pident/100"; + } + config.output_format = {"clus", format}; config.algo = 0; config.index_mode = 0; config.freq_sd = 0; - config.max_alignments = numeric_limits::max(); Workflow::Search::Options opt; opt.db = &(*db); @@ -372,6 +373,11 @@ void MCL::run(){ } else { // TODO: check when/if using dense matrices for non-sparse components improves performance + // vector threads; + // for (size_t i = 0; i < config.threads_; ++i) + // threads.emplace_back(seed_join_worker, query_idx, ref_idx, &seedp, &range, query_seed_hits, ref_seed_hits); + // for (auto &t : threads) + // t.join(); for(uint32_t iComponent = 0; iComponent order = indices[iComponent]; if(order.size() > 1){ diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h index 6bb40b3d..8901b544 100644 --- a/src/cluster/mcl.h +++ b/src/cluster/mcl.h @@ -1,6 +1,6 @@ /**** DIAMOND protein aligner -Copyright (C) 2013-2018 Benjamin Buchfink +Copyright (C) 2020 Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp index f34f4be2..8da70715 100644 --- a/src/output/output_format.cpp +++ b/src/output/output_format.cpp @@ -92,7 +92,7 @@ Output_format* get_output_format() else if (f[0] == "bin1") return new Bin1_format; else if (f[0] == "clus") - return new Clustering_format; + return new Clustering_format(&f[1]); else throw std::runtime_error("Invalid output format. Allowed values: 0,5,6,100,101,102"); } @@ -135,17 +135,3 @@ void Bin1_format::print_match(const Hsp_context& r, const Metadata &metadata, Te } } -void Clustering_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const { - out.write((uint32_t)query_num); -} - -void Clustering_format::print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) { - if (r.query_id <= r.subject_id) { - out.write((uint32_t)r.subject_id); - // TODO: enable other clustering measures - double m = (double) r.query_source_range().length()*1.0 / r.query.source().length(); - m *= (double) r.subject_range().length() * 1.0 / r.subject_len; - m *= (double)r.identities() * 1.0 / r.length(); - out.write(m); - } -} diff --git a/src/output/output_format.h b/src/output/output_format.h index a9291bb7..52d60013 100644 --- a/src/output/output_format.h +++ b/src/output/output_format.h @@ -212,9 +212,9 @@ struct Bin1_format : public Output_format struct Clustering_format : public Output_format { - Clustering_format(): - Output_format(bin1) - {} + const string* const format; + Clustering_format(const string* const format): Output_format(bin1), format(format) { + } virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const override; virtual void print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) override; virtual ~Clustering_format() From a848031a6f3cd63c91f9f709015e234c444309c9 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Tue, 19 May 2020 19:18:42 +0200 Subject: [PATCH 13/21] adding forgotten files --- src/cluster/mcl.cpp | 6 +- src/output/clustering_format.cpp | 163 +++++++++++++++ src/output/clustering_variables.cpp | 44 ++++ src/output/clustering_variables.h | 309 ++++++++++++++++++++++++++++ 4 files changed, 521 insertions(+), 1 deletion(-) create mode 100644 src/output/clustering_format.cpp create mode 100644 src/output/clustering_variables.cpp create mode 100644 src/output/clustering_variables.h diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index 225a8368..cfe346d6 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -343,6 +343,10 @@ void MCL::run(){ auto componentInfo = ms.getComponents(); vector> indices = get<0>(componentInfo); vector>> components = get<1>(componentInfo); + std::vector sort_order(components.size()); + std::iota(sort_order.begin(), sort_order.end(), 0); + std::sort(sort_order.begin(), sort_order.end(), [&](uint32_t i, uint32_t j){return indices[i].size() > indices[j].size();}); + uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 0;}); uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 1;}); timer.finish(); @@ -378,7 +382,7 @@ void MCL::run(){ // threads.emplace_back(seed_join_worker, query_idx, ref_idx, &seedp, &range, query_seed_hits, ref_seed_hits); // for (auto &t : threads) // t.join(); - for(uint32_t iComponent = 0; iComponent order = indices[iComponent]; if(order.size() > 1){ vector> m = components[iComponent]; diff --git a/src/output/clustering_format.cpp b/src/output/clustering_format.cpp new file mode 100644 index 00000000..826c16c2 --- /dev/null +++ b/src/output/clustering_format.cpp @@ -0,0 +1,163 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2020 Patrick Ettenhuber + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + +#include "output_format.h" +#include "clustering_variables.h" + + +class RecursiveParser{ + const Hsp_context& r; + const char * expression_to_parse; + char peek() + { + return *expression_to_parse; + } + char peek(uint32_t ahead) + { + return *(expression_to_parse+ahead); + } + char get() + { + return *expression_to_parse++; + } + void advance(uint32_t ahead) + { + expression_to_parse+=ahead; + } + std::string variable() + { + vector v; + while ( (peek() >= 'a' && peek() <= 'z') || (peek() >= 'A' && peek() <= 'Z') || peek() == '_') + { + v.push_back(get()); + } + return std::string(v.begin(), v.end()); + } + uint32_t integer() + { + uint32_t result = get() - '0'; + while (peek() >= '0' && peek() <= '9') + { + result = 10*result + get() - '0'; + } + return result; + } + double number() + { + double result = integer(); + if(peek() == '.'){ + advance(1); // '.' + const char *pos = expression_to_parse; + double decimals = integer() * 1.0 / (expression_to_parse - pos); + result += decimals; + } + return result; + } + + double factor() + { + if (peek() >= '0' && peek() <= '9') { + return number(); + } + else if (peek() == '(') { + advance(1); // '(' + double result = expression(); + advance(1); // ')' + return result; + } + else if (peek() == '-') { + advance(1); + return -factor(); + } + else if (peek() == 'm' && peek(1)=='a' && peek(2)=='x' && peek(3)=='('){ + advance(4); // 'max(' + double result1 = expression(); + advance(1); // ',' + double result2 = expression(); + advance(1); // ')' + return std::max(result1, result2); + } + else if (peek() == 'm' && peek(1)=='i' && peek(2)=='n' && peek(3)=='('){ + advance(4); // 'min(' + double result1 = expression(); + advance(1); // ',' + double result2 = expression(); + advance(1); // ')' + return std::min(result1, result2); + } + else if (peek() == 'e' && peek(1)=='x' && peek(2)=='p' && peek(3)=='('){ + advance(4); // 'exp(' + double result1 = expression(); + advance(1); // ')' + return std::exp(result1); + } + else { + return VariableRegistry::get(variable())->get(r); + } + } + + double term() + { + double result = factor(); + while (peek() == '*' || peek() == '/'){ + if (get() == '*'){ + result *= factor(); + } + else{ + result /= factor(); + } + } + return result; + } + + double expression() + { + double result = term(); + while (peek() == '+' || peek() == '-'){ + if (get() == '+'){ + result += term(); + } + else{ + result -= term(); + } + } + return result; + } + +public: + static const vector variables; + RecursiveParser(const Hsp_context& r, const char * c): r(r), expression_to_parse(c){} + double evaluate(){ + return expression(); + } +}; + + +void Clustering_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const +{ + out.write((uint32_t)query_num); +} + +void Clustering_format::print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) +{ + if (r.query_id <= r.subject_id) { + out.write((uint32_t)r.subject_id); + RecursiveParser rp(r, format->c_str()); + out.write(rp.evaluate()); + } +} diff --git a/src/output/clustering_variables.cpp b/src/output/clustering_variables.cpp new file mode 100644 index 00000000..6011d905 --- /dev/null +++ b/src/output/clustering_variables.cpp @@ -0,0 +1,44 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2020 Patrick Ettenhuber + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + +#include "clustering_variables.h" + +std::map VariableRegistry::regMap; +QueryStart VariableRegistry::queryStart; +QueryEnd VariableRegistry::queryEnd; +SubjectStart VariableRegistry::subjectStart; +SubjectEnd VariableRegistry::subjectEnd; +EValue VariableRegistry::eValue; +BitScore VariableRegistry::bitScore; +Score VariableRegistry::score; +Length VariableRegistry::length; +PercentIdenticalMatches VariableRegistry::percentIdenticalMatches; +NumberIdenticalMatches VariableRegistry::numberIdenticalMatches; +NumberMismatches VariableRegistry::numberMismatches; +NumberPositiveMatches VariableRegistry::numberPositiveMatches; +NumberGapOpenings VariableRegistry::numberGapOpenings; +NumberGaps VariableRegistry::numberGaps; +PercentagePositiveMatches VariableRegistry::percentagePositiveMatches; +QueryFrame VariableRegistry::queryFrame; +QueryCoveragePerHsp VariableRegistry::queryCoveragePerHsp; +SwDiff VariableRegistry::swdiff; +Time VariableRegistry::time; +SubjectCoveragePerHsp VariableRegistry::subjectCoveragePerHsp; +UngappedScore VariableRegistry::ungappedScore; +VariableRegistry::StaticConstructor VariableRegistry::_staticConstructor; + diff --git a/src/output/clustering_variables.h b/src/output/clustering_variables.h new file mode 100644 index 00000000..f29f06bc --- /dev/null +++ b/src/output/clustering_variables.h @@ -0,0 +1,309 @@ +/**** +DIAMOND protein aligner +Copyright (C) 2020 Patrick Ettenhuber + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +****/ + +#include +#include +#include "../basic/score_matrix.h" +#include "../basic/match.h" + +class Variable{ +public: + virtual std::string get_name() = 0; + virtual double get(const Hsp_context& r) = 0; +}; + +class QueryLength: public Variable{ +public: + std::string get_name(){ + return "qlen"; + } + double get(const Hsp_context& r){ + return r.query.source().length(); + } +}; +class SubjectLength: public Variable{ +public: + std::string get_name(){ + return "slen"; + } + double get(const Hsp_context& r){ + return r.subject_len; + } +}; +class QueryStart: public Variable{ +public: + std::string get_name(){ + return "qstart"; + } + double get(const Hsp_context& r){ + return r.oriented_query_range().begin_ + 1; + } +}; +class QueryEnd: public Variable{ +public: + std::string get_name(){ + return "qend"; + } + double get(const Hsp_context& r){ + return r.oriented_query_range().end_ + 1; + } +}; +class SubjectStart: public Variable{ +public: + std::string get_name(){ + return "sstart"; + } + double get(const Hsp_context& r){ + return r.subject_range().begin_ + 1; + } +}; +class SubjectEnd: public Variable{ +public: + std::string get_name(){ + return "send"; + } + double get(const Hsp_context& r){ + return r.subject_range().end_; + } +}; +class EValue: public Variable{ +public: + std::string get_name(){ + return "evalue"; + } + double get(const Hsp_context& r){ + return r.evalue(); + } +}; +class BitScore: public Variable{ +public: + std::string get_name(){ + return "bitscore"; + } + double get(const Hsp_context& r){ + return r.bit_score(); + } +}; +class Score: public Variable{ +public: + std::string get_name(){ + return "score"; + } + double get(const Hsp_context& r){ + return r.score(); + } +}; +class Length: public Variable{ +public: + std::string get_name(){ + return "length"; + } + double get(const Hsp_context& r){ + return r.length(); + } +}; +class PercentIdenticalMatches: public Variable{ +public: + std::string get_name(){ + return "pident"; + } + double get(const Hsp_context& r){ + return (double)r.identities() * 100 / r.length(); + } +}; +class NumberIdenticalMatches: public Variable{ +public: + std::string get_name(){ + return "nident"; + } + double get(const Hsp_context& r){ + return r.identities(); + } +}; +class NumberMismatches: public Variable{ +public: + std::string get_name(){ + return "mismatch"; + } + double get(const Hsp_context& r){ + return r.mismatches(); + } +}; +class NumberPositiveMatches: public Variable{ +public: + std::string get_name(){ + return "positive"; + } + double get(const Hsp_context& r){ + return r.positives(); + } +}; +class NumberGapOpenings: public Variable{ +public: + std::string get_name(){ + return "gapopen"; + } + double get(const Hsp_context& r){ + return r.gap_openings(); + } +}; +class NumberGaps: public Variable{ +public: + std::string get_name(){ + return "gaps"; + } + double get(const Hsp_context& r){ + return r.gaps(); + } +}; +class PercentagePositiveMatches: public Variable{ +public: + std::string get_name(){ + return "ppos"; + } + double get(const Hsp_context& r){ + return (double)r.positives() * 100.0 / r.length(); + } +}; +class QueryFrame: public Variable{ +public: + std::string get_name(){ + return "qframe"; + } + double get(const Hsp_context& r){ + return r.blast_query_frame(); + } +}; +class QueryCoveragePerHsp: public Variable{ +public: + std::string get_name(){ + return "qcovhsp"; + } + double get(const Hsp_context& r){ + return (double)r.query_source_range().length()*100.0 / r.query.source().length(); + } +}; +class SwDiff: public Variable{ +public: + std::string get_name(){ + return "swdiff"; + } + double get(const Hsp_context& r){ + return r.sw_score() - r.bit_score(); + } +}; +class Time: public Variable{ +public: + std::string get_name(){ + return "time"; + } + double get(const Hsp_context& r){ + return r.time(); + } +}; +class SubjectCoveragePerHsp: public Variable{ +public: + std::string get_name(){ + return "scovhsp"; + } + double get(const Hsp_context& r){ + return (double)r.subject_range().length() * 100.0 / r.subject_len; + } +}; +class UngappedScore: public Variable{ +public: + std::string get_name(){ + return "ungapped_score"; + } + double get(const Hsp_context& r){ + return score_matrix.bitscore(r.ungapped_score); + } +}; + +class VariableRegistry{ +private: + VariableRegistry(){}; + // To include new clustering algorithms add the instantiation here and in the cluster_registry.cpp file. Then add it to the StaticConstructor below + static QueryStart queryStart; + static QueryEnd queryEnd; + static SubjectStart subjectStart; + static SubjectEnd subjectEnd; + static EValue eValue; + static BitScore bitScore; + static Score score; + static Length length; + static PercentIdenticalMatches percentIdenticalMatches; + static NumberIdenticalMatches numberIdenticalMatches; + static NumberMismatches numberMismatches; + static NumberPositiveMatches numberPositiveMatches; + static NumberGapOpenings numberGapOpenings; + static NumberGaps numberGaps; + static PercentagePositiveMatches percentagePositiveMatches; + static QueryFrame queryFrame; + static QueryCoveragePerHsp queryCoveragePerHsp; + static SwDiff swdiff; + static Time time; + static SubjectCoveragePerHsp subjectCoveragePerHsp; + static UngappedScore ungappedScore; +public: + static std::map regMap; + static Variable* get(std::string key){ + std::map::iterator ca = VariableRegistry::regMap.find(key); + if(ca == VariableRegistry::regMap.end()){ + throw std::runtime_error(std::string("Variable not found:")+ key); + } + return ca->second; + } + static bool has(std::string key){ + return VariableRegistry::regMap.find(key) != VariableRegistry::regMap.end(); + } + static vector getKeys(){ + auto it = regMap.begin(); + vector keys; + while(it != regMap.end()){ + keys.push_back(it->first); + it++; + } + return keys; + } + static struct StaticConstructor { + StaticConstructor() { + regMap.emplace(queryStart.get_name(), &queryStart); + regMap.emplace(queryEnd.get_name(), &queryEnd); + regMap.emplace(subjectStart.get_name(), &subjectStart); + regMap.emplace(subjectEnd.get_name(), &subjectEnd); + regMap.emplace(eValue.get_name(), &eValue); + regMap.emplace(bitScore.get_name(), &bitScore); + regMap.emplace(score.get_name(), &score); + regMap.emplace(length.get_name(), &length); + regMap.emplace(percentIdenticalMatches.get_name(), &percentIdenticalMatches); + regMap.emplace(numberIdenticalMatches.get_name(), &numberIdenticalMatches); + regMap.emplace(numberMismatches.get_name(), &numberMismatches); + regMap.emplace(numberPositiveMatches.get_name(), &numberPositiveMatches); + regMap.emplace(numberGapOpenings.get_name(), &numberGapOpenings); + regMap.emplace(numberGaps.get_name(), &numberGaps); + regMap.emplace(percentagePositiveMatches.get_name(), &percentagePositiveMatches); + regMap.emplace(queryFrame.get_name(), &queryFrame); + regMap.emplace(queryCoveragePerHsp.get_name(), &queryCoveragePerHsp); + regMap.emplace(swdiff.get_name(), &swdiff); + regMap.emplace(time.get_name(), &time); + regMap.emplace(subjectCoveragePerHsp.get_name(), &subjectCoveragePerHsp); + regMap.emplace(ungappedScore.get_name(), &ungappedScore); + } + } _staticConstructor; +}; From 5f2ab15b995cf76e6adba5eee0b4f40d7aa4b72c Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Fri, 22 May 2020 07:46:46 +0200 Subject: [PATCH 14/21] update copyright --- src/cluster/cluster.h | 3 ++- src/cluster/cluster_registry.cpp | 3 ++- src/cluster/cluster_registry.h | 3 ++- src/cluster/disjoint_set.h | 3 ++- src/cluster/mcl.cpp | 3 ++- src/cluster/mcl.h | 3 ++- src/output/clustering_format.cpp | 3 ++- src/output/clustering_variables.cpp | 3 ++- src/output/clustering_variables.h | 3 ++- 9 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h index 62323f94..48e3021a 100644 --- a/src/cluster/cluster.h +++ b/src/cluster/cluster.h @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/cluster/cluster_registry.cpp b/src/cluster/cluster_registry.cpp index 4c59d2bd..ce28ec1d 100644 --- a/src/cluster/cluster_registry.cpp +++ b/src/cluster/cluster_registry.cpp @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h index a2aaa2f3..696116cf 100644 --- a/src/cluster/cluster_registry.h +++ b/src/cluster/cluster_registry.h @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/cluster/disjoint_set.h b/src/cluster/disjoint_set.h index df5e5f96..1ac9ae5f 100644 --- a/src/cluster/disjoint_set.h +++ b/src/cluster/disjoint_set.h @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner - Markov Clustering Module -Copyright (C) 2019 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index cfe346d6..b47c3721 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h index 8901b544..7b11ebc5 100644 --- a/src/cluster/mcl.h +++ b/src/cluster/mcl.h @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/output/clustering_format.cpp b/src/output/clustering_format.cpp index 826c16c2..1f68efab 100644 --- a/src/output/clustering_format.cpp +++ b/src/output/clustering_format.cpp @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/output/clustering_variables.cpp b/src/output/clustering_variables.cpp index 6011d905..048aada2 100644 --- a/src/output/clustering_variables.cpp +++ b/src/output/clustering_variables.cpp @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/output/clustering_variables.h b/src/output/clustering_variables.h index f29f06bc..2410cb09 100644 --- a/src/output/clustering_variables.h +++ b/src/output/clustering_variables.h @@ -1,6 +1,7 @@ /**** DIAMOND protein aligner -Copyright (C) 2020 Patrick Ettenhuber +Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark) +Code developed by Patrick Ettenhuber This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by From 9a77bf212ee3aa0d34bef709315d55cd5efb3138 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sun, 24 May 2020 21:33:42 +0200 Subject: [PATCH 15/21] added multithreading --- src/cluster/mcl.cpp | 180 ++++++++++++++++++++++++++++---------------- src/cluster/mcl.h | 14 +++- 2 files changed, 125 insertions(+), 69 deletions(-) diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index b47c3721..0224a2d7 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -21,6 +21,7 @@ along with this program. If not, see . using namespace std; namespace Workflow { namespace Cluster{ + string MCL::get_key() { return "mcl"; } @@ -29,7 +30,7 @@ string MCL::get_description() { } void MCL::get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r){ - std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); if( r - (int) r == 0 ){ // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues *out = *in; @@ -41,18 +42,18 @@ void MCL::get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* ou throw runtime_error(" Eigen does not provide an eigenvalue solver for sparse matrices"); } *out = out->pruned(); - sparse_exp_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + sparse_exp_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); } void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues - std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); if(r - (int) r == 0){ *out = *in; for(uint32_t i=1; i(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + dense_int_exp_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); } else{ // TODO: check whether the matrix is self-adjoint and use SelfAdjointEigenSolver instead @@ -64,15 +65,15 @@ void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ } Eigen::MatrixXcf V = solver.eigenvectors(); double thr = 0.5 * abs(V.determinant()); - if( thr > std::numeric_limits::epsilon() ){ + if( thr > numeric_limits::epsilon() ){ out->noalias() = (V * d.real() * V.inverse()).real(); } - dense_gen_exp_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + dense_gen_exp_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); } } void MCL::get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r){ - std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); vector> data; for (uint32_t k=0; kouterSize(); ++k){ float colSum = 0.0f; @@ -84,12 +85,12 @@ void MCL::get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* } } out->setFromTriplets(data.begin(), data.end(), [] (const float&, const float &b) { return b; }); - *out = out->pruned(1.0, std::numeric_limits::epsilon()); - sparse_gamma_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + *out = out->pruned(1.0, numeric_limits::epsilon()); + sparse_gamma_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); } void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ - std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); // Note that Eigen matrices are column-major, so this is the most efficient way for (uint32_t icol=0; icolcols(); ++icol){ float colSum = 0.0f; @@ -100,32 +101,32 @@ void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ out->coeffRef(irow, icol) = pow(in->coeffRef(irow, icol) , r) / colSum; } } - dense_gamma_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + dense_gamma_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); } vector> MCL::get_list(Eigen::SparseMatrix* m){ - std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); LazyDisjointIntegralSet disjointSet(m->cols()); for (uint32_t k=0; kouterSize(); ++k){ for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ disjointSet.merge(it.row(), it.col()); } } - sparse_list_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + sparse_list_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); return disjointSet.getListOfSets(); } vector> MCL::get_list(Eigen::MatrixXf* m){ - std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); LazyDisjointIntegralSet disjointSet(m->cols()); for (uint32_t icol=0; icolcols(); ++icol){ for (uint32_t irow=0; irowrows(); ++irow){ - if( abs((*m)(irow, icol)) > std::numeric_limits::epsilon()){ + if( abs((*m)(irow, icol)) > numeric_limits::epsilon()){ disjointSet.merge(irow, icol); } } } - dense_list_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + dense_list_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); return disjointSet.getListOfSets(); } @@ -164,10 +165,10 @@ vector> MCL::get_list(Eigen::MatrixXf* m){ vector> MCL::markov_process(Eigen::SparseMatrix* m, float inflation, float expansion){ for (uint32_t idiag=0; idiagcols(); ++idiag){ - assert(abs(m->coeffRef(idiag, idiag)) > std::numeric_limits::epsilon()); + assert(abs(m->coeffRef(idiag, idiag)) > numeric_limits::epsilon()); } uint32_t iteration = 0; - float diff_norm = std::numeric_limits::max(); + float diff_norm = numeric_limits::max(); Eigen::SparseMatrix msquared(m->rows(), m->cols()); Eigen::SparseMatrix m_update(m->rows(), m->cols()); get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable @@ -184,14 +185,14 @@ vector> MCL::markov_process(Eigen::SparseMatrix* vector> MCL::markov_process(Eigen::MatrixXf* m, float inflation, float expansion){ for (uint32_t idiag=0; idiagcols(); ++idiag){ - assert(abs(m->coeffRef(idiag, idiag)) > std::numeric_limits::epsilon()); + assert(abs(m->coeffRef(idiag, idiag)) > numeric_limits::epsilon()); } uint32_t iteration = 0; - float diff_norm = std::numeric_limits::max(); + float diff_norm = numeric_limits::max(); Eigen::MatrixXf msquared(m->rows(), m->cols()); Eigen::MatrixXf m_update(m->rows(), m->cols()); get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable - while( iteration < 100 && diff_norm > std::numeric_limits::epsilon()*m->rows() ){ + while( iteration < 100 && diff_norm > numeric_limits::epsilon()*m->rows() ){ get_exp(m, &msquared, expansion); get_gamma(&msquared, &m_update, inflation); *m -= m_update; @@ -344,9 +345,9 @@ void MCL::run(){ auto componentInfo = ms.getComponents(); vector> indices = get<0>(componentInfo); vector>> components = get<1>(componentInfo); - std::vector sort_order(components.size()); - std::iota(sort_order.begin(), sort_order.end(), 0); - std::sort(sort_order.begin(), sort_order.end(), [&](uint32_t i, uint32_t j){return indices[i].size() > indices[j].size();}); + vector sort_order(components.size()); + iota(sort_order.begin(), sort_order.end(), 0); + sort(sort_order.begin(), sort_order.end(), [&](uint32_t i, uint32_t j){return indices[i].size() > indices[j].size();}); uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 0;}); uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector v){ return v.size() > 1;}); @@ -355,52 +356,47 @@ void MCL::run(){ timer.go("Clustering components"); bool full = false; - vector clustering_result(db->ref_header.sequences, std::numeric_limits::max()); - float max_sparsity = 0.0f; - float min_sparsity = 1.0f; - uint32_t cluster_id = 0; - uint32_t nClustersEq1 = 0; - uint32_t n_dense_calculations = 0; - uint32_t n_sparse_calculations = 0; - float inflation = (float) config.cluster_mcl_inflation; - float expansion = (float) config.cluster_mcl_expansion; - - if( full ){ - // TODO: According to the SIAM publication this is not valid, just for debugging - Eigen::SparseMatrix m = ms.getMatrix(); - for(unordered_set subset : markov_process(&m, inflation, expansion)){ - for(uint32_t el : subset){ - clustering_result[el] = cluster_id; - } - cluster_id++; - if(subset.size() == 1) nClustersEq1 ++; - } + // Note, we will access the clustering_result from several threads below and a vector does not guarantee thread-safety in these situations. + // Note also, that the use of the disjoint_set structure guarantees that each thread will access a different part of the clustering_result + uint32_t* clustering_result = new uint32_t[db->ref_header.sequences]; + for(uint64_t iseq = 0; iseq < db->ref_header.sequences; iseq++){ + clustering_result[iseq] = numeric_limits::max(); } - else { - // TODO: check when/if using dense matrices for non-sparse components improves performance - // vector threads; - // for (size_t i = 0; i < config.threads_; ++i) - // threads.emplace_back(seed_join_worker, query_idx, ref_idx, &seedp, &range, query_seed_hits, ref_seed_hits); - // for (auto &t : threads) - // t.join(); - for(uint32_t iComponent : sort_order){ + + uint32_t nThreads = min(config.threads_, nComponentsLt1); + float* max_sparsities = new float[nThreads]; + float* min_sparsities = new float[nThreads]; + atomic_uint32_t cluster_id(0); + atomic_uint32_t nClustersEq1(0); + atomic_uint32_t n_dense_calculations(0); + atomic_uint32_t n_sparse_calculations(0); + const float inflation = (float) config.cluster_mcl_inflation; + const float expansion = (float) config.cluster_mcl_expansion; + + atomic_uint32_t component_counter(0); + + auto mcl_clustering = [&](uint32_t iThr){ + const uint32_t max_counter = sort_order.size(); + uint32_t my_counter = component_counter.fetch_add(1, memory_order_relaxed); + while(my_counter < max_counter){ + uint32_t iComponent = sort_order[my_counter]; vector order = indices[iComponent]; if(order.size() > 1){ vector> m = components[iComponent]; assert(m.size() <= order.size() * order.size()); float sparsity = 1.0-(1.0 * m.size()) / (order.size()*order.size()); - max_sparsity = max(max_sparsity, sparsity); - min_sparsity = min(min_sparsity, sparsity); + max_sparsities[iThr] = max(max_sparsities[iThr], sparsity); + min_sparsities[iThr] = min(min_sparsities[iThr], sparsity); // map back to original ids vector> list_of_sets; //TODO: a size limit for the dense matrix should control this as well - std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now(); + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); if(sparsity >= config.cluster_mcl_sparsity_switch && expansion - (int) expansion == 0){ n_sparse_calculations++; Eigen::SparseMatrix m_sparse(order.size(), order.size()); m_sparse.setFromTriplets(m.begin(), m.end()); - sparse_create_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + sparse_create_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); list_of_sets = markov_process(&m_sparse, inflation, expansion); } else{ @@ -409,30 +405,83 @@ void MCL::run(){ for(Eigen::Triplet const & t : m){ m_dense(t.row(), t.col()) = t.value(); } - dense_create_time += (double) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t).count() / 1000.0; + dense_create_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); list_of_sets = markov_process(&m_dense, inflation, expansion); } for(unordered_set subset : list_of_sets){ + // FIXME: this does not assign stable cluster ids + uint32_t c_id = cluster_id.fetch_add(1, memory_order_relaxed); for(uint32_t el : subset){ - clustering_result[order[el]] = cluster_id; + clustering_result[order[el]] = c_id; } - cluster_id++; if(subset.size() == 1) nClustersEq1 ++; } } else if (order.size() == 1){ - clustering_result[order[0]] = cluster_id++; + uint32_t c_id = cluster_id.fetch_add(1, memory_order_relaxed); + clustering_result[order[0]] = c_id; nClustersEq1++; } + my_counter = component_counter.fetch_add(1, memory_order_relaxed); } + }; + + vector threads; + for(uint32_t iThread = 0; iThread < nThreads ; iThread++ ) { + min_sparsities[iThread] = 1.0f; + max_sparsities[iThread] = 0.0f; + threads.emplace_back(mcl_clustering, iThread); } + + float min_sparsity = 1.0f; + float max_sparsity = 0.0f; + for(uint32_t iThread = 0; iThread < nThreads ; iThread++ ) { + threads[iThread].join(); + min_sparsity = min(min_sparsity, min_sparsities[iThread]); + max_sparsity = max(max_sparsity, max_sparsities[iThread]); + } + delete[] min_sparsities; + delete[] max_sparsities; + timer.finish(); - cout << "Found "< seq; @@ -449,6 +498,7 @@ void MCL::run(){ } if (out != &cout) delete out; db->close(); + delete clustering_result; timer.finish(); } }} diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h index 7b11ebc5..73396135 100644 --- a/src/cluster/mcl.h +++ b/src/cluster/mcl.h @@ -27,6 +27,7 @@ along with this program. If not, see . #include #include #include +#include #include "../util/system/system.h" #include "../util/util.h" #include "../basic/config.h" @@ -53,10 +54,15 @@ class MCL: public ClusteringAlgorithm { vector> get_list(Eigen::MatrixXf* m); vector> markov_process(Eigen::SparseMatrix* m, float inflation, float expansion); vector> markov_process(Eigen::MatrixXf* m, float inflation, float expansion); - double sparse_create_time, dense_create_time = 0.0; - double sparse_exp_time, dense_int_exp_time, dense_gen_exp_time = 0.0; - double sparse_gamma_time, dense_gamma_time = 0.0; - double sparse_list_time, dense_list_time = 0.0; + atomic_uint64_t sparse_create_time = {0}; + atomic_uint64_t dense_create_time = {0}; + atomic_uint64_t sparse_exp_time = {0}; + atomic_uint64_t dense_int_exp_time = {0}; + atomic_uint64_t dense_gen_exp_time = {0}; + atomic_uint64_t sparse_gamma_time = {0}; + atomic_uint64_t dense_gamma_time = {0}; + atomic_uint64_t sparse_list_time = {0}; + atomic_uint64_t dense_list_time = {0}; public: void run(); string get_key(); From c127db88040e8685f36f93c11700f8e921966ad7 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sun, 24 May 2020 21:40:29 +0200 Subject: [PATCH 16/21] also allowing for log as input --- src/output/clustering_format.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/output/clustering_format.cpp b/src/output/clustering_format.cpp index 1f68efab..e2cddfe9 100644 --- a/src/output/clustering_format.cpp +++ b/src/output/clustering_format.cpp @@ -107,6 +107,12 @@ class RecursiveParser{ advance(1); // ')' return std::exp(result1); } + else if (peek() == 'l' && peek(1)=='o' && peek(2)=='g' && peek(3)=='('){ + advance(4); // 'log(' + double result1 = expression(); + advance(1); // ')' + return std::exp(result1); + } else { return VariableRegistry::get(variable())->get(r); } From bb3d305bc249821518651e6803ca2cf0a92696d7 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Sun, 24 May 2020 21:42:25 +0200 Subject: [PATCH 17/21] correct log --- src/output/clustering_format.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/output/clustering_format.cpp b/src/output/clustering_format.cpp index e2cddfe9..3fffbd81 100644 --- a/src/output/clustering_format.cpp +++ b/src/output/clustering_format.cpp @@ -111,7 +111,7 @@ class RecursiveParser{ advance(4); // 'log(' double result1 = expression(); advance(1); // ')' - return std::exp(result1); + return std::log(result1); } else { return VariableRegistry::get(variable())->get(r); From 8481e0d7360e40b597cead68f1b6131932648867 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Tue, 26 May 2020 09:14:24 +0200 Subject: [PATCH 18/21] change atomic types to be able to compile with clang5.0 --- src/cluster/mcl.cpp | 12 ++++++------ src/cluster/mcl.h | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index 0224a2d7..634b6e49 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -366,14 +366,14 @@ void MCL::run(){ uint32_t nThreads = min(config.threads_, nComponentsLt1); float* max_sparsities = new float[nThreads]; float* min_sparsities = new float[nThreads]; - atomic_uint32_t cluster_id(0); - atomic_uint32_t nClustersEq1(0); - atomic_uint32_t n_dense_calculations(0); - atomic_uint32_t n_sparse_calculations(0); + atomic_uint cluster_id(0); + atomic_uint nClustersEq1(0); + atomic_uint n_dense_calculations(0); + atomic_uint n_sparse_calculations(0); const float inflation = (float) config.cluster_mcl_inflation; const float expansion = (float) config.cluster_mcl_expansion; - atomic_uint32_t component_counter(0); + atomic_uint component_counter(0); auto mcl_clustering = [&](uint32_t iThr){ const uint32_t max_counter = sort_order.size(); @@ -498,7 +498,7 @@ void MCL::run(){ } if (out != &cout) delete out; db->close(); - delete clustering_result; + delete[] clustering_result; timer.finish(); } }} diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h index 73396135..5de3aaef 100644 --- a/src/cluster/mcl.h +++ b/src/cluster/mcl.h @@ -54,15 +54,15 @@ class MCL: public ClusteringAlgorithm { vector> get_list(Eigen::MatrixXf* m); vector> markov_process(Eigen::SparseMatrix* m, float inflation, float expansion); vector> markov_process(Eigen::MatrixXf* m, float inflation, float expansion); - atomic_uint64_t sparse_create_time = {0}; - atomic_uint64_t dense_create_time = {0}; - atomic_uint64_t sparse_exp_time = {0}; - atomic_uint64_t dense_int_exp_time = {0}; - atomic_uint64_t dense_gen_exp_time = {0}; - atomic_uint64_t sparse_gamma_time = {0}; - atomic_uint64_t dense_gamma_time = {0}; - atomic_uint64_t sparse_list_time = {0}; - atomic_uint64_t dense_list_time = {0}; + atomic_ullong sparse_create_time = {0}; + atomic_ullong dense_create_time = {0}; + atomic_ullong sparse_exp_time = {0}; + atomic_ullong dense_int_exp_time = {0}; + atomic_ullong dense_gen_exp_time = {0}; + atomic_ullong sparse_gamma_time = {0}; + atomic_ullong dense_gamma_time = {0}; + atomic_ullong sparse_list_time = {0}; + atomic_ullong dense_list_time = {0}; public: void run(); string get_key(); From cb6dc00516e7183d9794ba3c276a97ae7f220513 Mon Sep 17 00:00:00 2001 From: Patrick Ettenhuber Date: Thu, 28 May 2020 08:21:34 +0200 Subject: [PATCH 19/21] some minor improvements - moving atomic updates out of loop - work distribution is only governed by thread id - printing of node labels, i.e. attractor, normal and single --- src/cluster/mcl.cpp | 172 ++++++++++++++++++++++++++++++-------------- src/cluster/mcl.h | 8 +-- 2 files changed, 123 insertions(+), 57 deletions(-) diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index 634b6e49..fdfcc58c 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -17,6 +17,10 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . ****/ #include "mcl.h" +#define MASK_INVERSE 0xC0000000 +#define MASK_NORMAL_NODE 0x40000000 +#define MASK_ATTRACTOR_NODE 0x80000000 +#define MASK_SINGLE_NODE 0xC0000000 using namespace std; @@ -30,7 +34,9 @@ string MCL::get_description() { } void MCL::get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r){ +#ifdef MCL_TIMINGS chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); +#endif if( r - (int) r == 0 ){ // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues *out = *in; @@ -42,18 +48,24 @@ void MCL::get_exp(Eigen::SparseMatrix* in, Eigen::SparseMatrix* ou throw runtime_error(" Eigen does not provide an eigenvalue solver for sparse matrices"); } *out = out->pruned(); +#ifdef MCL_TIMINGS sparse_exp_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); +#endif } void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ // TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues +#ifdef MCL_TIMINGS chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); +#endif if(r - (int) r == 0){ *out = *in; for(uint32_t i=1; i(chrono::high_resolution_clock::now() - t).count(); +#endif } else{ // TODO: check whether the matrix is self-adjoint and use SelfAdjointEigenSolver instead @@ -68,12 +80,16 @@ void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ if( thr > numeric_limits::epsilon() ){ out->noalias() = (V * d.real() * V.inverse()).real(); } +#ifdef MCL_TIMINGS dense_gen_exp_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); +#endif } } void MCL::get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* out, float r){ +#ifdef MCL_TIMINGS chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); +#endif vector> data; for (uint32_t k=0; kouterSize(); ++k){ float colSum = 0.0f; @@ -86,11 +102,15 @@ void MCL::get_gamma(Eigen::SparseMatrix* in, Eigen::SparseMatrix* } out->setFromTriplets(data.begin(), data.end(), [] (const float&, const float &b) { return b; }); *out = out->pruned(1.0, numeric_limits::epsilon()); +#ifdef MCL_TIMINGS sparse_gamma_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); +#endif } void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ +#ifdef MCL_TIMINGS chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); +#endif // Note that Eigen matrices are column-major, so this is the most efficient way for (uint32_t icol=0; icolcols(); ++icol){ float colSum = 0.0f; @@ -101,33 +121,9 @@ void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){ out->coeffRef(irow, icol) = pow(in->coeffRef(irow, icol) , r) / colSum; } } +#ifdef MCL_TIMINGS dense_gamma_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); -} - -vector> MCL::get_list(Eigen::SparseMatrix* m){ - chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); - LazyDisjointIntegralSet disjointSet(m->cols()); - for (uint32_t k=0; kouterSize(); ++k){ - for (Eigen::SparseMatrix::InnerIterator it(*m, k); it; ++it){ - disjointSet.merge(it.row(), it.col()); - } - } - sparse_list_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); - return disjointSet.getListOfSets(); -} - -vector> MCL::get_list(Eigen::MatrixXf* m){ - chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); - LazyDisjointIntegralSet disjointSet(m->cols()); - for (uint32_t icol=0; icolcols(); ++icol){ - for (uint32_t irow=0; irowrows(); ++irow){ - if( abs((*m)(irow, icol)) > numeric_limits::epsilon()){ - disjointSet.merge(irow, icol); - } - } - } - dense_list_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); - return disjointSet.getListOfSets(); +#endif } // void printMatrix(Eigen::SparseMatrix* m){ @@ -163,7 +159,7 @@ vector> MCL::get_list(Eigen::MatrixXf* m){ // printf("]\n\n"); // } -vector> MCL::markov_process(Eigen::SparseMatrix* m, float inflation, float expansion){ +void MCL::markov_process(Eigen::SparseMatrix* m, float inflation, float expansion){ for (uint32_t idiag=0; idiagcols(); ++idiag){ assert(abs(m->coeffRef(idiag, idiag)) > numeric_limits::epsilon()); } @@ -180,10 +176,9 @@ vector> MCL::markov_process(Eigen::SparseMatrix* *m = m_update; iteration++; } - return get_list(m); } -vector> MCL::markov_process(Eigen::MatrixXf* m, float inflation, float expansion){ +void MCL::markov_process(Eigen::MatrixXf* m, float inflation, float expansion){ for (uint32_t idiag=0; idiagcols(); ++idiag){ assert(abs(m->coeffRef(idiag, idiag)) > numeric_limits::epsilon()); } @@ -200,7 +195,6 @@ vector> MCL::markov_process(Eigen::MatrixXf* m, float in *m = m_update; iteration++; } - return get_list(m); } void MCL::run(){ @@ -364,21 +358,27 @@ void MCL::run(){ } uint32_t nThreads = min(config.threads_, nComponentsLt1); + // note that the disconnected components are sorted by size + const uint32_t max_counter = sort_order.size(); float* max_sparsities = new float[nThreads]; float* min_sparsities = new float[nThreads]; - atomic_uint cluster_id(0); - atomic_uint nClustersEq1(0); - atomic_uint n_dense_calculations(0); - atomic_uint n_sparse_calculations(0); const float inflation = (float) config.cluster_mcl_inflation; const float expansion = (float) config.cluster_mcl_expansion; + atomic_uint n_clusters_found(0); atomic_uint component_counter(0); + atomic_uint n_dense_calculations(0); + atomic_uint n_sparse_calculations(0); + atomic_uint nClustersEq1(0); - auto mcl_clustering = [&](uint32_t iThr){ - const uint32_t max_counter = sort_order.size(); - uint32_t my_counter = component_counter.fetch_add(1, memory_order_relaxed); - while(my_counter < max_counter){ + auto mcl_clustering = [&](const uint32_t iThr){ + uint32_t n_dense = 0; + uint32_t n_sparse = 0; + uint32_t n_singletons = 0; + uint32_t cluster_id = iThr; + //uint32_t my_counter = component_counter.fetch_add(1, memory_order_relaxed); + //while(my_counter < max_counter){ + for(uint32_t my_counter = iThr; my_counter order = indices[iComponent]; if(order.size() > 1){ @@ -389,41 +389,89 @@ void MCL::run(){ min_sparsities[iThr] = min(min_sparsities[iThr], sparsity); // map back to original ids vector> list_of_sets; + unordered_set attractors; //TODO: a size limit for the dense matrix should control this as well +#ifdef MCL_TIMINGS chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); +#endif if(sparsity >= config.cluster_mcl_sparsity_switch && expansion - (int) expansion == 0){ - n_sparse_calculations++; + n_sparse++; Eigen::SparseMatrix m_sparse(order.size(), order.size()); m_sparse.setFromTriplets(m.begin(), m.end()); +#ifdef MCL_TIMINGS sparse_create_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); - list_of_sets = markov_process(&m_sparse, inflation, expansion); +#endif + markov_process(&m_sparse, inflation, expansion); +#ifdef MCL_TIMINGS + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); +#endif + LazyDisjointIntegralSet disjointSet(m_sparse.cols()); + for (uint32_t k=0; k::InnerIterator it(m_sparse, k); it; ++it){ + assert(abs(it.value()) > numeric_limits::epsilon()); + disjointSet.merge(it.row(), it.col()); + if(it.row() == it.col()){ + attractors.emplace(it.row()); + } + } + } + list_of_sets = disjointSet.getListOfSets(); +#ifdef MCL_TIMINGS + sparse_list_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); +#endif } else{ - n_dense_calculations++; + n_dense++; Eigen::MatrixXf m_dense = Eigen::MatrixXf::Zero(order.size(), order.size()); for(Eigen::Triplet const & t : m){ m_dense(t.row(), t.col()) = t.value(); } +#ifdef MCL_TIMINGS dense_create_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); - list_of_sets = markov_process(&m_dense, inflation, expansion); +#endif + markov_process(&m_dense, inflation, expansion); +#ifdef MCL_TIMINGS + chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now(); +#endif + LazyDisjointIntegralSet disjointSet(m_dense.cols()); + for (uint32_t icol=0; icol numeric_limits::epsilon()){ + disjointSet.merge(irow, icol); + if(irow == icol){ + attractors.emplace(irow); + } + } + } + } + list_of_sets = disjointSet.getListOfSets(); +#ifdef MCL_TIMINGS + dense_list_time += chrono::duration_cast(chrono::high_resolution_clock::now() - t).count(); +#endif } for(unordered_set subset : list_of_sets){ - // FIXME: this does not assign stable cluster ids - uint32_t c_id = cluster_id.fetch_add(1, memory_order_relaxed); + assert(cluster_id < 0x00ffffff); for(uint32_t el : subset){ - clustering_result[order[el]] = c_id; + const uint32_t mask = attractors.find(el) == attractors.end() ? MASK_NORMAL_NODE : MASK_ATTRACTOR_NODE; + clustering_result[order[el]] = mask | cluster_id; } - if(subset.size() == 1) nClustersEq1 ++; + if(subset.size() == 1) n_singletons++; + cluster_id += nThreads; } } else if (order.size() == 1){ - uint32_t c_id = cluster_id.fetch_add(1, memory_order_relaxed); - clustering_result[order[0]] = c_id; - nClustersEq1++; + assert(cluster_id < 0x00ffffff); + clustering_result[order[0]] = MASK_SINGLE_NODE | cluster_id; + cluster_id += nThreads; + n_singletons++; } - my_counter = component_counter.fetch_add(1, memory_order_relaxed); + //my_counter = component_counter.fetch_add(1, memory_order_relaxed); } + n_clusters_found.fetch_add((cluster_id-iThr)/nThreads, memory_order_relaxed); + n_dense_calculations.fetch_add(n_dense, memory_order_relaxed); + n_sparse_calculations.fetch_add(n_sparse, memory_order_relaxed); + nClustersEq1.fetch_add(n_singletons, memory_order_relaxed); }; vector threads; @@ -445,8 +493,8 @@ void MCL::run(){ timer.finish(); cout << "Found " - << cluster_id - nClustersEq1 - << " ("<< cluster_id <<" incl. singletons) clusters with min sparsity " + << n_clusters_found - nClustersEq1 + << " ("<< n_clusters_found <<" incl. singletons) clusters with min sparsity " << min_sparsity << " and max. sparsity " << max_sparsity @@ -455,6 +503,7 @@ void MCL::run(){ << " dense, and " << n_sparse_calculations << " sparse calculations " << endl; +#ifdef MCL_TIMINGS cout << "Time used for matrix creation: " << (sparse_create_time.load() + dense_create_time.load())/1000.0 <<" (sparse: " @@ -481,6 +530,7 @@ void MCL::run(){ << sparse_list_time.load()/1000.0 << ", dense: " << dense_list_time.load()/1000.0 <<")" << endl; +#endif timer.go("Cluster output"); ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str()); @@ -492,7 +542,23 @@ void MCL::run(){ out->precision(3); for (int i = 0; i < (int)db->ref_header.sequences; ++i) { db->read_seq(id, seq); - (*out) << blast_id(id) << '\t' << clustering_result[i] << endl; + const uint32_t res = clustering_result[i]; + (*out) << blast_id(id) << '\t' << (res & (~MASK_INVERSE)) << '\t'; + switch(res & MASK_INVERSE){ + case MASK_SINGLE_NODE: + (*out) << 's'; + break; + case MASK_ATTRACTOR_NODE: + (*out) << 'a'; + break; + case MASK_NORMAL_NODE: + (*out) << 'n'; + break; + default: + (*out) << (res & MASK_INVERSE) <* in, Eigen::SparseMatrix* out, float r); void get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r); - vector> get_list(Eigen::SparseMatrix* m); - vector> get_list(Eigen::MatrixXf* m); - vector> markov_process(Eigen::SparseMatrix* m, float inflation, float expansion); - vector> markov_process(Eigen::MatrixXf* m, float inflation, float expansion); + void markov_process(Eigen::SparseMatrix* m, float inflation, float expansion); + void markov_process(Eigen::MatrixXf* m, float inflation, float expansion); +#ifdef MCL_TIMINGS atomic_ullong sparse_create_time = {0}; atomic_ullong dense_create_time = {0}; atomic_ullong sparse_exp_time = {0}; @@ -63,6 +62,7 @@ class MCL: public ClusteringAlgorithm { atomic_ullong dense_gamma_time = {0}; atomic_ullong sparse_list_time = {0}; atomic_ullong dense_list_time = {0}; +#endif public: void run(); string get_key(); From 026c9ef52ed3374ddddee0459b19f705dd1d9646 Mon Sep 17 00:00:00 2001 From: Benjamin Buchfink Date: Fri, 5 Jun 2020 11:48:21 +0200 Subject: [PATCH 20/21] Fixed include. --- src/cluster/mcl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp index fdfcc58c..a6696278 100644 --- a/src/cluster/mcl.cpp +++ b/src/cluster/mcl.cpp @@ -16,6 +16,8 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ****/ + +#include #include "mcl.h" #define MASK_INVERSE 0xC0000000 #define MASK_NORMAL_NODE 0x40000000 From 2e4bb1082f63a4a2890cf150a40d8f6267b3cd89 Mon Sep 17 00:00:00 2001 From: Benjamin Buchfink Date: Fri, 5 Jun 2020 19:03:39 +0200 Subject: [PATCH 21/21] Fixed ifdef. --- src/tools/benchmark.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tools/benchmark.cpp b/src/tools/benchmark.cpp index 5337cd9b..46a2d247 100644 --- a/src/tools/benchmark.cpp +++ b/src/tools/benchmark.cpp @@ -80,7 +80,7 @@ void benchmark_ungapped(const sequence &s1, const sequence &s2) cout << "Scalar ungapped extension:\t" << (double)time_span.count() / (n*64) * 1000 << " ps/Cell" << endl; } -#ifdef __SSSE3__ +#if defined(__SSSE3__) && defined(__SSE4_1__) void benchmark_ssse3_shuffle(const sequence &s1, const sequence &s2) { static const size_t n = 100000000llu; @@ -262,7 +262,7 @@ void benchmark() { benchmark_hamming(s1, s2); #endif benchmark_ungapped(ss1, ss2); -#ifdef __SSSE3__ +#if defined(__SSSE3__) && defined(__SSE4_1__) benchmark_ssse3_shuffle(s1, s2); #endif #ifdef __SSE4_1__