From 2c8b14c61832ed78c52bb623fd15fe50af03f42f Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Thu, 27 Feb 2020 08:53:48 +0100
Subject: [PATCH 01/21] inital commit for transitive closure clustering

---
 src/basic/config.cpp           |   6 +-
 src/basic/config.h             |   5 +-
 src/run/cluster.cpp            | 129 +++++++++++++++++---
 src/run/disjoint_set.h         | 216 +++++++++++++++++++++++++++++++++
 src/util/command_line_parser.h |   2 +-
 5 files changed, 340 insertions(+), 18 deletions(-)
 create mode 100644 src/run/disjoint_set.h
diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index f93ffa44..f3c93a0f 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -143,6 +143,10 @@ Config::Config(int argc, const char **argv, bool check_io)
 	makedb.add()
 		("in", 0, "input reference file in FASTA format", input_ref_file);
 
+	Options_group cluster("Cluster options");
+	cluster.add()
+		("calgo", 0, "Clustering algorithm (0=multi-step/1=transitive-closure)", cluster_algo, -1);
+
 	Options_group aligner("Aligner options");
 	aligner.add()
 		("query", 'q', "input query file", query_file)
@@ -306,7 +310,7 @@ Config::Config(int argc, const char **argv, bool check_io)
 		("cut-bar", 0, "", cut_bar)
 		("bootstrap", 0, "", bootstrap);
 	
-	parser.add(general).add(makedb).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options);
+	parser.add(general).add(makedb).add(cluster).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options);
 	parser.store(argc, argv, command);
 
 	if (long_reads) {
diff --git a/src/basic/config.h b/src/basic/config.h
index a4f162eb..73cc9c8e 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -206,6 +206,9 @@ struct Config
 	enum { double_indexed = 0, query_indexed = 1, subject_indexed = 2 };
 	int algo;
 
+	enum { multi_step = 0, transitive_closure = 1};
+	int cluster_algo;
+
 	enum { query_parallel = 0, target_parallel = 1 };
 	unsigned load_balancing;
 
@@ -275,4 +278,4 @@ struct Config
 
 extern Config config;
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp
index 0943990e..7f1f4f13 100644
--- a/src/run/cluster.cpp
+++ b/src/run/cluster.cpp
@@ -28,6 +28,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../basic/config.h"
 #include "../data/reference.h"
 #include "workflow.h"
+#include "disjoint_set.h"
 #include "../util/io/consumer.h"
 #include "../util/algo/algo.h"
 #include "../basic/statistics.h"
@@ -60,6 +61,32 @@ struct Neighbors : public vector<vector<int>>, public Consumer {
 	vector<Util::Algo::Edge> edges;
 };
 
+class NeighborStream : public Consumer {
+	LazyDisjointSet<size_t>* disjointSet;
+	virtual void consume(const char *ptr, size_t n) override {
+		size_t query, subject, count;
+		float qcov, scov, bitscore;
+		const char *end = ptr + n;
+		while (ptr < end) {
+			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
+			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &count) != 5)
+				throw runtime_error("Cluster format error.");
+			disjointSet->merge(query, subject);
+			ptr += count;
+		}
+	}
+public:
+	NeighborStream(size_t n){
+		disjointSet = new LazyDisjointIntegralSet<size_t>(n); 
+	}
+	~NeighborStream(){
+		delete disjointSet;
+	}
+	vector<unordered_set<size_t>> getListOfSets(){
+		return disjointSet->getListOfSets();
+	}
+};
+
 vector<bool> rep_bitset(const vector<int> &centroid, const vector<bool> *superset = nullptr) {
 	vector<bool> r(centroid.size());
 	for (int c : centroid)
@@ -93,22 +120,18 @@ vector<int> cluster(DatabaseFile &db, const vector<bool> *filter) {
 	return Util::Algo::greedy_vortex_cover(nb);
 }
 
-void run() {
-	if (config.database == "")
-		throw runtime_error("Missing parameter: database file (--db/-d)");
-	config.command = Config::makedb;
-	unique_ptr<DatabaseFile> db(DatabaseFile::auto_create_from_fasta());
-	const size_t seq_count = db->ref_header.sequences;
+void run_two_step_clustering(DatabaseFile& db){
+	const size_t seq_count = db.ref_header.sequences;
 
 	config.min_id = 70;
-	vector<int> centroid1(cluster(*db, nullptr));
+	vector<int> centroid1(cluster(db, nullptr));
 	vector<bool> rep1(rep_bitset(centroid1));
 	size_t n_rep1 = count_if(rep1.begin(), rep1.end(), [](bool x) { return x; });
 	message_stream << "Clustering step 1 complete. #Input sequences: " << centroid1.size() << " #Clusters: " << n_rep1 << endl;
 
 	config.mode_more_sensitive = true;
 	config.min_id = 0;
-	vector<int> centroid2(cluster(*db, &rep1));
+	vector<int> centroid2(cluster(db, &rep1));
 	vector<bool> rep2(rep_bitset(centroid2, &rep1));
 	for (size_t i = 0; i < centroid2.size(); ++i)
 		if (!rep1[i])
@@ -119,21 +142,21 @@ void run() {
 	Sequence_set *rep_seqs;
 	String_set<0> *rep_ids;
 	vector<unsigned> rep_database_id, rep_block_id(seq_count);
-	db->rewind();
-	db->load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2);
+	db.rewind();
+	db.load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2);
 	for (size_t i = 0; i < rep_database_id.size(); ++i)
 		rep_block_id[rep_database_id[i]] = (unsigned)i;
 
 	ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str());
 	vector<char> seq;
 	string id;
-	db->seek_direct();
+	db.seek_direct();
 	Hsp hsp;
 	size_t n;
 	out->precision(3);
 
-	for (int i = 0; i < (int)db->ref_header.sequences; ++i) {
-		db->read_seq(id, seq);
+	for (int i = 0; i < (int)db.ref_header.sequences; ++i) {
+		db.read_seq(id, seq);
 		const unsigned r = rep_block_id[centroid2[i]];
 
 		(*out) << blast_id(id) << '\t'
@@ -151,10 +174,86 @@ void run() {
 		}
 	}
 
-	db->close();
 	if (out != &cout) delete out;
 	delete rep_seqs;
 	delete rep_ids;
 }
 
-}}
\ No newline at end of file
+
+void run_transitive_closure_clustering(DatabaseFile& db){
+	// testing
+	// LazyDisjointIntegralSet<uint32_t> test(5);
+	// test.merge(1,3);
+	// test.merge(3,4);
+	// test.merge(0,2);
+	// auto los = test.getListOfSets();
+	// for(auto& set: los){
+	// 	for(auto item:set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
+	// unordered_set<string> s ( {"red","green","blue","rain", "sun", "snow"} );
+	// LazyDisjointTypeSet<string> ts(&s);
+
+	// ts.merge("red", "green");
+	// ts.merge("snow", "rain");
+	// ts.merge("sun", "rain");
+	// auto lo = ts.getListOfSets();
+	// for(auto& set: lo){
+	// 	cout<< "set :";
+	// 	for(auto item:set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
+
+	const size_t seq_count = db.ref_header.sequences;
+	statistics.reset();
+	config.command = Config::blastp;
+	config.no_self_hits = true;
+	//config.output_format = { "6", "qnum", "snum" };
+	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore" };
+	config.query_cover = 80;
+	config.subject_cover = 80;
+	config.algo = 0;
+	config.index_mode = 0;
+	config.freq_sd = 0;
+	config.max_alignments = numeric_limits<uint64_t>::max();
+
+	Workflow::Search::Options opt;
+	opt.db = &db;
+	opt.self = true;
+	NeighborStream ns(db.ref_header.sequences);
+	opt.consumer = &ns;
+	opt.db_filter = nullptr;
+
+	Workflow::Search::run(opt);
+
+	auto clusterResult = ns.getListOfSets();
+	cout << "Found "<<clusterResult.size()<<" clusters"<<endl;
+	// for(auto& set: clusterResult){
+	// 	cout<< "set :";
+	// 	for(auto item : set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
+}
+
+void run() {
+	if (config.database == "")
+		throw runtime_error("Missing parameter: database file (--db/-d)");
+	config.command = Config::makedb;
+	unique_ptr<DatabaseFile> db(DatabaseFile::auto_create_from_fasta());
+	switch(config.cluster_algo){
+		case Config::multi_step:
+			run_two_step_clustering(*db);
+			break;
+		case Config::transitive_closure:
+			run_transitive_closure_clustering(*db);
+			break;
+	}
+	db->close();
+}
+}}
diff --git a/src/run/disjoint_set.h b/src/run/disjoint_set.h
new file mode 100644
index 00000000..4b7acc7b
--- /dev/null
+++ b/src/run/disjoint_set.h
@@ -0,0 +1,216 @@
+#include <algorithm>
+#include <stdio.h>
+#include <fstream>
+#include <limits>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+using namespace std;
+
+template<typename T> class Node {
+	Node<T>* parent = this;
+	uint32_t r = 0;
+public:
+	virtual ~Node(){};
+	virtual Node<T>* getParent(){
+		return parent;
+	}
+	virtual void setParent(Node<T>* p){
+		parent = p;
+	}
+
+	virtual const T* getValue() = 0;
+
+	virtual uint32_t getCount(){
+		return r;
+	}
+	virtual void incrementCount(){
+		r++;
+	}
+};
+
+template<typename T> class IntegralNode: public Node<T> {
+private:
+	T value;
+public:
+	virtual ~IntegralNode(){};
+	IntegralNode<T>(T v){
+		value = v;
+	}
+	virtual const T* getValue(){
+		return &value;
+	}
+};
+
+template<typename T> class TypeNode: public Node<T> {
+private:
+	const T* value;
+	uint32_t r;
+public:
+	virtual ~TypeNode(){};
+	TypeNode<T>(const T* v){
+		value = v;
+	}
+	virtual const T* getValue(){
+		return value;
+	}
+};
+
+template<typename T> class LazyDisjointSet {
+private:
+	virtual Node<T>* get(const T i) = 0;
+	virtual Node<T>* get(const T* i) = 0;
+	virtual uint64_t size() = 0;
+	virtual vector<Node<T>*>* getNodes() = 0;
+public:
+	virtual ~LazyDisjointSet(){};
+
+	virtual Node<T>* getRoot(Node<T>* x) {
+		if (x->getParent() != x) {
+			// flatten the tree while traversing
+			x->setParent(getRoot(x->getParent()));
+		}
+		return x->getParent();
+	}
+
+	virtual void merge(Node<T>* x, Node<T>* y) {
+		if (x != y) {
+			Node<T>* rootX = getRoot(x);
+			Node<T>* rootY = getRoot(y);
+			if (rootX != rootY) {
+				if (rootX->getCount() < rootY->getCount()) {
+					rootX->setParent(rootY);
+				}
+				else if (rootX->getCount() > rootY->getCount()) {
+					rootY->setParent(rootX);
+				}
+				else {
+					rootX->incrementCount();
+					rootY->setParent(rootX);
+				}
+			}
+		}
+	}
+
+	virtual void merge(const T* x, const T* y) {
+		merge(get(x), get(y));
+	}
+
+	void merge(T x, T y) {
+		merge(get(x), get(y)); 
+	}
+
+	virtual vector<unordered_set<T>> getListOfSets() {
+		unordered_map<const T*, unordered_set<T>*> map;
+		for (Node<T>* n : *(getNodes())) {
+			if (n == nullptr) {
+				continue;
+			}
+			const T* r = getRoot(n)->getValue();
+			auto it = map.find(r);
+			T v = *(n->getValue());
+			if (it == map.end()) {
+				it = map.emplace(r, new unordered_set<T>()).first;
+			}
+			it->second->emplace(v);
+		}
+		vector<unordered_set<T>> listOfSets;
+		for(auto el=map.begin(); el != map.end(); el++){
+			listOfSets.emplace_back(move(*(el->second)));
+		}
+		return listOfSets;
+	}
+};
+
+template<typename T> class LazyDisjointIntegralSet : public LazyDisjointSet<T> {
+private:
+	vector<Node<T>*> nodes;
+
+	virtual Node<T>* get(T i){
+		assert(i >= 0 && nodes.size() > i);
+		if(nodes[i] == nullptr){
+			nodes[i] = new IntegralNode<T>(i);
+		}
+		return nodes[i];
+	}
+
+	virtual Node<T>* get(const T* i) {
+		return get(*i);
+	}
+
+	virtual uint64_t size(){
+		return nodes.size();
+	}
+	virtual vector<Node<T>*>* getNodes(){
+		return &nodes;
+	}
+
+public:
+	static_assert(std::is_integral<T>::value, "T needs to be an integral type");
+
+	LazyDisjointIntegralSet<T>(T size) {
+		nodes = vector<Node<T>*>(size, nullptr);
+	}
+
+	virtual ~LazyDisjointIntegralSet<T>() {
+		for(Node<T>* n : nodes){
+			if (n != nullptr){
+				delete n;
+				n = nullptr;
+			}
+		}
+		nodes.clear();
+	}
+};
+
+template<typename T> class LazyDisjointTypeSet : public LazyDisjointSet<T> {
+private:
+	unordered_set<T>* elements;
+	vector<Node<T>*> nodes;
+	unordered_map<const T*, uint64_t> mapping;
+	bool isIntegralAndConsecutive;
+
+	virtual Node<T>* get(const T* i) {
+		auto found = mapping.find(i);
+		assert(found != mapping.end());
+		return nodes[found->second];
+	}
+
+	virtual Node<T>* get(T i) {
+		auto found = elements->find(i);
+		assert(found != elements->end());
+		return get(&(*found));
+	}
+
+	virtual uint64_t size(){
+		return nodes.size();
+	}
+	virtual vector<Node<T>*>* getNodes(){
+		return &nodes;
+	}
+
+public:
+	LazyDisjointTypeSet<T>(unordered_set<T>* s) {
+		// note that the structure is backed by the set, so it should not be modified
+		isIntegralAndConsecutive = false;
+		elements = s;
+		nodes = vector<Node<T>*>(elements->size());
+		mapping = unordered_map<const T*, uint64_t>(elements->size());
+		for(auto e = elements->begin(); e != elements->end(); e++){
+			mapping[&(*e)] = nodes.size();
+			Node<T>* n = new TypeNode<T>(&(*e));
+			nodes.push_back(n);
+		}
+	}
+	virtual ~LazyDisjointTypeSet<T>() {
+		mapping.clear();
+		for(Node<T>* n : nodes){
+			if (n != nullptr){
+				delete n;
+				n = nullptr;
+			}
+		}
+		nodes.clear();
+	}
+};
diff --git a/src/util/command_line_parser.h b/src/util/command_line_parser.h
index 1b33eee3..c08231b8 100644
--- a/src/util/command_line_parser.h
+++ b/src/util/command_line_parser.h
@@ -166,4 +166,4 @@ struct Command_line_parser
 	std::vector<std::pair<std::string,std::string> > commands_;
 };
 
-#endif
\ No newline at end of file
+#endif

From d59f33449fd056333dd5c4af9af53702f8c8fc9b Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sat, 11 Apr 2020 08:45:46 +0200
Subject: [PATCH 02/21] implemented markov clustering with sparse matrices

---
 src/run/cluster.cpp     | 235 ++++++++++++++++++-
 src/run/sparse_matrix.h | 498 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 730 insertions(+), 3 deletions(-)
 create mode 100644 src/run/sparse_matrix.h

diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp
index 610b25f7..f3ab09c7 100644
--- a/src/run/cluster.cpp
+++ b/src/run/cluster.cpp
@@ -29,6 +29,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../data/reference.h"
 #include "workflow.h"
 #include "disjoint_set.h"
+#include "sparse_matrix.h"
 #include "../util/io/consumer.h"
 #include "../util/algo/algo.h"
 #include "../basic/statistics.h"
@@ -87,6 +88,48 @@ class NeighborStream : public Consumer {
 	}
 };
 
+template <typename T>
+class SparseMatrixStream : public Consumer {
+	SparseSimpleMatrix<T>* m = nullptr;
+	LazyDisjointSet<size_t>* disjointSet;
+	virtual void consume(const char *ptr, size_t n) override {
+		size_t query, subject, count;
+		float qcov, scov, bitscore;
+		const char *end = ptr + n;
+		while (ptr < end) {
+			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
+			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) 
+				throw runtime_error("Cluster format error.");
+			m->set_elm(query, subject, bitscore);
+			disjointSet->merge(query, subject);
+			ptr += count;
+		}
+	}
+public:
+	SparseMatrixStream(size_t n, bool symmetric){
+		disjointSet = new LazyDisjointIntegralSet<size_t>(n); 
+		if( symmetric ){
+			m = new SparseSymmetricSimpleMatrix<T>(n);
+		}
+		else{
+			m = new SparseSimpleMatrixImpl<T>(n, n);
+		}
+	}
+	~SparseMatrixStream(){
+		if(m) delete m;
+		delete disjointSet;
+	}
+	SparseSimpleMatrix<T>* getMatrix(){
+		cout << " getting matrix " << m->get_nrows() << " " << m->get_ncols() << " " << m->get_n_nonzero_elements() << " sparsity "<< (1.0 * m->get_n_nonzero_elements()) / (m->get_nrows()* m->get_ncols()) << endl;
+		SparseSimpleMatrix<T>* h = m;
+		m = nullptr;
+		return h;
+	}
+	vector<unordered_set<size_t>> getListOfSets(){
+		return disjointSet->getListOfSets();
+	}
+};
+
 vector<bool> rep_bitset(const vector<int> &centroid, const vector<bool> *superset = nullptr) {
 	vector<bool> r(centroid.size());
 	for (int c : centroid)
@@ -116,7 +159,7 @@ vector<int> cluster(DatabaseFile &db, const vector<bool> *filter) {
 	opt.db_filter = filter;
 
 	Workflow::Search::run(opt);
-	
+
 	return Util::Algo::greedy_vortex_cover(nb);
 }
 
@@ -158,10 +201,8 @@ void run_two_step_clustering(DatabaseFile& db){
 	for (int i = 0; i < (int)db.ref_header.sequences; ++i) {
 		db.read_seq(id, seq);
 		const unsigned r = rep_block_id[centroid2[i]];
-
 		(*out) << blast_id(id) << '\t'
 			<< blast_id((*rep_ids)[r]) << '\t';		
-		
 		if ((int)i == centroid2[i])
 			(*out) << "100\t100\t100\t0" << endl;
 		else {
@@ -241,6 +282,191 @@ void run_transitive_closure_clustering(DatabaseFile& db){
 	// }
 }
 
+SparseSimpleMatrix<float>* get_gamma(SparseSimpleMatrix<float>* m, uint32_t r){
+	SparseSimpleMatrix<float>* gamma = new SparseSimpleMatrixImpl<float>(m->get_nrows(), m->get_ncols());
+	float* column_sum = new float[m->get_ncols()];
+	for(uint32_t icol = 0; icol< m->get_ncols(); icol++){
+		column_sum[icol] = 0.0f;
+	}
+	for(auto it = m->begin(); it != m->end(); it++){
+		uint32_t i,j;
+		m->get_indices(it->first, &i, &j);
+		float el = pow(it->second, r);
+		gamma->set_elm(i,j,el);
+		column_sum[j] += el;
+	}
+	for(auto it = gamma->begin(); it != gamma->end(); it++){
+		uint32_t i,j;
+		gamma->get_indices(it->first, &i, &j);
+		gamma->set_elm(i,j, it->second/column_sum[j]);
+	}
+	return gamma;
+}
+SimpleMatrix<float>* get_gamma(SimpleMatrix<float>* m, uint32_t r){
+	SimpleMatrix<float>* gamma = new SparseSimpleMatrixImpl<float>(m->get_nrows(), m->get_ncols());
+	float* column_sum = new float[m->get_ncols()];
+	for(uint32_t icol = 0; icol< m->get_ncols(); icol++){
+		column_sum[icol] = 0.0f;
+	}
+	for(uint32_t irow = 0; irow < m->get_nrows(); irow++){
+		for(uint32_t icol = 0; icol < m->get_ncols(); icol++){
+			column_sum[icol]+=pow(m->get_elm(irow, icol), r);
+		}
+	}
+	for(uint32_t irow = 0; irow < m->get_nrows(); irow++){
+		for(uint32_t icol = 0; icol < m->get_ncols(); icol++){
+			gamma->set_elm(irow, icol, pow(m->get_elm(irow, icol),r) / column_sum[icol]);
+		}
+	}
+	return gamma;
+}
+
+SimpleMatrix<float>* get_exp(SimpleMatrix<float>* m, uint32_t r){
+	// TODO: make this more efficient
+	SimpleMatrix<float>* tmp1 = m->multiply(m);
+	for(uint32_t i = 2; i<r; i++){
+		SimpleMatrix<float>* tmp2 = tmp1->multiply(tmp1);
+		delete tmp1;
+		tmp1 = tmp2;
+	}
+	return tmp1;
+}
+vector<unordered_set<uint32_t>> get_list(SimpleMatrix<float>* m){
+	LazyDisjointIntegralSet<uint32_t> disjointSet(m->get_ncols());
+	for(uint32_t irow = 0; irow < m->get_nrows(); irow++){
+		for(uint32_t icol = 0; icol < m->get_ncols(); icol++){
+			if(abs(m->get_elm(irow, icol)) > 1e-13){
+				disjointSet.merge(irow, icol);
+			}
+		}
+	}
+	return disjointSet.getListOfSets();
+}
+vector<unordered_set<uint32_t>> get_list(SparseSimpleMatrix<float>* m){
+	LazyDisjointIntegralSet<uint32_t> disjointSet(m->get_ncols());
+	for(auto it = m->begin(); it != m->end(); it++){
+		uint32_t irow, icol;
+		m->get_indices(it->first, &irow, &icol);
+		disjointSet.merge(irow, icol);
+	}
+	return disjointSet.getListOfSets();
+}
+
+
+vector<unordered_set<uint32_t>> markov_process(SimpleMatrix<float>* m){
+	uint32_t iteration = 0;
+	double diff_norm = std::numeric_limits<double>::max();
+	while( iteration < 100 && diff_norm > 1e-3 ){
+		SimpleMatrix<float>* msquared = get_exp(m, 2);
+		SimpleMatrix<float>* m_update = get_gamma(msquared, 2);
+		delete msquared;
+		SimpleMatrix<float>* diff_m = m_update->minus(m);
+		diff_norm = diff_m->norm(2.0,2.0);
+		delete m;
+		delete diff_m;
+		m = m_update;
+		m_update = nullptr;
+		iteration++;
+	}
+	cout<< "Markov Process converged after " << iteration <<" iterations " << diff_norm << endl;
+	return get_list(m);
+}
+
+void run_markov_clustering(DatabaseFile& db){
+	// testing
+	// SparseSimpleMatrix<float>* tmp = new SparseSimpleMatrixImpl<float>(12, 12);
+	// tmp->set_elm(0, 0, 0.2f);
+	// tmp->set_elm(0, 1, 0.25f);
+	// tmp->set_elm(0, 5, 0.333f);
+	// tmp->set_elm(0, 6, 0.25f);
+	// tmp->set_elm(0, 9, 0.25f);
+	// tmp->set_elm(1, 0, 0.20f);
+	// tmp->set_elm(1, 1, 0.25f);
+	// tmp->set_elm(1, 2, 0.25f);
+	// tmp->set_elm(1, 4, 0.20f);
+	// tmp->set_elm(2, 1, 0.25f);
+	// tmp->set_elm(2, 2, 0.25f);
+	// tmp->set_elm(2, 3, 0.20f);
+	// tmp->set_elm(2, 4, 0.20f);
+	// tmp->set_elm(3, 2, 0.25f);
+	// tmp->set_elm(3, 3, 0.20f);
+	// tmp->set_elm(3, 7, 0.20f);
+	// tmp->set_elm(3, 8, 0.20f);
+	// tmp->set_elm(3, 10, 0.20f);
+	// tmp->set_elm(4, 1, 0.25f);
+	// tmp->set_elm(4, 2, 0.25f);
+	// tmp->set_elm(4, 4, 0.20f);
+	// tmp->set_elm(4, 6, 0.25f);
+	// tmp->set_elm(4, 7, 0.20f);
+	// tmp->set_elm(5, 0, 0.20f);
+	// tmp->set_elm(5, 5, 0.333f);
+	// tmp->set_elm(5, 9, 0.25f);
+	// tmp->set_elm(6, 0, 0.20f);
+	// tmp->set_elm(6, 4, 0.20f);
+	// tmp->set_elm(6, 6, 0.25f);
+	// tmp->set_elm(6, 9, 0.25f);
+	// tmp->set_elm(7, 3, 0.20f);
+	// tmp->set_elm(7, 4, 0.20f);
+	// tmp->set_elm(7, 7, 0.20f);
+	// tmp->set_elm(7, 8, 0.20f);
+	// tmp->set_elm(7, 10, 0.20f);
+	// tmp->set_elm(8, 3, 0.20f);
+	// tmp->set_elm(8, 7, 0.20f);
+	// tmp->set_elm(8, 8, 0.20f);
+	// tmp->set_elm(8, 10, 0.20f);
+	// tmp->set_elm(8, 11, 0.333f);
+	// tmp->set_elm(9, 0, 0.20f);
+	// tmp->set_elm(9, 5, 0.333f);
+	// tmp->set_elm(9, 6, 0.25f);
+	// tmp->set_elm(9, 9, 0.25f);
+	// tmp->set_elm(10, 3, 0.20f);
+	// tmp->set_elm(10, 7, 0.20f);
+	// tmp->set_elm(10, 8, 0.20f);
+	// tmp->set_elm(10, 10, 0.20f);
+	// tmp->set_elm(10, 11, 0.333f);
+	// tmp->set_elm(11, 8, 0.20f);
+	// tmp->set_elm(11, 10, 0.20f);
+	// tmp->set_elm(11, 11, 0.333f);
+	// auto cr = markov_process(tmp);
+	// cout << "Found "<<cr.size()<<" clusters"<<endl;
+	// for(auto& set: cr){
+	// 	cout<< "set :";
+	// 	for(auto item : set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
+	// delete tmp;
+	// return;
+
+
+	const size_t seq_count = db.ref_header.sequences;
+	statistics.reset();
+	config.command = Config::blastp;
+	config.no_self_hits = true;
+	//config.output_format = { "6", "qnum", "snum" };
+	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore" };
+	config.query_cover = 80;
+	config.subject_cover = 80;
+	config.algo = 0;
+	config.index_mode = 0;
+	config.freq_sd = 0;
+	config.max_alignments = numeric_limits<uint64_t>::max();
+
+	Workflow::Search::Options opt;
+	opt.db = &db;
+	opt.self = true;
+	SparseMatrixStream<float> ms(db.ref_header.sequences, false);
+	opt.consumer = &ms;
+	opt.db_filter = nullptr;
+	Workflow::Search::run(opt);
+
+	SparseSimpleMatrix<float>* m = ms.getMatrix();
+
+	auto clusterResult = markov_process(m);
+	cout << "Found "<<clusterResult.size()<<" clusters"<<endl;
+}
+
 void run() {
 	if (config.database == "")
 		throw runtime_error("Missing parameter: database file (--db/-d)");
@@ -253,6 +479,9 @@ void run() {
 		case Config::transitive_closure:
 			run_transitive_closure_clustering(*db);
 			break;
+		case Config::markov_clustering:
+			run_markov_clustering(*db);
+			break;
 	}
 	db->close();
 }
diff --git a/src/run/sparse_matrix.h b/src/run/sparse_matrix.h
new file mode 100644
index 00000000..d9502996
--- /dev/null
+++ b/src/run/sparse_matrix.h
@@ -0,0 +1,498 @@
+#pragma once
+#include <unordered_map>
+#include <vector>
+
+template<typename T>
+class SimpleMatrix {
+public:
+	virtual T get_max_elm() = 0;
+	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx) = 0;
+	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element) = 0;
+	virtual T& at(uint32_t i_idx, uint32_t j_idx) = 0;
+	virtual T& at(uint64_t c_idx) = 0;
+	virtual inline uint64_t get_nrows() = 0;
+	virtual inline uint64_t get_ncols() = 0;
+	virtual SimpleMatrix<T>* get_new_instance(int nrows, int ncols) = 0;
+	virtual SimpleMatrix<T>* get_new_instance() = 0;
+	virtual void print(){
+		for( uint32_t irow=0; irow < get_nrows(); irow++){
+			for( uint32_t icol=0; icol < get_ncols(); icol++){
+				printf(" %.6f",get_elm(irow,icol));
+			}
+			printf("\n");
+		}
+	}
+
+	virtual inline uint64_t get_idx(uint32_t i_idx, uint32_t j_idx){
+		assert(i_idx < get_nrows());
+		assert(j_idx < get_ncols());
+		return i_idx * get_ncols() + j_idx;
+	}
+
+	virtual inline void get_indices (uint64_t combined, uint32_t* i, uint32_t* j){
+		*j = combined % get_ncols();
+		*i = ( combined - *j) / get_ncols();
+	}
+
+	virtual SimpleMatrix<T>* minus(SimpleMatrix<T>* m) = 0;
+	virtual SimpleMatrix<T>* base_minus(SimpleMatrix<T>* m){
+		assert(this->get_nrows() == m->get_nrows());
+		assert(this->get_ncols() == m->get_ncols());
+		SimpleMatrix<T>* result = this->get_new_instance(this->get_nrows(),m->get_ncols());
+		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
+			for(uint32_t icol = 0; icol<m->get_ncols(); icol++){
+				result->set_elm(irow, icol, this->get_elm(irow, icol) - m->get_elm(irow,icol));
+			}
+		}
+		return result;
+	}
+	virtual SimpleMatrix<T>* plus(SimpleMatrix<T>* m) = 0;
+	virtual SimpleMatrix<T>* base_plus(SimpleMatrix<T>* m){
+		assert(this->get_nrows() == m->get_nrows());
+		assert(this->get_ncols() == m->get_ncols());
+		SimpleMatrix<T>* result = this->get_new_instance(this->get_nrows(),m->get_ncols());
+		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
+			for(uint32_t icol = 0; icol<m->get_ncols(); icol++){
+				result->set_elm(irow, icol, this->get_elm(irow, icol) + m->get_elm(irow, icol));
+			}
+		}
+		return result;
+	}
+	virtual SimpleMatrix<T>* multiply(SimpleMatrix<T>* m) = 0;
+	virtual SimpleMatrix<T>* base_multiply(SimpleMatrix<T>* m){
+		printf("standard mult\n");
+		SimpleMatrix<T>* result = this->get_new_instance(this->get_nrows(),m->get_ncols());
+		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
+			for(uint32_t icol = 0; icol<m->get_ncols(); icol++){
+				result->set_elm(irow, icol, 0);
+			}
+		}
+		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
+			for(uint32_t icol = 0; icol<m->get_ncols(); icol++){
+				for(uint32_t k = 0; k<m->get_nrows(); k++){
+					result->set_elm(irow, icol, result->get_elm(irow, icol) + this->get_elm(irow, k) * m->get_elm(k,icol));
+				}
+			}
+		}
+		return result;
+	}
+	virtual double norm(double p, double q){
+		double result = 0.0;
+		double exp1 = q/p;
+		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
+			double colnorm = 0.0;
+			for(uint32_t icol = 0; icol<this->get_ncols(); icol++){
+					colnorm += std::pow(std::abs(this->get_elm(irow, icol)), p);
+			}
+			result += std::pow(colnorm, exp1);
+		}
+		return std::pow(result, 1.0/q);
+	}
+
+	virtual ~SimpleMatrix() {};
+};
+
+template<typename T>
+class SymmetricSimpleMatrix : virtual public SimpleMatrix<T> {
+public:
+	static inline uint64_t get_lower_index (uint32_t i_idx, uint32_t j_idx){
+		uint32_t min = std::min(i_idx,j_idx);
+		uint32_t max = std::max(i_idx,j_idx);
+		return  max*((uint64_t) max+1)/2 + min;
+	}
+	static inline void get_symmetric_index (uint64_t combined, uint32_t* max, uint32_t* min){
+		// solving max^2-max-2*combined = 0. -1 to convert to indexing varaible in C, avoiding overflow
+		// by removing an additional factor of 2 from under the square root
+		*max = (uint32_t) std::floor(0.5+2*sqrt(0.0625+0.5*combined)) - 1;
+		*min = combined - (*max*((uint64_t)*max+1))/2;
+	}
+};
+
+template<typename T>
+class SparseSimpleMatrix : virtual public SimpleMatrix<T> {
+public:
+	virtual uint64_t get_n_nonzero_elements() = 0;
+	virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx) = 0;
+	virtual typename std::unordered_map<uint64_t, T>::iterator begin() = 0;
+	virtual typename std::unordered_map<uint64_t, T>::iterator end() = 0;
+};
+
+template<typename T>
+class SparseSimpleMatrixImpl : public SparseSimpleMatrix<T> {
+private:
+	uint32_t nrows, ncols;
+	T tol;
+	std::unordered_map<uint64_t,T> mat;
+
+public:
+	SparseSimpleMatrixImpl(int nrows, int ncols) : nrows(nrows), ncols(ncols), tol((T) 1e-12) {};
+	SparseSimpleMatrixImpl(int nrows, int ncols, T tol) : nrows(nrows), ncols(ncols), tol(std::abs(tol)) {};
+
+	virtual SimpleMatrix<T>*  get_new_instance(int nrows, int ncols){
+		return new SparseSimpleMatrixImpl<T>(nrows, ncols);
+	}
+	virtual SimpleMatrix<T>*  get_new_instance(){
+		return new SparseSimpleMatrixImpl<T>(nrows, ncols);
+	}
+
+	virtual ~SparseSimpleMatrixImpl(){ mat.clear(); };
+
+	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){
+		assert(i_idx < nrows);
+		assert(j_idx < ncols);
+		if(std::abs(element) > tol) {
+			mat[SimpleMatrix<T>::get_idx(i_idx,j_idx)] = element;
+		}
+	}
+	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){
+		assert(i_idx < nrows);
+		assert(j_idx < ncols);
+		auto it = mat.find(SimpleMatrix<T>::get_idx(i_idx,j_idx));
+		return it != mat.end() ? it-> second : (T) 0.0;
+	}
+	virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx){
+		assert(i_idx < nrows);
+		assert(j_idx < ncols);
+		auto it = mat.find(SimpleMatrix<T>::get_idx(i_idx,j_idx));
+		return it != mat.end();
+	}
+	virtual inline uint64_t get_nrows(){
+		return nrows;
+	};
+	virtual inline uint64_t get_ncols(){
+		return ncols;
+	};
+	virtual inline uint64_t get_n_nonzero_elements(){
+		return mat.size();
+	};
+	virtual typename std::unordered_map<uint64_t,T>::iterator begin(){
+	 	return mat.begin();
+	};
+	virtual typename std::unordered_map<uint64_t,T>::iterator end(){
+		return mat.end();
+	};
+	virtual T& at(uint32_t i_idx, uint32_t j_idx){
+		return at(SimpleMatrix<T>::get_idx(i_idx,j_idx));
+	};
+	virtual T& at(uint64_t c_idx){
+		assert(c_idx < nrows*ncols);
+		return mat[c_idx];
+	};
+
+	void purge(){
+		auto it = mat.begin();
+		while(it!=mat.end()){
+			if(std::abs(it->second) < tol){
+				it = mat.erase(it);
+			}
+			else{
+				it++;
+			}
+		}
+	}
+	virtual T get_max_elm(){
+		T m = 0.0;
+		for(auto& it : mat){
+			m = std::max(m, it.second);
+		}
+		return m;
+	};
+	virtual SimpleMatrix<T>* minus(SimpleMatrix<T>* m){
+		if(SparseSimpleMatrix<T>* n = dynamic_cast<SparseSimpleMatrix<T>*> (m)){
+			// TODO: Imporove this implementation
+			SparseSimpleMatrix<T>* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols());
+
+			std::vector<std::set<uint32_t>> k_to_j(n->get_nrows());
+			for(auto it_m = n->begin(); it_m != n->end(); it_m++){
+				uint32_t irow,icol;
+				n->get_indices(it_m->first, &irow, &icol);
+				result->set_elm(irow, icol, this->get_elm(irow,icol) - it_m->second);
+			}
+			for(auto it_this = this->begin(); it_this != this->end(); it_this++){
+				uint32_t irow,icol;
+				this->get_indices(it_this->first, &irow, &icol);
+				if(!n->has_elm(irow,icol)) result->set_elm(irow, icol, it_this->second);
+			}
+			return result;
+		}
+		return this->base_minus(m);
+	}
+	virtual SimpleMatrix<T>* plus(SimpleMatrix<T>* m){
+		if(SparseSimpleMatrix<T>* n = dynamic_cast<SparseSimpleMatrix<T>*> (m)){
+			// TODO: Imporove this implementation
+			SparseSimpleMatrix<T>* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols());
+
+			std::vector<std::set<uint32_t>> k_to_j(n->get_nrows());
+			for(auto it_m = n->begin(); it_m != n->end(); it_m++){
+				uint32_t irow,icol;
+				n->get_indices(it_m->first, &irow, &icol);
+				result->set_elm(irow, icol, this->get_elm(irow,icol) + it_m->second);
+			}
+			for(auto it_this = this->begin(); it_this != this->end(); it_this++){
+				uint32_t irow,icol;
+				this->get_indices(it_this->first, &irow, &icol);
+				if(!n->has_elm(irow,icol)) result->set_elm(irow, icol, it_this->second);
+			}
+			return result;
+		}
+		return this->base_plus(m);
+	}
+	virtual SimpleMatrix<T>* multiply(SimpleMatrix<T>* m){
+		if(SparseSimpleMatrix<T>* n = dynamic_cast<SparseSimpleMatrix<T>*> (m)){
+			SparseSimpleMatrix<T>* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols());
+			std::vector<std::vector<uint32_t>> k_to_j(n->get_nrows());
+			for(uint32_t k = 0; k<n->get_nrows(); k++){
+				k_to_j[k] = std::vector<uint32_t>();
+			}
+			for(auto it_m = n->begin(); it_m != n->end(); it_m++){
+				uint32_t k2,icol;
+				n->get_indices(it_m->first, &k2, &icol);
+				k_to_j[k2].push_back(icol);
+			}
+
+			auto it_this = this->begin();
+			uint32_t ithis = 0;
+			while(it_this != this->end()){
+				uint32_t irow, k;
+				this->get_indices(it_this->first, &irow, &k);
+				for( uint32_t const & icol : k_to_j[k] ){
+					T el = result->get_elm(irow, icol);
+					result->set_elm(irow, icol, el + it_this->second * n->get_elm(k, icol));
+				}
+				it_this++;
+			}
+			return result;
+		}
+		return this->base_multiply(m);
+	}
+};
+
+template<typename T>
+class DenseSimpleMatrix : public SimpleMatrix<T> {
+private:
+	uint32_t nrows, ncols;
+	T* mat;
+public:
+	DenseSimpleMatrix(int nrows, int ncols) : nrows(nrows), ncols(ncols) {
+		mat = new T[nrows * ncols];
+	};
+	DenseSimpleMatrix(int nrows, int ncols, T init) : nrows(nrows), ncols(ncols) {
+		mat = new T[nrows * ncols];
+		for(uint64_t i = 0; i<nrows*ncols; i++){
+			mat[i] = init;
+		}
+	};
+	virtual ~DenseSimpleMatrix(){
+		delete [] mat;
+		mat = nullptr;
+	};
+	virtual SimpleMatrix<T>* get_new_instance(int nrows, int ncols){
+		return new DenseSimpleMatrix<T>(nrows, ncols);
+	}
+	virtual SimpleMatrix<T>* get_new_instance(){
+		return new DenseSimpleMatrix<T>(nrows, ncols);
+	}
+
+	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){
+		mat[SimpleMatrix<T>::get_idx(i_idx,j_idx)] = element;
+	}
+	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){
+		return mat[SimpleMatrix<T>::get_idx(i_idx,j_idx)];
+	}
+	virtual inline uint64_t get_nrows(){
+		return nrows;
+	};
+	virtual inline uint64_t get_ncols(){
+		return ncols;
+	};
+	virtual T& at(uint32_t i_idx, uint32_t j_idx){
+		return at(SimpleMatrix<T>::get_idx(i_idx,j_idx));
+	};
+	virtual T& at(uint64_t c_idx){
+		assert(c_idx < nrows * ncols);
+		return mat[c_idx];
+	};
+	virtual T get_max_elm(){
+		// TODO: implement me
+		return 0;//*std::max_element(mat,mat+n_max);
+	};
+
+	SimpleMatrix<T>* multiply(SimpleMatrix<T>* m){
+		if(SparseSimpleMatrix<T>* n = dynamic_cast<SparseSimpleMatrix<T>*> (m)){
+			DenseSimpleMatrix<T>* result = new DenseSimpleMatrix(this->get_nrows(),n->get_ncols());
+			for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
+				for(uint32_t icol = 0; icol<n->get_ncols(); icol++){
+					result->set_elm(irow, icol, 0);
+				}
+			}
+			for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
+				auto it = n->begin();
+				while(it!=n->end()){
+					uint32_t k,icol;
+					n->get_indices (it->first, &k, &icol);
+					result->at(irow, icol) += this->get_elm(irow, k) * it->second;
+					it++;
+				}
+			}
+			return result;
+		}
+		return this->base_multiply(m);
+	}
+};
+
+template<typename T>
+class SparseSymmetricSimpleMatrix : public SparseSimpleMatrix<T>, public SymmetricSimpleMatrix<T> {
+private:
+	uint32_t n_max;
+	T tol;
+	std::unordered_map<uint64_t,T> mat;
+
+public:
+	SparseSymmetricSimpleMatrix(int n) : n_max(n), tol((T) 1e-12) {};
+	SparseSymmetricSimpleMatrix(int n, T tol) : n_max(n), tol(std::abs(tol)) {};
+
+	virtual ~SparseSymmetricSimpleMatrix(){ mat.clear(); };
+	virtual SimpleMatrix<T>* get_new_instance(int nrows, int ncols){
+		return new SparseSymmetricSimpleMatrix<T>(nrows, tol);
+	}
+	virtual SimpleMatrix<T>* get_new_instance(){
+		return new SparseSymmetricSimpleMatrix<T>(n_max, tol);
+	}
+
+	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){
+		if( std::abs(element) > tol ) {
+			assert(i_idx < n_max);
+			assert(j_idx < n_max);
+			const uint64_t my_idx=this->get_lower_index(i_idx,j_idx);
+			mat[my_idx] = element;
+		}
+	}
+	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){
+		assert(i_idx < n_max);
+		assert(j_idx < n_max);
+		const uint64_t my_idx=this->get_lower_index(i_idx,j_idx);
+		auto it = mat.find(my_idx);
+		return it != mat.end() ? it-> second : (T) 0.0;
+	}
+	virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx){
+		assert(i_idx < n_max);
+		assert(j_idx < n_max);
+		const uint64_t my_idx=this->get_lower_index(i_idx,j_idx);
+		auto it = mat.find(my_idx);
+		return it != mat.end();
+	}
+	virtual inline uint64_t get_nrows(){
+		// TODO: this can be determined from mat (since symmetric) then the matrix can be unbounded!
+		return n_max;
+	};
+	virtual inline uint64_t get_ncols(){
+		// TODO: this can be determined from mat (since symmetric) then the matrix can be unbounded!
+		return n_max;
+	};
+	virtual inline uint64_t get_n_nonzero_elements(){
+		return mat.size();
+	};
+	virtual typename std::unordered_map<uint64_t,T>::iterator begin(){
+		return mat.begin();
+	};
+	virtual typename std::unordered_map<uint64_t,T>::iterator end(){
+		return mat.end();
+	};
+	virtual T& at(uint32_t i_idx, uint32_t j_idx){
+		assert(i_idx < n_max);
+		assert(j_idx < n_max);
+		const uint64_t c_idx=this->get_lower_index(i_idx,j_idx);
+		return at(c_idx);
+	};
+	virtual T& at(uint64_t c_idx){
+		assert(c_idx < n_max*(n_max+1)/2);
+		return mat[c_idx];
+	};
+
+	void purge(){
+		auto it = mat.begin();
+		while(it!=mat.end()){
+			if (std::abs(it->second) < tol ){
+				it = mat.erase(it);
+			}
+			else{
+				it++;
+			}
+		}
+	}
+
+	virtual T get_max_elm(){
+		T m = 0.0;
+		for ( auto& it : mat ) {
+			m = std::max(m,it.second);
+		}
+		return m;
+	};
+
+	void resize(uint32_t new_n_max){
+		n_max = new_n_max;
+	}
+	// TODO: implement valid matrix ops
+	virtual SimpleMatrix<T>* minus(SimpleMatrix<T>* m){
+		return this->base_minus(m);
+	}
+	virtual SimpleMatrix<T>* plus(SimpleMatrix<T>* m){
+		return this->base_plus(m);
+	}
+	virtual SimpleMatrix<T>* multiply(SimpleMatrix<T>* m){
+		return this->base_multiply(m);
+	}
+};
+
+template<typename T>
+class DenseSymmetricSimpleMatrix : public SymmetricSimpleMatrix<T> {
+private:
+	uint32_t n_max;
+	T* mat;
+public:
+	DenseSymmetricSimpleMatrix(int n) : n_max(n) {
+		mat = new T[(n_max*(n_max+1))/2];
+	};
+	virtual ~DenseSymmetricSimpleMatrix(){
+		delete [] mat;
+		mat = nullptr;
+	};
+	virtual SimpleMatrix<T>* get_new_instance(int nrows, int ncols){
+		return new DenseSymmetricSimpleMatrix<T>(nrows * ncols);
+	}
+	virtual SimpleMatrix<T>* get_new_instance(){
+		return new DenseSymmetricSimpleMatrix<T>(n_max);
+	}
+
+	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){
+		assert(i_idx < n_max);
+		assert(j_idx < n_max);
+		const uint64_t my_idx=SymmetricSimpleMatrix<T>::get_lower_index(i_idx,j_idx);
+		mat[my_idx] = element;
+	}
+	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){
+		assert(i_idx < n_max);
+		assert(j_idx < n_max);
+		const uint64_t my_idx=SymmetricSimpleMatrix<T>::get_lower_index(i_idx,j_idx);
+		return mat[my_idx];
+	}
+	virtual inline uint64_t get_nrows(){
+		return n_max;
+	};
+	virtual inline uint64_t get_ncols(){
+		return n_max;
+	};
+	virtual T& at(uint32_t i_idx, uint32_t j_idx){
+		assert(i_idx < n_max);
+		assert(j_idx < n_max);
+		const uint64_t c_idx=SymmetricSimpleMatrix<T>::get_lower_index(i_idx,j_idx);
+		return at(c_idx);
+	};
+	virtual T& at(uint64_t c_idx){
+		assert(c_idx < n_max*(n_max+1)/2);
+		return mat[c_idx];
+	};
+	virtual T get_max_elm(){
+		return *std::max_element(mat,mat+n_max);
+	};
+};

From 8e18435090f6353d8f24808e452f53cb48e20225 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sat, 18 Apr 2020 10:32:53 +0200
Subject: [PATCH 03/21] reimplementation of MCL using Eigen

I have completely removed my own sparse matrix implementation and
replaced it by the Eigen implementation which should also improve the
speed in general
---
 src/basic/config.h      |   2 +-
 src/run/cluster.cpp     | 360 +++++++++++++++++------------
 src/run/sparse_matrix.h | 498 ----------------------------------------
 3 files changed, 209 insertions(+), 651 deletions(-)
 delete mode 100644 src/run/sparse_matrix.h

diff --git a/src/basic/config.h b/src/basic/config.h
index 69023b07..b059a6a6 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -217,7 +217,7 @@ struct Config
 	enum { double_indexed = 0, query_indexed = 1, subject_indexed = 2 };
 	int algo;
 
-	enum { multi_step = 0, transitive_closure = 1};
+	enum { multi_step = 0, transitive_closure = 1, markov_clustering = 2};
 	int cluster_algo;
 
 	enum { query_parallel = 0, target_parallel = 1 };
diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp
index f3ab09c7..49c2cd24 100644
--- a/src/run/cluster.cpp
+++ b/src/run/cluster.cpp
@@ -16,6 +16,7 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
+#include <Eigen/Sparse>
 #include <algorithm>
 #include <stdexcept>
 #include <string>
@@ -29,7 +30,6 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../data/reference.h"
 #include "workflow.h"
 #include "disjoint_set.h"
-#include "sparse_matrix.h"
 #include "../util/io/consumer.h"
 #include "../util/algo/algo.h"
 #include "../basic/statistics.h"
@@ -90,43 +90,72 @@ class NeighborStream : public Consumer {
 
 template <typename T>
 class SparseMatrixStream : public Consumer {
-	SparseSimpleMatrix<T>* m = nullptr;
+	size_t n;
+	vector<Eigen::Triplet<T>> data;
 	LazyDisjointSet<size_t>* disjointSet;
 	virtual void consume(const char *ptr, size_t n) override {
 		size_t query, subject, count;
-		float qcov, scov, bitscore;
+		float qcov, scov, bitscore, id;
 		const char *end = ptr + n;
 		while (ptr < end) {
 			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
-			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &count) != 5) 
+			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) 
 				throw runtime_error("Cluster format error.");
-			m->set_elm(query, subject, bitscore);
+			data.emplace_back(query, subject, (qcov/100.0f) * (scov/100.0f) * (id/100.0f));
 			disjointSet->merge(query, subject);
 			ptr += count;
 		}
 	}
 public:
-	SparseMatrixStream(size_t n, bool symmetric){
+	SparseMatrixStream(size_t n){
+		this->n = n;
 		disjointSet = new LazyDisjointIntegralSet<size_t>(n); 
-		if( symmetric ){
-			m = new SparseSymmetricSimpleMatrix<T>(n);
-		}
-		else{
-			m = new SparseSimpleMatrixImpl<T>(n, n);
+		// Note: this makes sure the self hits are always included, the value needs to be aligned with the measure used in consume!
+		for(uint32_t idiag=0; idiag<n; idiag++){
+			data.emplace_back(idiag, idiag, 1.0);
 		}
 	}
 	~SparseMatrixStream(){
-		if(m) delete m;
 		delete disjointSet;
 	}
-	SparseSimpleMatrix<T>* getMatrix(){
-		cout << " getting matrix " << m->get_nrows() << " " << m->get_ncols() << " " << m->get_n_nonzero_elements() << " sparsity "<< (1.0 * m->get_n_nonzero_elements()) / (m->get_nrows()* m->get_ncols()) << endl;
-		SparseSimpleMatrix<T>* h = m;
-		m = nullptr;
-		return h;
+	pair<vector<vector<uint32_t>>, vector<Eigen::SparseMatrix<T>>> getComponents(){
+		vector<unordered_set<size_t>> sets = disjointSet->getListOfSets();
+		vector<vector<Eigen::Triplet<T>>> split(sets.size());
+		for(uint32_t iset = 0; iset < sets.size(); iset++){
+			split.push_back(vector<Eigen::Triplet<T>>());
+		}
+		for(Eigen::Triplet<T> d : data){
+			for(uint32_t iset = 0; iset < sets.size(); iset++){
+				if(sets[iset].count(d.col()) != 0){
+					split[iset].emplace_back(d.row(), d.col(), d.value());
+					break;
+				}
+			}
+		}
+		vector<vector<uint32_t>> indices(sets.size());
+		vector<Eigen::SparseMatrix<T>> components(sets.size());
+		for(uint32_t iset = 0; iset < sets.size(); iset++){
+			Eigen::SparseMatrix<T> component(sets[iset].size(), sets[iset].size());
+			vector<uint32_t> order(sets[iset].begin(), sets[iset].end());
+			map<uint32_t, uint32_t> index_map;
+			uint32_t iel = 0;
+			for(uint32_t el: order){
+				index_map.emplace(el, iel++);
+			}
+			vector<Eigen::Triplet<T>> remapped(split[iset].size());
+			for(Eigen::Triplet<T> t : split[iset]){
+				remapped.emplace_back(index_map[t.row()], index_map[t.col()], t.value());
+			}
+			component.setFromTriplets(remapped.begin(), remapped.end());
+			components.emplace_back(move(component));
+			indices.emplace_back(move(order));
+		}
+		return std::make_pair(indices, components);
 	}
-	vector<unordered_set<size_t>> getListOfSets(){
-		return disjointSet->getListOfSets();
+	Eigen::SparseMatrix<T> getMatrix(){
+		Eigen::SparseMatrix<T> m(n,n);
+		m.setFromTriplets(data.begin(), data.end());
+		return m;
 	}
 };
 
@@ -282,152 +311,132 @@ void run_transitive_closure_clustering(DatabaseFile& db){
 	// }
 }
 
-SparseSimpleMatrix<float>* get_gamma(SparseSimpleMatrix<float>* m, uint32_t r){
-	SparseSimpleMatrix<float>* gamma = new SparseSimpleMatrixImpl<float>(m->get_nrows(), m->get_ncols());
-	float* column_sum = new float[m->get_ncols()];
-	for(uint32_t icol = 0; icol< m->get_ncols(); icol++){
-		column_sum[icol] = 0.0f;
-	}
-	for(auto it = m->begin(); it != m->end(); it++){
-		uint32_t i,j;
-		m->get_indices(it->first, &i, &j);
-		float el = pow(it->second, r);
-		gamma->set_elm(i,j,el);
-		column_sum[j] += el;
-	}
-	for(auto it = gamma->begin(); it != gamma->end(); it++){
-		uint32_t i,j;
-		gamma->get_indices(it->first, &i, &j);
-		gamma->set_elm(i,j, it->second/column_sum[j]);
-	}
-	return gamma;
-}
-SimpleMatrix<float>* get_gamma(SimpleMatrix<float>* m, uint32_t r){
-	SimpleMatrix<float>* gamma = new SparseSimpleMatrixImpl<float>(m->get_nrows(), m->get_ncols());
-	float* column_sum = new float[m->get_ncols()];
-	for(uint32_t icol = 0; icol< m->get_ncols(); icol++){
-		column_sum[icol] = 0.0f;
-	}
-	for(uint32_t irow = 0; irow < m->get_nrows(); irow++){
-		for(uint32_t icol = 0; icol < m->get_ncols(); icol++){
-			column_sum[icol]+=pow(m->get_elm(irow, icol), r);
+Eigen::SparseMatrix<float> get_gamma(Eigen::SparseMatrix<float>* m, float r){
+	vector<Eigen::Triplet<float>> data;
+	for (uint32_t k=0; k<m->outerSize(); ++k){
+		float colSum = 0.0f;
+		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+			colSum += pow(it.value(),r);
 		}
-	}
-	for(uint32_t irow = 0; irow < m->get_nrows(); irow++){
-		for(uint32_t icol = 0; icol < m->get_ncols(); icol++){
-			gamma->set_elm(irow, icol, pow(m->get_elm(irow, icol),r) / column_sum[icol]);
+		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+			data.emplace_back(it.row(), it.col(), pow(it.value(),r) / colSum);
 		}
 	}
-	return gamma;
+	Eigen::SparseMatrix<float> gamma(m->rows(), m->cols());
+	gamma.setFromTriplets(data.begin(), data.end());
+	return gamma.pruned(1.0, std::numeric_limits<float>::epsilon());
 }
 
-SimpleMatrix<float>* get_exp(SimpleMatrix<float>* m, uint32_t r){
-	// TODO: make this more efficient
-	SimpleMatrix<float>* tmp1 = m->multiply(m);
-	for(uint32_t i = 2; i<r; i++){
-		SimpleMatrix<float>* tmp2 = tmp1->multiply(tmp1);
-		delete tmp1;
-		tmp1 = tmp2;
+vector<unordered_set<uint32_t>> get_list(Eigen::SparseMatrix<float>* m){
+	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
+	for (uint32_t k=0; k<m->outerSize(); ++k){
+		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+			disjointSet.merge(it.row(), it.col());
+		}
 	}
-	return tmp1;
+	return disjointSet.getListOfSets();
 }
-vector<unordered_set<uint32_t>> get_list(SimpleMatrix<float>* m){
-	LazyDisjointIntegralSet<uint32_t> disjointSet(m->get_ncols());
-	for(uint32_t irow = 0; irow < m->get_nrows(); irow++){
-		for(uint32_t icol = 0; icol < m->get_ncols(); icol++){
-			if(abs(m->get_elm(irow, icol)) > 1e-13){
-				disjointSet.merge(irow, icol);
+
+void print(Eigen::SparseMatrix<float>* m, bool sparse){
+	if(sparse){
+		for (uint32_t k=0; k<m->outerSize(); ++k){
+			for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+				printf("%lu %lu %16.14f\n", it.row(), it.col(), it.value());
 			}
 		}
 	}
-	return disjointSet.getListOfSets();
-}
-vector<unordered_set<uint32_t>> get_list(SparseSimpleMatrix<float>* m){
-	LazyDisjointIntegralSet<uint32_t> disjointSet(m->get_ncols());
-	for(auto it = m->begin(); it != m->end(); it++){
-		uint32_t irow, icol;
-		m->get_indices(it->first, &irow, &icol);
-		disjointSet.merge(irow, icol);
+	else{
+		for(uint32_t irow = 0; irow < m->rows(); irow++){
+			for(uint32_t icol = 0; icol < m->cols(); icol++){
+				printf(" %8.4f", m->coeffRef(irow, icol));
+			}
+			printf("\n");
+		}
 	}
-	return disjointSet.getListOfSets();
+		printf("\n");
 }
 
-
-vector<unordered_set<uint32_t>> markov_process(SimpleMatrix<float>* m){
+vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m){
 	uint32_t iteration = 0;
 	double diff_norm = std::numeric_limits<double>::max();
-	while( iteration < 100 && diff_norm > 1e-3 ){
-		SimpleMatrix<float>* msquared = get_exp(m, 2);
-		SimpleMatrix<float>* m_update = get_gamma(msquared, 2);
-		delete msquared;
-		SimpleMatrix<float>* diff_m = m_update->minus(m);
-		diff_norm = diff_m->norm(2.0,2.0);
-		delete m;
-		delete diff_m;
-		m = m_update;
-		m_update = nullptr;
+	// TODO: Add loops TODO: make sure self-hits are found and have the same unit as the off diagonal elements (bit score or other measure used) before calling this routine, then only check that diagonal elements exist
+	// for(uint32_t idiag = 0; idiag<m->get_nrows(); idiag++){
+	// 	float e = m->get_elm(idiag, idiag);
+	// 	if( abs(e) < 1e-13 ){
+	// 		//cout << "WARNING: did not find a self-hit for "<< idiag << endl;
+	// 		m->set_elm(idiag, idiag, 1.0);
+	// 	}
+	// }
+	//*m  = get_gamma(m,1);
+	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
+		Eigen::SparseMatrix<float> msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits<float>::epsilon());
+		Eigen::SparseMatrix<float> m_update = get_gamma(&msquared, 2);
+		diff_norm = (*m - m_update).norm();
+		*m = m_update.pruned(1.0, std::numeric_limits<float>::epsilon());
 		iteration++;
 	}
-	cout<< "Markov Process converged after " << iteration <<" iterations " << diff_norm << endl;
+	//cout<< "Markov Process converged after " << iteration <<" iterations " << diff_norm << endl;
 	return get_list(m);
 }
 
 void run_markov_clustering(DatabaseFile& db){
 	// testing
-	// SparseSimpleMatrix<float>* tmp = new SparseSimpleMatrixImpl<float>(12, 12);
-	// tmp->set_elm(0, 0, 0.2f);
-	// tmp->set_elm(0, 1, 0.25f);
-	// tmp->set_elm(0, 5, 0.333f);
-	// tmp->set_elm(0, 6, 0.25f);
-	// tmp->set_elm(0, 9, 0.25f);
-	// tmp->set_elm(1, 0, 0.20f);
-	// tmp->set_elm(1, 1, 0.25f);
-	// tmp->set_elm(1, 2, 0.25f);
-	// tmp->set_elm(1, 4, 0.20f);
-	// tmp->set_elm(2, 1, 0.25f);
-	// tmp->set_elm(2, 2, 0.25f);
-	// tmp->set_elm(2, 3, 0.20f);
-	// tmp->set_elm(2, 4, 0.20f);
-	// tmp->set_elm(3, 2, 0.25f);
-	// tmp->set_elm(3, 3, 0.20f);
-	// tmp->set_elm(3, 7, 0.20f);
-	// tmp->set_elm(3, 8, 0.20f);
-	// tmp->set_elm(3, 10, 0.20f);
-	// tmp->set_elm(4, 1, 0.25f);
-	// tmp->set_elm(4, 2, 0.25f);
-	// tmp->set_elm(4, 4, 0.20f);
-	// tmp->set_elm(4, 6, 0.25f);
-	// tmp->set_elm(4, 7, 0.20f);
-	// tmp->set_elm(5, 0, 0.20f);
-	// tmp->set_elm(5, 5, 0.333f);
-	// tmp->set_elm(5, 9, 0.25f);
-	// tmp->set_elm(6, 0, 0.20f);
-	// tmp->set_elm(6, 4, 0.20f);
-	// tmp->set_elm(6, 6, 0.25f);
-	// tmp->set_elm(6, 9, 0.25f);
-	// tmp->set_elm(7, 3, 0.20f);
-	// tmp->set_elm(7, 4, 0.20f);
-	// tmp->set_elm(7, 7, 0.20f);
-	// tmp->set_elm(7, 8, 0.20f);
-	// tmp->set_elm(7, 10, 0.20f);
-	// tmp->set_elm(8, 3, 0.20f);
-	// tmp->set_elm(8, 7, 0.20f);
-	// tmp->set_elm(8, 8, 0.20f);
-	// tmp->set_elm(8, 10, 0.20f);
-	// tmp->set_elm(8, 11, 0.333f);
-	// tmp->set_elm(9, 0, 0.20f);
-	// tmp->set_elm(9, 5, 0.333f);
-	// tmp->set_elm(9, 6, 0.25f);
-	// tmp->set_elm(9, 9, 0.25f);
-	// tmp->set_elm(10, 3, 0.20f);
-	// tmp->set_elm(10, 7, 0.20f);
-	// tmp->set_elm(10, 8, 0.20f);
-	// tmp->set_elm(10, 10, 0.20f);
-	// tmp->set_elm(10, 11, 0.333f);
-	// tmp->set_elm(11, 8, 0.20f);
-	// tmp->set_elm(11, 10, 0.20f);
-	// tmp->set_elm(11, 11, 0.333f);
-	// auto cr = markov_process(tmp);
+	// vector<Eigen::Triplet<float>> data;
+	// data.emplace_back(0, 0, 0.2f);
+	// data.emplace_back(0, 1, 0.25f);
+	// data.emplace_back(0, 5, 0.333f);
+	// data.emplace_back(0, 6, 0.25f);
+	// data.emplace_back(0, 9, 0.25f);
+	// data.emplace_back(1, 0, 0.20f);
+	// data.emplace_back(1, 1, 0.25f);
+	// data.emplace_back(1, 2, 0.25f);
+	// data.emplace_back(1, 4, 0.20f);
+	// data.emplace_back(2, 1, 0.25f);
+	// data.emplace_back(2, 2, 0.25f);
+	// data.emplace_back(2, 3, 0.20f);
+	// data.emplace_back(2, 4, 0.20f);
+	// data.emplace_back(3, 2, 0.25f);
+	// data.emplace_back(3, 3, 0.20f);
+	// data.emplace_back(3, 7, 0.20f);
+	// data.emplace_back(3, 8, 0.20f);
+	// data.emplace_back(3, 10, 0.20f);
+	// data.emplace_back(4, 1, 0.25f);
+	// data.emplace_back(4, 2, 0.25f);
+	// data.emplace_back(4, 4, 0.20f);
+	// data.emplace_back(4, 6, 0.25f);
+	// data.emplace_back(4, 7, 0.20f);
+	// data.emplace_back(5, 0, 0.20f);
+	// data.emplace_back(5, 5, 0.333f);
+	// data.emplace_back(5, 9, 0.25f);
+	// data.emplace_back(6, 0, 0.20f);
+	// data.emplace_back(6, 4, 0.20f);
+	// data.emplace_back(6, 6, 0.25f);
+	// data.emplace_back(6, 9, 0.25f);
+	// data.emplace_back(7, 3, 0.20f);
+	// data.emplace_back(7, 4, 0.20f);
+	// data.emplace_back(7, 7, 0.20f);
+	// data.emplace_back(7, 8, 0.20f);
+	// data.emplace_back(7, 10, 0.20f);
+	// data.emplace_back(8, 3, 0.20f);
+	// data.emplace_back(8, 7, 0.20f);
+	// data.emplace_back(8, 8, 0.20f);
+	// data.emplace_back(8, 10, 0.20f);
+	// data.emplace_back(8, 11, 0.333f);
+	// data.emplace_back(9, 0, 0.20f);
+	// data.emplace_back(9, 5, 0.333f);
+	// data.emplace_back(9, 6, 0.25f);
+	// data.emplace_back(9, 9, 0.25f);
+	// data.emplace_back(10, 3, 0.20f);
+	// data.emplace_back(10, 7, 0.20f);
+	// data.emplace_back(10, 8, 0.20f);
+	// data.emplace_back(10, 10, 0.20f);
+	// data.emplace_back(10, 11, 0.333f);
+	// data.emplace_back(11, 8, 0.20f);
+	// data.emplace_back(11, 10, 0.20f);
+	// data.emplace_back(11, 11, 0.333f);
+	// Eigen::SparseMatrix<float> tmp(12,12);
+	// tmp.setFromTriplets(data.begin(), data.end());
+	// auto cr = markov_process(&tmp);
 	// cout << "Found "<<cr.size()<<" clusters"<<endl;
 	// for(auto& set: cr){
 	// 	cout<< "set :";
@@ -436,16 +445,13 @@ void run_markov_clustering(DatabaseFile& db){
 	// 	}
 	// 	cout<<endl;
 	// }
-	// delete tmp;
 	// return;
 
-
 	const size_t seq_count = db.ref_header.sequences;
 	statistics.reset();
 	config.command = Config::blastp;
 	config.no_self_hits = true;
-	//config.output_format = { "6", "qnum", "snum" };
-	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore" };
+	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore", "pident" };
 	config.query_cover = 80;
 	config.subject_cover = 80;
 	config.algo = 0;
@@ -456,15 +462,65 @@ void run_markov_clustering(DatabaseFile& db){
 	Workflow::Search::Options opt;
 	opt.db = &db;
 	opt.self = true;
-	SparseMatrixStream<float> ms(db.ref_header.sequences, false);
+	SparseMatrixStream<float> ms(db.ref_header.sequences);
 	opt.consumer = &ms;
 	opt.db_filter = nullptr;
 	Workflow::Search::run(opt);
 
-	SparseSimpleMatrix<float>* m = ms.getMatrix();
-
-	auto clusterResult = markov_process(m);
-	cout << "Found "<<clusterResult.size()<<" clusters"<<endl;
+	auto componentInfo = ms.getComponents();
+	vector<vector<uint32_t>> indices = get<0>(componentInfo);
+	vector<Eigen::SparseMatrix<float>> components = get<1>(componentInfo);
+	uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 0;});
+	uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 1;});
+
+	cout << "DIAMOND done" << endl;
+	cout << "************" << endl;
+	cout << "Found " << nComponentsLt1 <<" ("<<nComponents<< " incl. singletons) disconnected components" << endl;
+
+	vector<unordered_set<uint32_t>> clustering_result;
+	float max_sparsity = 0.0f;
+	float min_sparsity = 1.0f;
+	bool full = false;
+	if( full ){
+		// TODO: According to the SIAM publication this is not valid, just for debugging
+		Eigen::SparseMatrix<float> m = ms.getMatrix();
+		for(unordered_set<uint32_t> subset : markov_process(&m)){
+			clustering_result.emplace_back(subset);
+		}
+	}
+	else {
+		// TODO: check if using dense matrices for non-sparse components improves performance
+		for(uint32_t iComponent = 0; iComponent<indices.size(); iComponent++){
+			vector<uint32_t> order = indices[iComponent];
+			if(order.size() > 1){
+				Eigen::SparseMatrix<float> m = components[iComponent];
+				float sparsity = 1.0-(1.0 * m.nonZeros()) / (m.rows()*m.cols());
+				max_sparsity = max(max_sparsity, sparsity);
+				min_sparsity = min(min_sparsity, sparsity);
+				// map back to original ids
+				for(unordered_set<uint32_t> subset : markov_process(&m)){
+					unordered_set<uint32_t> s;
+					for(uint32_t el : subset){
+						s.emplace(order[el]);
+					}
+					clustering_result.emplace_back(s);
+				}
+			}
+			else if (order.size() == 1){
+				clustering_result.emplace_back(order.begin(), order.end());
+			}
+		}
+	}
+	uint32_t nClusters = clustering_result.size();
+	uint32_t nClustersLt1 = count_if(clustering_result.begin(), clustering_result.end(), [](unordered_set<uint32_t> v){ return v.size() > 1;});
+	cout << "Found "<<nClustersLt1<< " ("<< nClusters<<" incl. singletons) clusters with min sparsity "<<min_sparsity<< " and max. sparsity " << max_sparsity << endl;
+	// for(auto& set: clustering_result){
+	// 	cout<< "set :";
+	// 	for(auto item : set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
 }
 
 void run() {
diff --git a/src/run/sparse_matrix.h b/src/run/sparse_matrix.h
deleted file mode 100644
index d9502996..00000000
--- a/src/run/sparse_matrix.h
+++ /dev/null
@@ -1,498 +0,0 @@
-#pragma once
-#include <unordered_map>
-#include <vector>
-
-template<typename T>
-class SimpleMatrix {
-public:
-	virtual T get_max_elm() = 0;
-	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx) = 0;
-	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element) = 0;
-	virtual T& at(uint32_t i_idx, uint32_t j_idx) = 0;
-	virtual T& at(uint64_t c_idx) = 0;
-	virtual inline uint64_t get_nrows() = 0;
-	virtual inline uint64_t get_ncols() = 0;
-	virtual SimpleMatrix<T>* get_new_instance(int nrows, int ncols) = 0;
-	virtual SimpleMatrix<T>* get_new_instance() = 0;
-	virtual void print(){
-		for( uint32_t irow=0; irow < get_nrows(); irow++){
-			for( uint32_t icol=0; icol < get_ncols(); icol++){
-				printf(" %.6f",get_elm(irow,icol));
-			}
-			printf("\n");
-		}
-	}
-
-	virtual inline uint64_t get_idx(uint32_t i_idx, uint32_t j_idx){
-		assert(i_idx < get_nrows());
-		assert(j_idx < get_ncols());
-		return i_idx * get_ncols() + j_idx;
-	}
-
-	virtual inline void get_indices (uint64_t combined, uint32_t* i, uint32_t* j){
-		*j = combined % get_ncols();
-		*i = ( combined - *j) / get_ncols();
-	}
-
-	virtual SimpleMatrix<T>* minus(SimpleMatrix<T>* m) = 0;
-	virtual SimpleMatrix<T>* base_minus(SimpleMatrix<T>* m){
-		assert(this->get_nrows() == m->get_nrows());
-		assert(this->get_ncols() == m->get_ncols());
-		SimpleMatrix<T>* result = this->get_new_instance(this->get_nrows(),m->get_ncols());
-		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
-			for(uint32_t icol = 0; icol<m->get_ncols(); icol++){
-				result->set_elm(irow, icol, this->get_elm(irow, icol) - m->get_elm(irow,icol));
-			}
-		}
-		return result;
-	}
-	virtual SimpleMatrix<T>* plus(SimpleMatrix<T>* m) = 0;
-	virtual SimpleMatrix<T>* base_plus(SimpleMatrix<T>* m){
-		assert(this->get_nrows() == m->get_nrows());
-		assert(this->get_ncols() == m->get_ncols());
-		SimpleMatrix<T>* result = this->get_new_instance(this->get_nrows(),m->get_ncols());
-		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
-			for(uint32_t icol = 0; icol<m->get_ncols(); icol++){
-				result->set_elm(irow, icol, this->get_elm(irow, icol) + m->get_elm(irow, icol));
-			}
-		}
-		return result;
-	}
-	virtual SimpleMatrix<T>* multiply(SimpleMatrix<T>* m) = 0;
-	virtual SimpleMatrix<T>* base_multiply(SimpleMatrix<T>* m){
-		printf("standard mult\n");
-		SimpleMatrix<T>* result = this->get_new_instance(this->get_nrows(),m->get_ncols());
-		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
-			for(uint32_t icol = 0; icol<m->get_ncols(); icol++){
-				result->set_elm(irow, icol, 0);
-			}
-		}
-		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
-			for(uint32_t icol = 0; icol<m->get_ncols(); icol++){
-				for(uint32_t k = 0; k<m->get_nrows(); k++){
-					result->set_elm(irow, icol, result->get_elm(irow, icol) + this->get_elm(irow, k) * m->get_elm(k,icol));
-				}
-			}
-		}
-		return result;
-	}
-	virtual double norm(double p, double q){
-		double result = 0.0;
-		double exp1 = q/p;
-		for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
-			double colnorm = 0.0;
-			for(uint32_t icol = 0; icol<this->get_ncols(); icol++){
-					colnorm += std::pow(std::abs(this->get_elm(irow, icol)), p);
-			}
-			result += std::pow(colnorm, exp1);
-		}
-		return std::pow(result, 1.0/q);
-	}
-
-	virtual ~SimpleMatrix() {};
-};
-
-template<typename T>
-class SymmetricSimpleMatrix : virtual public SimpleMatrix<T> {
-public:
-	static inline uint64_t get_lower_index (uint32_t i_idx, uint32_t j_idx){
-		uint32_t min = std::min(i_idx,j_idx);
-		uint32_t max = std::max(i_idx,j_idx);
-		return  max*((uint64_t) max+1)/2 + min;
-	}
-	static inline void get_symmetric_index (uint64_t combined, uint32_t* max, uint32_t* min){
-		// solving max^2-max-2*combined = 0. -1 to convert to indexing varaible in C, avoiding overflow
-		// by removing an additional factor of 2 from under the square root
-		*max = (uint32_t) std::floor(0.5+2*sqrt(0.0625+0.5*combined)) - 1;
-		*min = combined - (*max*((uint64_t)*max+1))/2;
-	}
-};
-
-template<typename T>
-class SparseSimpleMatrix : virtual public SimpleMatrix<T> {
-public:
-	virtual uint64_t get_n_nonzero_elements() = 0;
-	virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx) = 0;
-	virtual typename std::unordered_map<uint64_t, T>::iterator begin() = 0;
-	virtual typename std::unordered_map<uint64_t, T>::iterator end() = 0;
-};
-
-template<typename T>
-class SparseSimpleMatrixImpl : public SparseSimpleMatrix<T> {
-private:
-	uint32_t nrows, ncols;
-	T tol;
-	std::unordered_map<uint64_t,T> mat;
-
-public:
-	SparseSimpleMatrixImpl(int nrows, int ncols) : nrows(nrows), ncols(ncols), tol((T) 1e-12) {};
-	SparseSimpleMatrixImpl(int nrows, int ncols, T tol) : nrows(nrows), ncols(ncols), tol(std::abs(tol)) {};
-
-	virtual SimpleMatrix<T>*  get_new_instance(int nrows, int ncols){
-		return new SparseSimpleMatrixImpl<T>(nrows, ncols);
-	}
-	virtual SimpleMatrix<T>*  get_new_instance(){
-		return new SparseSimpleMatrixImpl<T>(nrows, ncols);
-	}
-
-	virtual ~SparseSimpleMatrixImpl(){ mat.clear(); };
-
-	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){
-		assert(i_idx < nrows);
-		assert(j_idx < ncols);
-		if(std::abs(element) > tol) {
-			mat[SimpleMatrix<T>::get_idx(i_idx,j_idx)] = element;
-		}
-	}
-	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){
-		assert(i_idx < nrows);
-		assert(j_idx < ncols);
-		auto it = mat.find(SimpleMatrix<T>::get_idx(i_idx,j_idx));
-		return it != mat.end() ? it-> second : (T) 0.0;
-	}
-	virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx){
-		assert(i_idx < nrows);
-		assert(j_idx < ncols);
-		auto it = mat.find(SimpleMatrix<T>::get_idx(i_idx,j_idx));
-		return it != mat.end();
-	}
-	virtual inline uint64_t get_nrows(){
-		return nrows;
-	};
-	virtual inline uint64_t get_ncols(){
-		return ncols;
-	};
-	virtual inline uint64_t get_n_nonzero_elements(){
-		return mat.size();
-	};
-	virtual typename std::unordered_map<uint64_t,T>::iterator begin(){
-	 	return mat.begin();
-	};
-	virtual typename std::unordered_map<uint64_t,T>::iterator end(){
-		return mat.end();
-	};
-	virtual T& at(uint32_t i_idx, uint32_t j_idx){
-		return at(SimpleMatrix<T>::get_idx(i_idx,j_idx));
-	};
-	virtual T& at(uint64_t c_idx){
-		assert(c_idx < nrows*ncols);
-		return mat[c_idx];
-	};
-
-	void purge(){
-		auto it = mat.begin();
-		while(it!=mat.end()){
-			if(std::abs(it->second) < tol){
-				it = mat.erase(it);
-			}
-			else{
-				it++;
-			}
-		}
-	}
-	virtual T get_max_elm(){
-		T m = 0.0;
-		for(auto& it : mat){
-			m = std::max(m, it.second);
-		}
-		return m;
-	};
-	virtual SimpleMatrix<T>* minus(SimpleMatrix<T>* m){
-		if(SparseSimpleMatrix<T>* n = dynamic_cast<SparseSimpleMatrix<T>*> (m)){
-			// TODO: Imporove this implementation
-			SparseSimpleMatrix<T>* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols());
-
-			std::vector<std::set<uint32_t>> k_to_j(n->get_nrows());
-			for(auto it_m = n->begin(); it_m != n->end(); it_m++){
-				uint32_t irow,icol;
-				n->get_indices(it_m->first, &irow, &icol);
-				result->set_elm(irow, icol, this->get_elm(irow,icol) - it_m->second);
-			}
-			for(auto it_this = this->begin(); it_this != this->end(); it_this++){
-				uint32_t irow,icol;
-				this->get_indices(it_this->first, &irow, &icol);
-				if(!n->has_elm(irow,icol)) result->set_elm(irow, icol, it_this->second);
-			}
-			return result;
-		}
-		return this->base_minus(m);
-	}
-	virtual SimpleMatrix<T>* plus(SimpleMatrix<T>* m){
-		if(SparseSimpleMatrix<T>* n = dynamic_cast<SparseSimpleMatrix<T>*> (m)){
-			// TODO: Imporove this implementation
-			SparseSimpleMatrix<T>* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols());
-
-			std::vector<std::set<uint32_t>> k_to_j(n->get_nrows());
-			for(auto it_m = n->begin(); it_m != n->end(); it_m++){
-				uint32_t irow,icol;
-				n->get_indices(it_m->first, &irow, &icol);
-				result->set_elm(irow, icol, this->get_elm(irow,icol) + it_m->second);
-			}
-			for(auto it_this = this->begin(); it_this != this->end(); it_this++){
-				uint32_t irow,icol;
-				this->get_indices(it_this->first, &irow, &icol);
-				if(!n->has_elm(irow,icol)) result->set_elm(irow, icol, it_this->second);
-			}
-			return result;
-		}
-		return this->base_plus(m);
-	}
-	virtual SimpleMatrix<T>* multiply(SimpleMatrix<T>* m){
-		if(SparseSimpleMatrix<T>* n = dynamic_cast<SparseSimpleMatrix<T>*> (m)){
-			SparseSimpleMatrix<T>* result = new SparseSimpleMatrixImpl(this->get_nrows(),n->get_ncols());
-			std::vector<std::vector<uint32_t>> k_to_j(n->get_nrows());
-			for(uint32_t k = 0; k<n->get_nrows(); k++){
-				k_to_j[k] = std::vector<uint32_t>();
-			}
-			for(auto it_m = n->begin(); it_m != n->end(); it_m++){
-				uint32_t k2,icol;
-				n->get_indices(it_m->first, &k2, &icol);
-				k_to_j[k2].push_back(icol);
-			}
-
-			auto it_this = this->begin();
-			uint32_t ithis = 0;
-			while(it_this != this->end()){
-				uint32_t irow, k;
-				this->get_indices(it_this->first, &irow, &k);
-				for( uint32_t const & icol : k_to_j[k] ){
-					T el = result->get_elm(irow, icol);
-					result->set_elm(irow, icol, el + it_this->second * n->get_elm(k, icol));
-				}
-				it_this++;
-			}
-			return result;
-		}
-		return this->base_multiply(m);
-	}
-};
-
-template<typename T>
-class DenseSimpleMatrix : public SimpleMatrix<T> {
-private:
-	uint32_t nrows, ncols;
-	T* mat;
-public:
-	DenseSimpleMatrix(int nrows, int ncols) : nrows(nrows), ncols(ncols) {
-		mat = new T[nrows * ncols];
-	};
-	DenseSimpleMatrix(int nrows, int ncols, T init) : nrows(nrows), ncols(ncols) {
-		mat = new T[nrows * ncols];
-		for(uint64_t i = 0; i<nrows*ncols; i++){
-			mat[i] = init;
-		}
-	};
-	virtual ~DenseSimpleMatrix(){
-		delete [] mat;
-		mat = nullptr;
-	};
-	virtual SimpleMatrix<T>* get_new_instance(int nrows, int ncols){
-		return new DenseSimpleMatrix<T>(nrows, ncols);
-	}
-	virtual SimpleMatrix<T>* get_new_instance(){
-		return new DenseSimpleMatrix<T>(nrows, ncols);
-	}
-
-	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){
-		mat[SimpleMatrix<T>::get_idx(i_idx,j_idx)] = element;
-	}
-	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){
-		return mat[SimpleMatrix<T>::get_idx(i_idx,j_idx)];
-	}
-	virtual inline uint64_t get_nrows(){
-		return nrows;
-	};
-	virtual inline uint64_t get_ncols(){
-		return ncols;
-	};
-	virtual T& at(uint32_t i_idx, uint32_t j_idx){
-		return at(SimpleMatrix<T>::get_idx(i_idx,j_idx));
-	};
-	virtual T& at(uint64_t c_idx){
-		assert(c_idx < nrows * ncols);
-		return mat[c_idx];
-	};
-	virtual T get_max_elm(){
-		// TODO: implement me
-		return 0;//*std::max_element(mat,mat+n_max);
-	};
-
-	SimpleMatrix<T>* multiply(SimpleMatrix<T>* m){
-		if(SparseSimpleMatrix<T>* n = dynamic_cast<SparseSimpleMatrix<T>*> (m)){
-			DenseSimpleMatrix<T>* result = new DenseSimpleMatrix(this->get_nrows(),n->get_ncols());
-			for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
-				for(uint32_t icol = 0; icol<n->get_ncols(); icol++){
-					result->set_elm(irow, icol, 0);
-				}
-			}
-			for(uint32_t irow = 0; irow<this->get_nrows(); irow++){
-				auto it = n->begin();
-				while(it!=n->end()){
-					uint32_t k,icol;
-					n->get_indices (it->first, &k, &icol);
-					result->at(irow, icol) += this->get_elm(irow, k) * it->second;
-					it++;
-				}
-			}
-			return result;
-		}
-		return this->base_multiply(m);
-	}
-};
-
-template<typename T>
-class SparseSymmetricSimpleMatrix : public SparseSimpleMatrix<T>, public SymmetricSimpleMatrix<T> {
-private:
-	uint32_t n_max;
-	T tol;
-	std::unordered_map<uint64_t,T> mat;
-
-public:
-	SparseSymmetricSimpleMatrix(int n) : n_max(n), tol((T) 1e-12) {};
-	SparseSymmetricSimpleMatrix(int n, T tol) : n_max(n), tol(std::abs(tol)) {};
-
-	virtual ~SparseSymmetricSimpleMatrix(){ mat.clear(); };
-	virtual SimpleMatrix<T>* get_new_instance(int nrows, int ncols){
-		return new SparseSymmetricSimpleMatrix<T>(nrows, tol);
-	}
-	virtual SimpleMatrix<T>* get_new_instance(){
-		return new SparseSymmetricSimpleMatrix<T>(n_max, tol);
-	}
-
-	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){
-		if( std::abs(element) > tol ) {
-			assert(i_idx < n_max);
-			assert(j_idx < n_max);
-			const uint64_t my_idx=this->get_lower_index(i_idx,j_idx);
-			mat[my_idx] = element;
-		}
-	}
-	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){
-		assert(i_idx < n_max);
-		assert(j_idx < n_max);
-		const uint64_t my_idx=this->get_lower_index(i_idx,j_idx);
-		auto it = mat.find(my_idx);
-		return it != mat.end() ? it-> second : (T) 0.0;
-	}
-	virtual inline bool has_elm(uint32_t i_idx, uint32_t j_idx){
-		assert(i_idx < n_max);
-		assert(j_idx < n_max);
-		const uint64_t my_idx=this->get_lower_index(i_idx,j_idx);
-		auto it = mat.find(my_idx);
-		return it != mat.end();
-	}
-	virtual inline uint64_t get_nrows(){
-		// TODO: this can be determined from mat (since symmetric) then the matrix can be unbounded!
-		return n_max;
-	};
-	virtual inline uint64_t get_ncols(){
-		// TODO: this can be determined from mat (since symmetric) then the matrix can be unbounded!
-		return n_max;
-	};
-	virtual inline uint64_t get_n_nonzero_elements(){
-		return mat.size();
-	};
-	virtual typename std::unordered_map<uint64_t,T>::iterator begin(){
-		return mat.begin();
-	};
-	virtual typename std::unordered_map<uint64_t,T>::iterator end(){
-		return mat.end();
-	};
-	virtual T& at(uint32_t i_idx, uint32_t j_idx){
-		assert(i_idx < n_max);
-		assert(j_idx < n_max);
-		const uint64_t c_idx=this->get_lower_index(i_idx,j_idx);
-		return at(c_idx);
-	};
-	virtual T& at(uint64_t c_idx){
-		assert(c_idx < n_max*(n_max+1)/2);
-		return mat[c_idx];
-	};
-
-	void purge(){
-		auto it = mat.begin();
-		while(it!=mat.end()){
-			if (std::abs(it->second) < tol ){
-				it = mat.erase(it);
-			}
-			else{
-				it++;
-			}
-		}
-	}
-
-	virtual T get_max_elm(){
-		T m = 0.0;
-		for ( auto& it : mat ) {
-			m = std::max(m,it.second);
-		}
-		return m;
-	};
-
-	void resize(uint32_t new_n_max){
-		n_max = new_n_max;
-	}
-	// TODO: implement valid matrix ops
-	virtual SimpleMatrix<T>* minus(SimpleMatrix<T>* m){
-		return this->base_minus(m);
-	}
-	virtual SimpleMatrix<T>* plus(SimpleMatrix<T>* m){
-		return this->base_plus(m);
-	}
-	virtual SimpleMatrix<T>* multiply(SimpleMatrix<T>* m){
-		return this->base_multiply(m);
-	}
-};
-
-template<typename T>
-class DenseSymmetricSimpleMatrix : public SymmetricSimpleMatrix<T> {
-private:
-	uint32_t n_max;
-	T* mat;
-public:
-	DenseSymmetricSimpleMatrix(int n) : n_max(n) {
-		mat = new T[(n_max*(n_max+1))/2];
-	};
-	virtual ~DenseSymmetricSimpleMatrix(){
-		delete [] mat;
-		mat = nullptr;
-	};
-	virtual SimpleMatrix<T>* get_new_instance(int nrows, int ncols){
-		return new DenseSymmetricSimpleMatrix<T>(nrows * ncols);
-	}
-	virtual SimpleMatrix<T>* get_new_instance(){
-		return new DenseSymmetricSimpleMatrix<T>(n_max);
-	}
-
-	virtual inline void set_elm(uint32_t i_idx, uint32_t j_idx, T element){
-		assert(i_idx < n_max);
-		assert(j_idx < n_max);
-		const uint64_t my_idx=SymmetricSimpleMatrix<T>::get_lower_index(i_idx,j_idx);
-		mat[my_idx] = element;
-	}
-	virtual inline T get_elm(uint32_t i_idx, uint32_t j_idx){
-		assert(i_idx < n_max);
-		assert(j_idx < n_max);
-		const uint64_t my_idx=SymmetricSimpleMatrix<T>::get_lower_index(i_idx,j_idx);
-		return mat[my_idx];
-	}
-	virtual inline uint64_t get_nrows(){
-		return n_max;
-	};
-	virtual inline uint64_t get_ncols(){
-		return n_max;
-	};
-	virtual T& at(uint32_t i_idx, uint32_t j_idx){
-		assert(i_idx < n_max);
-		assert(j_idx < n_max);
-		const uint64_t c_idx=SymmetricSimpleMatrix<T>::get_lower_index(i_idx,j_idx);
-		return at(c_idx);
-	};
-	virtual T& at(uint64_t c_idx){
-		assert(c_idx < n_max*(n_max+1)/2);
-		return mat[c_idx];
-	};
-	virtual T get_max_elm(){
-		return *std::max_element(mat,mat+n_max);
-	};
-};

From 07fdac0e478210c5f4ae50b9f0ef754bb04440a7 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sat, 18 Apr 2020 10:58:19 +0200
Subject: [PATCH 04/21] Two minor code changes with big effect

- create a matrix of random walks from the orignal matrix before calling
  markov_process
- improve speed in result collection
---
 src/run/cluster.cpp | 42 +++++++-----------------------------------
 1 file changed, 7 insertions(+), 35 deletions(-)

diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp
index 49c2cd24..c88944df 100644
--- a/src/run/cluster.cpp
+++ b/src/run/cluster.cpp
@@ -121,16 +121,16 @@ class SparseMatrixStream : public Consumer {
 	pair<vector<vector<uint32_t>>, vector<Eigen::SparseMatrix<T>>> getComponents(){
 		vector<unordered_set<size_t>> sets = disjointSet->getListOfSets();
 		vector<vector<Eigen::Triplet<T>>> split(sets.size());
+		unordered_map<size_t, uint32_t> indexToSetId;
 		for(uint32_t iset = 0; iset < sets.size(); iset++){
 			split.push_back(vector<Eigen::Triplet<T>>());
+			for(size_t index : sets[iset]){
+				indexToSetId.emplace(index, iset);
+			}
 		}
 		for(Eigen::Triplet<T> d : data){
-			for(uint32_t iset = 0; iset < sets.size(); iset++){
-				if(sets[iset].count(d.col()) != 0){
-					split[iset].emplace_back(d.row(), d.col(), d.value());
-					break;
-				}
-			}
+			assert(indexToSetId[d.row()] == indexToSetId[d.row()]);
+			split[indexToSetId[d.row()]].emplace_back(d.row(), d.col(), d.value());
 		}
 		vector<vector<uint32_t>> indices(sets.size());
 		vector<Eigen::SparseMatrix<T>> components(sets.size());
@@ -337,37 +337,10 @@ vector<unordered_set<uint32_t>> get_list(Eigen::SparseMatrix<float>* m){
 	return disjointSet.getListOfSets();
 }
 
-void print(Eigen::SparseMatrix<float>* m, bool sparse){
-	if(sparse){
-		for (uint32_t k=0; k<m->outerSize(); ++k){
-			for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
-				printf("%lu %lu %16.14f\n", it.row(), it.col(), it.value());
-			}
-		}
-	}
-	else{
-		for(uint32_t irow = 0; irow < m->rows(); irow++){
-			for(uint32_t icol = 0; icol < m->cols(); icol++){
-				printf(" %8.4f", m->coeffRef(irow, icol));
-			}
-			printf("\n");
-		}
-	}
-		printf("\n");
-}
-
 vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m){
 	uint32_t iteration = 0;
 	double diff_norm = std::numeric_limits<double>::max();
-	// TODO: Add loops TODO: make sure self-hits are found and have the same unit as the off diagonal elements (bit score or other measure used) before calling this routine, then only check that diagonal elements exist
-	// for(uint32_t idiag = 0; idiag<m->get_nrows(); idiag++){
-	// 	float e = m->get_elm(idiag, idiag);
-	// 	if( abs(e) < 1e-13 ){
-	// 		//cout << "WARNING: did not find a self-hit for "<< idiag << endl;
-	// 		m->set_elm(idiag, idiag, 1.0);
-	// 	}
-	// }
-	//*m  = get_gamma(m,1);
+	*m  = get_gamma(m,1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
 	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
 		Eigen::SparseMatrix<float> msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits<float>::epsilon());
 		Eigen::SparseMatrix<float> m_update = get_gamma(&msquared, 2);
@@ -375,7 +348,6 @@ vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m){
 		*m = m_update.pruned(1.0, std::numeric_limits<float>::epsilon());
 		iteration++;
 	}
-	//cout<< "Markov Process converged after " << iteration <<" iterations " << diff_norm << endl;
 	return get_list(m);
 }
 

From 94dec7877586a6e44a1c53b3dd56de8b15f23cfd Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Wed, 22 Apr 2020 21:33:20 +0200
Subject: [PATCH 05/21] addressing some of the code-review suggestions

---
 CMakeLists.txt                      |   3 +-
 src/basic/config.cpp                |   2 +-
 src/basic/config.h                  |   3 +-
 src/{run => cluster}/disjoint_set.h |   0
 src/cluster/multi_step_cluster.cpp  | 161 +++++++++
 src/run/cluster.cpp                 | 516 ----------------------------
 src/run/main.cpp                    |  11 +-
 src/run/workflow.h                  |   9 +-
 8 files changed, 180 insertions(+), 525 deletions(-)
 rename src/{run => cluster}/disjoint_set.h (100%)
 create mode 100644 src/cluster/multi_step_cluster.cpp
 delete mode 100644 src/run/cluster.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8eb8f16b..91b00ab7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -151,7 +151,6 @@ set(OBJECTS
   src/data/seed_array.cpp
   src/output/paf_format.cpp
   src/util/system/system.cpp
-  src/run/cluster.cpp
   src/util/algo/greedy_vortex_cover.cpp
   src/util/sequence/sequence.cpp
   src/tools/tsv_record.cpp
@@ -172,6 +171,8 @@ set(OBJECTS
   src/align/gapped.cpp
   src/align/culling.cpp
   src/cluster/medoid.cpp
+  src/cluster/multi_step_cluster.cpp
+  src/cluster/mcl.cpp
   src/align/output.cpp
   src/tools/roc.cpp
   src/test/data.cpp
diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index e67fa5d8..dbde2e8c 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -145,7 +145,7 @@ Config::Config(int argc, const char **argv, bool check_io)
 
 	Options_group cluster("Cluster options");
 	cluster.add()
-		("calgo", 0, "Clustering algorithm (0=multi-step/1=transitive-closure)", cluster_algo, -1);
+		("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo);
 
 	Options_group aligner("Aligner options");
 	aligner.add()
diff --git a/src/basic/config.h b/src/basic/config.h
index 46bbff7a..7d5e9243 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -222,8 +222,7 @@ struct Config
 	enum { double_indexed = 0, query_indexed = 1, subject_indexed = 2 };
 	int algo;
 
-	enum { multi_step = 0, transitive_closure = 1, markov_clustering = 2};
-	int cluster_algo;
+	string cluster_algo;
 
 	enum { query_parallel = 0, target_parallel = 1 };
 	unsigned load_balancing;
diff --git a/src/run/disjoint_set.h b/src/cluster/disjoint_set.h
similarity index 100%
rename from src/run/disjoint_set.h
rename to src/cluster/disjoint_set.h
diff --git a/src/cluster/multi_step_cluster.cpp b/src/cluster/multi_step_cluster.cpp
new file mode 100644
index 00000000..6cb2f704
--- /dev/null
+++ b/src/cluster/multi_step_cluster.cpp
@@ -0,0 +1,161 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <stdio.h>
+#include <memory>
+#include <fstream>
+#include <limits>
+#include "../util/system/system.h"
+#include "../util/util.h"
+#include "../basic/config.h"
+#include "../data/reference.h"
+#include "../run/workflow.h"
+#include "../util/io/consumer.h"
+#include "../util/algo/algo.h"
+#include "../basic/statistics.h"
+#include "../util/log_stream.h"
+#include "../dp/dp.h"
+#include "../basic/masking.h"
+
+using namespace std;
+
+namespace Workflow { namespace Cluster{ namespace MultiStep {
+
+struct Neighbors : public vector<vector<int>>, public Consumer {
+	Neighbors(size_t n):
+		vector<vector<int>>(n)
+	{}
+	virtual void consume(const char *ptr, size_t n) override {
+		int query, subject, count;
+		float qcov, scov, bitscore;
+		const char *end = ptr + n;
+		while (ptr < end) {
+			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
+			if (sscanf(ptr, "%i\t%i\t%f\t%f\t%f\n%n", &query, &subject, &qcov, &scov, &bitscore, &count) != 5)
+				throw runtime_error("Cluster format error.");
+			ptr += count;
+			//cout << query << '\t' << subject << '\t' << qcov << '\t' << scov << '\t' << endl;
+			(*this)[query].push_back(subject);
+			edges.push_back({ query, subject, (int)bitscore });
+		}
+	}
+	vector<Util::Algo::Edge> edges;
+};
+
+vector<bool> rep_bitset(const vector<int> &centroid, const vector<bool> *superset = nullptr) {
+	vector<bool> r(centroid.size());
+	for (int c : centroid)
+		if(!superset || (*superset)[c])
+			r[c] = true;
+	return r;
+}
+
+vector<int> cluster(DatabaseFile &db, const vector<bool> *filter) {
+	statistics.reset();
+	config.command = Config::blastp;
+	config.no_self_hits = true;
+	//config.output_format = { "6", "qnum", "snum" };
+	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore" };
+	config.query_cover = 80;
+	config.subject_cover = 80;
+	config.algo = 0;
+	config.index_mode = 0;
+	config.freq_sd = 0;
+	config.max_alignments = numeric_limits<uint64_t>::max();
+
+	Workflow::Search::Options opt;
+	opt.db = &db;
+	opt.self = true;
+	Neighbors nb(db.ref_header.sequences);
+	opt.consumer = &nb;
+	opt.db_filter = filter;
+
+	Workflow::Search::run(opt);
+
+	return Util::Algo::greedy_vortex_cover(nb);
+}
+
+void run_two_step_clustering(DatabaseFile& db){
+	const size_t seq_count = db.ref_header.sequences;
+
+	config.min_id = 70;
+	vector<int> centroid1(cluster(db, nullptr));
+	vector<bool> rep1(rep_bitset(centroid1));
+	size_t n_rep1 = count_if(rep1.begin(), rep1.end(), [](bool x) { return x; });
+	message_stream << "Clustering step 1 complete. #Input sequences: " << centroid1.size() << " #Clusters: " << n_rep1 << endl;
+
+	config.mode_more_sensitive = true;
+	config.min_id = 0;
+	vector<int> centroid2(cluster(db, &rep1));
+	vector<bool> rep2(rep_bitset(centroid2, &rep1));
+	for (size_t i = 0; i < centroid2.size(); ++i)
+		if (!rep1[i])
+			centroid2[i] = centroid2[centroid1[i]];
+	message_stream << "Clustering step 2 complete. #Input sequences: " << n_rep1 << " #Clusters: " << count_if(rep2.begin(), rep2.end(), [](bool x) { return x; }) << endl;
+
+	task_timer timer("Generating output");
+	Sequence_set *rep_seqs;
+	String_set<char, 0> *rep_ids;
+	vector<unsigned> rep_database_id, rep_block_id(seq_count);
+	db.rewind();
+	db.load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2);
+	for (size_t i = 0; i < rep_database_id.size(); ++i)
+		rep_block_id[rep_database_id[i]] = (unsigned)i;
+
+	ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str());
+	vector<Letter> seq;
+	string id;
+	db.seek_direct();
+	Hsp hsp;
+	size_t n;
+	out->precision(3);
+
+	for (int i = 0; i < (int)db.ref_header.sequences; ++i) {
+		db.read_seq(id, seq);
+		const unsigned r = rep_block_id[centroid2[i]];
+		(*out) << blast_id(id) << '\t'
+			<< blast_id((*rep_ids)[r]) << '\t';		
+		if ((int)i == centroid2[i])
+			(*out) << "100\t100\t100\t0" << endl;
+		else {
+			Masking::get().bit_to_hard_mask(seq.data(), seq.size(), n);
+			smith_waterman(sequence(seq), (*rep_seqs)[r], hsp);
+			(*out) << hsp.id_percent() << '\t'
+				<< hsp.query_cover_percent((unsigned)seq.size()) << '\t'
+				<< hsp.subject_cover_percent((unsigned)(*rep_seqs)[r].length()) << '\t'
+				<< score_matrix.bitscore(hsp.score) << endl;
+		}
+	}
+
+	if (out != &cout) delete out;
+	delete rep_seqs;
+	delete rep_ids;
+}
+
+void run() {
+	if (config.database == "")
+		throw runtime_error("Missing parameter: database file (--db/-d)");
+	config.command = Config::makedb;
+	unique_ptr<DatabaseFile> db(DatabaseFile::auto_create_from_fasta());
+	run_two_step_clustering(*db);
+	db->close();
+}
+}}}
diff --git a/src/run/cluster.cpp b/src/run/cluster.cpp
deleted file mode 100644
index c88944df..00000000
--- a/src/run/cluster.cpp
+++ /dev/null
@@ -1,516 +0,0 @@
-/****
-DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-****/
-
-#include <Eigen/Sparse>
-#include <algorithm>
-#include <stdexcept>
-#include <string>
-#include <stdio.h>
-#include <memory>
-#include <fstream>
-#include <limits>
-#include "../util/system/system.h"
-#include "../util/util.h"
-#include "../basic/config.h"
-#include "../data/reference.h"
-#include "workflow.h"
-#include "disjoint_set.h"
-#include "../util/io/consumer.h"
-#include "../util/algo/algo.h"
-#include "../basic/statistics.h"
-#include "../util/log_stream.h"
-#include "../dp/dp.h"
-#include "../basic/masking.h"
-
-using namespace std;
-
-namespace Workflow { namespace Cluster {
-
-struct Neighbors : public vector<vector<int>>, public Consumer {
-	Neighbors(size_t n):
-		vector<vector<int>>(n)
-	{}
-	virtual void consume(const char *ptr, size_t n) override {
-		int query, subject, count;
-		float qcov, scov, bitscore;
-		const char *end = ptr + n;
-		while (ptr < end) {
-			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
-			if (sscanf(ptr, "%i\t%i\t%f\t%f\t%f\n%n", &query, &subject, &qcov, &scov, &bitscore, &count) != 5)
-				throw runtime_error("Cluster format error.");
-			ptr += count;
-			//cout << query << '\t' << subject << '\t' << qcov << '\t' << scov << '\t' << endl;
-			(*this)[query].push_back(subject);
-			edges.push_back({ query, subject, (int)bitscore });
-		}
-	}
-	vector<Util::Algo::Edge> edges;
-};
-
-class NeighborStream : public Consumer {
-	LazyDisjointSet<size_t>* disjointSet;
-	virtual void consume(const char *ptr, size_t n) override {
-		size_t query, subject, count;
-		float qcov, scov, bitscore;
-		const char *end = ptr + n;
-		while (ptr < end) {
-			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
-			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &count) != 5)
-				throw runtime_error("Cluster format error.");
-			disjointSet->merge(query, subject);
-			ptr += count;
-		}
-	}
-public:
-	NeighborStream(size_t n){
-		disjointSet = new LazyDisjointIntegralSet<size_t>(n); 
-	}
-	~NeighborStream(){
-		delete disjointSet;
-	}
-	vector<unordered_set<size_t>> getListOfSets(){
-		return disjointSet->getListOfSets();
-	}
-};
-
-template <typename T>
-class SparseMatrixStream : public Consumer {
-	size_t n;
-	vector<Eigen::Triplet<T>> data;
-	LazyDisjointSet<size_t>* disjointSet;
-	virtual void consume(const char *ptr, size_t n) override {
-		size_t query, subject, count;
-		float qcov, scov, bitscore, id;
-		const char *end = ptr + n;
-		while (ptr < end) {
-			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
-			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) 
-				throw runtime_error("Cluster format error.");
-			data.emplace_back(query, subject, (qcov/100.0f) * (scov/100.0f) * (id/100.0f));
-			disjointSet->merge(query, subject);
-			ptr += count;
-		}
-	}
-public:
-	SparseMatrixStream(size_t n){
-		this->n = n;
-		disjointSet = new LazyDisjointIntegralSet<size_t>(n); 
-		// Note: this makes sure the self hits are always included, the value needs to be aligned with the measure used in consume!
-		for(uint32_t idiag=0; idiag<n; idiag++){
-			data.emplace_back(idiag, idiag, 1.0);
-		}
-	}
-	~SparseMatrixStream(){
-		delete disjointSet;
-	}
-	pair<vector<vector<uint32_t>>, vector<Eigen::SparseMatrix<T>>> getComponents(){
-		vector<unordered_set<size_t>> sets = disjointSet->getListOfSets();
-		vector<vector<Eigen::Triplet<T>>> split(sets.size());
-		unordered_map<size_t, uint32_t> indexToSetId;
-		for(uint32_t iset = 0; iset < sets.size(); iset++){
-			split.push_back(vector<Eigen::Triplet<T>>());
-			for(size_t index : sets[iset]){
-				indexToSetId.emplace(index, iset);
-			}
-		}
-		for(Eigen::Triplet<T> d : data){
-			assert(indexToSetId[d.row()] == indexToSetId[d.row()]);
-			split[indexToSetId[d.row()]].emplace_back(d.row(), d.col(), d.value());
-		}
-		vector<vector<uint32_t>> indices(sets.size());
-		vector<Eigen::SparseMatrix<T>> components(sets.size());
-		for(uint32_t iset = 0; iset < sets.size(); iset++){
-			Eigen::SparseMatrix<T> component(sets[iset].size(), sets[iset].size());
-			vector<uint32_t> order(sets[iset].begin(), sets[iset].end());
-			map<uint32_t, uint32_t> index_map;
-			uint32_t iel = 0;
-			for(uint32_t el: order){
-				index_map.emplace(el, iel++);
-			}
-			vector<Eigen::Triplet<T>> remapped(split[iset].size());
-			for(Eigen::Triplet<T> t : split[iset]){
-				remapped.emplace_back(index_map[t.row()], index_map[t.col()], t.value());
-			}
-			component.setFromTriplets(remapped.begin(), remapped.end());
-			components.emplace_back(move(component));
-			indices.emplace_back(move(order));
-		}
-		return std::make_pair(indices, components);
-	}
-	Eigen::SparseMatrix<T> getMatrix(){
-		Eigen::SparseMatrix<T> m(n,n);
-		m.setFromTriplets(data.begin(), data.end());
-		return m;
-	}
-};
-
-vector<bool> rep_bitset(const vector<int> &centroid, const vector<bool> *superset = nullptr) {
-	vector<bool> r(centroid.size());
-	for (int c : centroid)
-		if(!superset || (*superset)[c])
-			r[c] = true;
-	return r;
-}
-
-vector<int> cluster(DatabaseFile &db, const vector<bool> *filter) {
-	statistics.reset();
-	config.command = Config::blastp;
-	config.no_self_hits = true;
-	//config.output_format = { "6", "qnum", "snum" };
-	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore" };
-	config.query_cover = 80;
-	config.subject_cover = 80;
-	config.algo = 0;
-	config.index_mode = 0;
-	config.freq_sd = 0;
-	config.max_alignments = numeric_limits<uint64_t>::max();
-
-	Workflow::Search::Options opt;
-	opt.db = &db;
-	opt.self = true;
-	Neighbors nb(db.ref_header.sequences);
-	opt.consumer = &nb;
-	opt.db_filter = filter;
-
-	Workflow::Search::run(opt);
-
-	return Util::Algo::greedy_vortex_cover(nb);
-}
-
-void run_two_step_clustering(DatabaseFile& db){
-	const size_t seq_count = db.ref_header.sequences;
-
-	config.min_id = 70;
-	vector<int> centroid1(cluster(db, nullptr));
-	vector<bool> rep1(rep_bitset(centroid1));
-	size_t n_rep1 = count_if(rep1.begin(), rep1.end(), [](bool x) { return x; });
-	message_stream << "Clustering step 1 complete. #Input sequences: " << centroid1.size() << " #Clusters: " << n_rep1 << endl;
-
-	config.mode_more_sensitive = true;
-	config.min_id = 0;
-	vector<int> centroid2(cluster(db, &rep1));
-	vector<bool> rep2(rep_bitset(centroid2, &rep1));
-	for (size_t i = 0; i < centroid2.size(); ++i)
-		if (!rep1[i])
-			centroid2[i] = centroid2[centroid1[i]];
-	message_stream << "Clustering step 2 complete. #Input sequences: " << n_rep1 << " #Clusters: " << count_if(rep2.begin(), rep2.end(), [](bool x) { return x; }) << endl;
-
-	task_timer timer("Generating output");
-	Sequence_set *rep_seqs;
-	String_set<char, 0> *rep_ids;
-	vector<unsigned> rep_database_id, rep_block_id(seq_count);
-	db.rewind();
-	db.load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2);
-	for (size_t i = 0; i < rep_database_id.size(); ++i)
-		rep_block_id[rep_database_id[i]] = (unsigned)i;
-
-	ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str());
-	vector<Letter> seq;
-	string id;
-	db.seek_direct();
-	Hsp hsp;
-	size_t n;
-	out->precision(3);
-
-	for (int i = 0; i < (int)db.ref_header.sequences; ++i) {
-		db.read_seq(id, seq);
-		const unsigned r = rep_block_id[centroid2[i]];
-		(*out) << blast_id(id) << '\t'
-			<< blast_id((*rep_ids)[r]) << '\t';		
-		if ((int)i == centroid2[i])
-			(*out) << "100\t100\t100\t0" << endl;
-		else {
-			Masking::get().bit_to_hard_mask(seq.data(), seq.size(), n);
-			smith_waterman(sequence(seq), (*rep_seqs)[r], hsp);
-			(*out) << hsp.id_percent() << '\t'
-				<< hsp.query_cover_percent((unsigned)seq.size()) << '\t'
-				<< hsp.subject_cover_percent((unsigned)(*rep_seqs)[r].length()) << '\t'
-				<< score_matrix.bitscore(hsp.score) << endl;
-		}
-	}
-
-	if (out != &cout) delete out;
-	delete rep_seqs;
-	delete rep_ids;
-}
-
-
-void run_transitive_closure_clustering(DatabaseFile& db){
-	// testing
-	// LazyDisjointIntegralSet<uint32_t> test(5);
-	// test.merge(1,3);
-	// test.merge(3,4);
-	// test.merge(0,2);
-	// auto los = test.getListOfSets();
-	// for(auto& set: los){
-	// 	for(auto item:set){
-	// 		cout<< " " << item;
-	// 	}
-	// 	cout<<endl;
-	// }
-	// unordered_set<string> s ( {"red","green","blue","rain", "sun", "snow"} );
-	// LazyDisjointTypeSet<string> ts(&s);
-
-	// ts.merge("red", "green");
-	// ts.merge("snow", "rain");
-	// ts.merge("sun", "rain");
-	// auto lo = ts.getListOfSets();
-	// for(auto& set: lo){
-	// 	cout<< "set :";
-	// 	for(auto item:set){
-	// 		cout<< " " << item;
-	// 	}
-	// 	cout<<endl;
-	// }
-
-	const size_t seq_count = db.ref_header.sequences;
-	statistics.reset();
-	config.command = Config::blastp;
-	config.no_self_hits = true;
-	//config.output_format = { "6", "qnum", "snum" };
-	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore" };
-	config.query_cover = 80;
-	config.subject_cover = 80;
-	config.algo = 0;
-	config.index_mode = 0;
-	config.freq_sd = 0;
-	config.max_alignments = numeric_limits<uint64_t>::max();
-
-	Workflow::Search::Options opt;
-	opt.db = &db;
-	opt.self = true;
-	NeighborStream ns(db.ref_header.sequences);
-	opt.consumer = &ns;
-	opt.db_filter = nullptr;
-
-	Workflow::Search::run(opt);
-
-	auto clusterResult = ns.getListOfSets();
-	cout << "Found "<<clusterResult.size()<<" clusters"<<endl;
-	// for(auto& set: clusterResult){
-	// 	cout<< "set :";
-	// 	for(auto item : set){
-	// 		cout<< " " << item;
-	// 	}
-	// 	cout<<endl;
-	// }
-}
-
-Eigen::SparseMatrix<float> get_gamma(Eigen::SparseMatrix<float>* m, float r){
-	vector<Eigen::Triplet<float>> data;
-	for (uint32_t k=0; k<m->outerSize(); ++k){
-		float colSum = 0.0f;
-		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
-			colSum += pow(it.value(),r);
-		}
-		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
-			data.emplace_back(it.row(), it.col(), pow(it.value(),r) / colSum);
-		}
-	}
-	Eigen::SparseMatrix<float> gamma(m->rows(), m->cols());
-	gamma.setFromTriplets(data.begin(), data.end());
-	return gamma.pruned(1.0, std::numeric_limits<float>::epsilon());
-}
-
-vector<unordered_set<uint32_t>> get_list(Eigen::SparseMatrix<float>* m){
-	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
-	for (uint32_t k=0; k<m->outerSize(); ++k){
-		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
-			disjointSet.merge(it.row(), it.col());
-		}
-	}
-	return disjointSet.getListOfSets();
-}
-
-vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m){
-	uint32_t iteration = 0;
-	double diff_norm = std::numeric_limits<double>::max();
-	*m  = get_gamma(m,1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
-	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
-		Eigen::SparseMatrix<float> msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits<float>::epsilon());
-		Eigen::SparseMatrix<float> m_update = get_gamma(&msquared, 2);
-		diff_norm = (*m - m_update).norm();
-		*m = m_update.pruned(1.0, std::numeric_limits<float>::epsilon());
-		iteration++;
-	}
-	return get_list(m);
-}
-
-void run_markov_clustering(DatabaseFile& db){
-	// testing
-	// vector<Eigen::Triplet<float>> data;
-	// data.emplace_back(0, 0, 0.2f);
-	// data.emplace_back(0, 1, 0.25f);
-	// data.emplace_back(0, 5, 0.333f);
-	// data.emplace_back(0, 6, 0.25f);
-	// data.emplace_back(0, 9, 0.25f);
-	// data.emplace_back(1, 0, 0.20f);
-	// data.emplace_back(1, 1, 0.25f);
-	// data.emplace_back(1, 2, 0.25f);
-	// data.emplace_back(1, 4, 0.20f);
-	// data.emplace_back(2, 1, 0.25f);
-	// data.emplace_back(2, 2, 0.25f);
-	// data.emplace_back(2, 3, 0.20f);
-	// data.emplace_back(2, 4, 0.20f);
-	// data.emplace_back(3, 2, 0.25f);
-	// data.emplace_back(3, 3, 0.20f);
-	// data.emplace_back(3, 7, 0.20f);
-	// data.emplace_back(3, 8, 0.20f);
-	// data.emplace_back(3, 10, 0.20f);
-	// data.emplace_back(4, 1, 0.25f);
-	// data.emplace_back(4, 2, 0.25f);
-	// data.emplace_back(4, 4, 0.20f);
-	// data.emplace_back(4, 6, 0.25f);
-	// data.emplace_back(4, 7, 0.20f);
-	// data.emplace_back(5, 0, 0.20f);
-	// data.emplace_back(5, 5, 0.333f);
-	// data.emplace_back(5, 9, 0.25f);
-	// data.emplace_back(6, 0, 0.20f);
-	// data.emplace_back(6, 4, 0.20f);
-	// data.emplace_back(6, 6, 0.25f);
-	// data.emplace_back(6, 9, 0.25f);
-	// data.emplace_back(7, 3, 0.20f);
-	// data.emplace_back(7, 4, 0.20f);
-	// data.emplace_back(7, 7, 0.20f);
-	// data.emplace_back(7, 8, 0.20f);
-	// data.emplace_back(7, 10, 0.20f);
-	// data.emplace_back(8, 3, 0.20f);
-	// data.emplace_back(8, 7, 0.20f);
-	// data.emplace_back(8, 8, 0.20f);
-	// data.emplace_back(8, 10, 0.20f);
-	// data.emplace_back(8, 11, 0.333f);
-	// data.emplace_back(9, 0, 0.20f);
-	// data.emplace_back(9, 5, 0.333f);
-	// data.emplace_back(9, 6, 0.25f);
-	// data.emplace_back(9, 9, 0.25f);
-	// data.emplace_back(10, 3, 0.20f);
-	// data.emplace_back(10, 7, 0.20f);
-	// data.emplace_back(10, 8, 0.20f);
-	// data.emplace_back(10, 10, 0.20f);
-	// data.emplace_back(10, 11, 0.333f);
-	// data.emplace_back(11, 8, 0.20f);
-	// data.emplace_back(11, 10, 0.20f);
-	// data.emplace_back(11, 11, 0.333f);
-	// Eigen::SparseMatrix<float> tmp(12,12);
-	// tmp.setFromTriplets(data.begin(), data.end());
-	// auto cr = markov_process(&tmp);
-	// cout << "Found "<<cr.size()<<" clusters"<<endl;
-	// for(auto& set: cr){
-	// 	cout<< "set :";
-	// 	for(auto item : set){
-	// 		cout<< " " << item;
-	// 	}
-	// 	cout<<endl;
-	// }
-	// return;
-
-	const size_t seq_count = db.ref_header.sequences;
-	statistics.reset();
-	config.command = Config::blastp;
-	config.no_self_hits = true;
-	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore", "pident" };
-	config.query_cover = 80;
-	config.subject_cover = 80;
-	config.algo = 0;
-	config.index_mode = 0;
-	config.freq_sd = 0;
-	config.max_alignments = numeric_limits<uint64_t>::max();
-
-	Workflow::Search::Options opt;
-	opt.db = &db;
-	opt.self = true;
-	SparseMatrixStream<float> ms(db.ref_header.sequences);
-	opt.consumer = &ms;
-	opt.db_filter = nullptr;
-	Workflow::Search::run(opt);
-
-	auto componentInfo = ms.getComponents();
-	vector<vector<uint32_t>> indices = get<0>(componentInfo);
-	vector<Eigen::SparseMatrix<float>> components = get<1>(componentInfo);
-	uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 0;});
-	uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 1;});
-
-	cout << "DIAMOND done" << endl;
-	cout << "************" << endl;
-	cout << "Found " << nComponentsLt1 <<" ("<<nComponents<< " incl. singletons) disconnected components" << endl;
-
-	vector<unordered_set<uint32_t>> clustering_result;
-	float max_sparsity = 0.0f;
-	float min_sparsity = 1.0f;
-	bool full = false;
-	if( full ){
-		// TODO: According to the SIAM publication this is not valid, just for debugging
-		Eigen::SparseMatrix<float> m = ms.getMatrix();
-		for(unordered_set<uint32_t> subset : markov_process(&m)){
-			clustering_result.emplace_back(subset);
-		}
-	}
-	else {
-		// TODO: check if using dense matrices for non-sparse components improves performance
-		for(uint32_t iComponent = 0; iComponent<indices.size(); iComponent++){
-			vector<uint32_t> order = indices[iComponent];
-			if(order.size() > 1){
-				Eigen::SparseMatrix<float> m = components[iComponent];
-				float sparsity = 1.0-(1.0 * m.nonZeros()) / (m.rows()*m.cols());
-				max_sparsity = max(max_sparsity, sparsity);
-				min_sparsity = min(min_sparsity, sparsity);
-				// map back to original ids
-				for(unordered_set<uint32_t> subset : markov_process(&m)){
-					unordered_set<uint32_t> s;
-					for(uint32_t el : subset){
-						s.emplace(order[el]);
-					}
-					clustering_result.emplace_back(s);
-				}
-			}
-			else if (order.size() == 1){
-				clustering_result.emplace_back(order.begin(), order.end());
-			}
-		}
-	}
-	uint32_t nClusters = clustering_result.size();
-	uint32_t nClustersLt1 = count_if(clustering_result.begin(), clustering_result.end(), [](unordered_set<uint32_t> v){ return v.size() > 1;});
-	cout << "Found "<<nClustersLt1<< " ("<< nClusters<<" incl. singletons) clusters with min sparsity "<<min_sparsity<< " and max. sparsity " << max_sparsity << endl;
-	// for(auto& set: clustering_result){
-	// 	cout<< "set :";
-	// 	for(auto item : set){
-	// 		cout<< " " << item;
-	// 	}
-	// 	cout<<endl;
-	// }
-}
-
-void run() {
-	if (config.database == "")
-		throw runtime_error("Missing parameter: database file (--db/-d)");
-	config.command = Config::makedb;
-	unique_ptr<DatabaseFile> db(DatabaseFile::auto_create_from_fasta());
-	switch(config.cluster_algo){
-		case Config::multi_step:
-			run_two_step_clustering(*db);
-			break;
-		case Config::transitive_closure:
-			run_transitive_closure_clustering(*db);
-			break;
-		case Config::markov_clustering:
-			run_markov_clustering(*db);
-			break;
-	}
-	db->close();
-}
-}}
diff --git a/src/run/main.cpp b/src/run/main.cpp
index b597e41c..01df35a2 100644
--- a/src/run/main.cpp
+++ b/src/run/main.cpp
@@ -116,8 +116,15 @@ int main(int ac, const char* av[])
 			pairwise();
 			break;
 		case Config::cluster:
-			Workflow::Cluster::run();
-			break;
+			if(config.cluster_algo == "multi-step"){
+				Workflow::Cluster::MultiStep::run();
+			}
+			else if(config.cluster_algo == "mcl"){
+				Workflow::Cluster::MCL::run();
+			}
+			else{
+				throw std::runtime_error("The --cluster-algo option currently only accepts \"multi-step\" and \"mcl\"");
+			}
 		case Config::translate:
 			translate();
 			break;
diff --git a/src/run/workflow.h b/src/run/workflow.h
index ce85b5a4..7e9c35ab 100644
--- a/src/run/workflow.h
+++ b/src/run/workflow.h
@@ -31,11 +31,14 @@ void run(const Options &options);
 }
 
 namespace Cluster {
-
+namespace MultiStep {
 void run();
-
+}
+namespace MCL {
+void run();
+}
 }
 
 }
 
-#endif
\ No newline at end of file
+#endif

From 2383f40692749aeee00541a6f6d58d610983ad58 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sat, 25 Apr 2020 10:56:41 +0200
Subject: [PATCH 06/21] refactoring of clustering algos

made it simpler to include further clustering algorithms in the future.
---
 CMakeLists.txt                     |  1 +
 src/basic/config.cpp               | 10 ++++
 src/basic/masking.h                |  3 +-
 src/cluster/multi_step_cluster.cpp | 80 +++++++-----------------------
 src/run/main.cpp                   | 12 ++---
 src/run/workflow.h                 | 11 ----
 6 files changed, 34 insertions(+), 83 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91b00ab7..c1284de7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,6 +171,7 @@ set(OBJECTS
   src/align/gapped.cpp
   src/align/culling.cpp
   src/cluster/medoid.cpp
+  src/cluster/cluster_registry.cpp
   src/cluster/multi_step_cluster.cpp
   src/cluster/mcl.cpp
   src/align/output.cpp
diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index b097858a..fb43885c 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -28,6 +28,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "shape_config.h"
 #include "../util/io/temp_file.h"
 #include "../basic/match.h"
+#include "../cluster/cluster_registry.h"
 #include "../basic/translate.h"
 #include "../dp/dp.h"
 #include "masking.h"
@@ -469,6 +470,15 @@ Config::Config(int argc, const char **argv, bool check_io)
 				throw std::runtime_error("Custom scoring matrices require setting the --gapopen and --gapextend options.");
 			score_matrix = Score_matrix(matrix_file, lambda, K, gap_open, gap_extend);
 		}
+		if(command == Config::cluster && !Workflow::Cluster::ClusterRegistry::has(cluster_algo)){
+			ostream &header_out = command == Config::help ? cout : cerr;
+			header_out << "Unkown clustering algorithm: " << cluster_algo << endl;
+			header_out << "Available options are: " << endl;
+			for(string c_algo : Workflow::Cluster::ClusterRegistry::getKeys()){
+				header_out << "\t" << c_algo << endl;
+			}
+			throw std::runtime_error("Clustering algorithm not found.");
+		}
 		message_stream << "Scoring parameters: " << score_matrix << endl;
 		if (masking == 1)
 			Masking::instance = unique_ptr<Masking>(new Masking(score_matrix));
diff --git a/src/basic/masking.h b/src/basic/masking.h
index 24e872d9..28f12c6f 100644
--- a/src/basic/masking.h
+++ b/src/basic/masking.h
@@ -16,6 +16,7 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
+#pragma once
 #include <memory>
 #include "value.h"
 #include "score_matrix.h"
@@ -41,4 +42,4 @@ struct Masking
 	char mask_table_x_[size], mask_table_bit_[size];
 };
 
-size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
\ No newline at end of file
+size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
diff --git a/src/cluster/multi_step_cluster.cpp b/src/cluster/multi_step_cluster.cpp
index 6cb2f704..85f90abf 100644
--- a/src/cluster/multi_step_cluster.cpp
+++ b/src/cluster/multi_step_cluster.cpp
@@ -16,51 +16,12 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 
-#include <algorithm>
-#include <stdexcept>
-#include <string>
-#include <stdio.h>
-#include <memory>
-#include <fstream>
-#include <limits>
-#include "../util/system/system.h"
-#include "../util/util.h"
-#include "../basic/config.h"
-#include "../data/reference.h"
-#include "../run/workflow.h"
-#include "../util/io/consumer.h"
-#include "../util/algo/algo.h"
-#include "../basic/statistics.h"
-#include "../util/log_stream.h"
-#include "../dp/dp.h"
-#include "../basic/masking.h"
+#include "multi_step_cluster.h"
 
 using namespace std;
 
-namespace Workflow { namespace Cluster{ namespace MultiStep {
-
-struct Neighbors : public vector<vector<int>>, public Consumer {
-	Neighbors(size_t n):
-		vector<vector<int>>(n)
-	{}
-	virtual void consume(const char *ptr, size_t n) override {
-		int query, subject, count;
-		float qcov, scov, bitscore;
-		const char *end = ptr + n;
-		while (ptr < end) {
-			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
-			if (sscanf(ptr, "%i\t%i\t%f\t%f\t%f\n%n", &query, &subject, &qcov, &scov, &bitscore, &count) != 5)
-				throw runtime_error("Cluster format error.");
-			ptr += count;
-			//cout << query << '\t' << subject << '\t' << qcov << '\t' << scov << '\t' << endl;
-			(*this)[query].push_back(subject);
-			edges.push_back({ query, subject, (int)bitscore });
-		}
-	}
-	vector<Util::Algo::Edge> edges;
-};
-
-vector<bool> rep_bitset(const vector<int> &centroid, const vector<bool> *superset = nullptr) {
+namespace Workflow { namespace Cluster{ 
+vector<bool> MultiStep::rep_bitset(const vector<int> &centroid, const vector<bool> *superset) {
 	vector<bool> r(centroid.size());
 	for (int c : centroid)
 		if(!superset || (*superset)[c])
@@ -68,7 +29,7 @@ vector<bool> rep_bitset(const vector<int> &centroid, const vector<bool> *superse
 	return r;
 }
 
-vector<int> cluster(DatabaseFile &db, const vector<bool> *filter) {
+vector<int> MultiStep::cluster(DatabaseFile &db, const vector<bool> *filter) {
 	statistics.reset();
 	config.command = Config::blastp;
 	config.no_self_hits = true;
@@ -93,18 +54,22 @@ vector<int> cluster(DatabaseFile &db, const vector<bool> *filter) {
 	return Util::Algo::greedy_vortex_cover(nb);
 }
 
-void run_two_step_clustering(DatabaseFile& db){
-	const size_t seq_count = db.ref_header.sequences;
+void MultiStep::run_clustering() {
+	if (config.database == "")
+		throw runtime_error("Missing parameter: database file (--db/-d)");
+	config.command = Config::makedb;
+	unique_ptr<DatabaseFile> db(DatabaseFile::auto_create_from_fasta());
+	const size_t seq_count = db->ref_header.sequences;
 
 	config.min_id = 70;
-	vector<int> centroid1(cluster(db, nullptr));
+	vector<int> centroid1(cluster(*db, nullptr));
 	vector<bool> rep1(rep_bitset(centroid1));
 	size_t n_rep1 = count_if(rep1.begin(), rep1.end(), [](bool x) { return x; });
 	message_stream << "Clustering step 1 complete. #Input sequences: " << centroid1.size() << " #Clusters: " << n_rep1 << endl;
 
 	config.mode_more_sensitive = true;
 	config.min_id = 0;
-	vector<int> centroid2(cluster(db, &rep1));
+	vector<int> centroid2(cluster(*db, &rep1));
 	vector<bool> rep2(rep_bitset(centroid2, &rep1));
 	for (size_t i = 0; i < centroid2.size(); ++i)
 		if (!rep1[i])
@@ -115,21 +80,21 @@ void run_two_step_clustering(DatabaseFile& db){
 	Sequence_set *rep_seqs;
 	String_set<char, 0> *rep_ids;
 	vector<unsigned> rep_database_id, rep_block_id(seq_count);
-	db.rewind();
-	db.load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2);
+	db->rewind();
+	db->load_seqs(rep_database_id, (size_t)1e11, &rep_seqs, &rep_ids, true, &rep2);
 	for (size_t i = 0; i < rep_database_id.size(); ++i)
 		rep_block_id[rep_database_id[i]] = (unsigned)i;
 
 	ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str());
 	vector<Letter> seq;
 	string id;
-	db.seek_direct();
+	db->seek_direct();
 	Hsp hsp;
 	size_t n;
 	out->precision(3);
 
-	for (int i = 0; i < (int)db.ref_header.sequences; ++i) {
-		db.read_seq(id, seq);
+	for (int i = 0; i < (int)db->ref_header.sequences; ++i) {
+		db->read_seq(id, seq);
 		const unsigned r = rep_block_id[centroid2[i]];
 		(*out) << blast_id(id) << '\t'
 			<< blast_id((*rep_ids)[r]) << '\t';		
@@ -144,18 +109,9 @@ void run_two_step_clustering(DatabaseFile& db){
 				<< score_matrix.bitscore(hsp.score) << endl;
 		}
 	}
-
 	if (out != &cout) delete out;
 	delete rep_seqs;
 	delete rep_ids;
-}
-
-void run() {
-	if (config.database == "")
-		throw runtime_error("Missing parameter: database file (--db/-d)");
-	config.command = Config::makedb;
-	unique_ptr<DatabaseFile> db(DatabaseFile::auto_create_from_fasta());
-	run_two_step_clustering(*db);
 	db->close();
 }
-}}}
+}}
diff --git a/src/run/main.cpp b/src/run/main.cpp
index d5b73201..65a77c79 100644
--- a/src/run/main.cpp
+++ b/src/run/main.cpp
@@ -21,6 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "tools.h"
 #include "../data/reference.h"
 #include "workflow.h"
+#include "../cluster/cluster_registry.h"
 #include "../util/simd.h"
 #ifdef EXTRA
 #include "../extra/compare.h"
@@ -117,15 +118,8 @@ int main(int ac, const char* av[])
 			pairwise();
 			break;
 		case Config::cluster:
-			if(config.cluster_algo == "multi-step"){
-				Workflow::Cluster::MultiStep::run();
-			}
-			else if(config.cluster_algo == "mcl"){
-				Workflow::Cluster::MCL::run();
-			}
-			else{
-				throw std::runtime_error("The --cluster-algo option currently only accepts \"multi-step\" and \"mcl\"");
-			}
+			Workflow::Cluster::ClusterRegistry::get(config.cluster_algo)->run();
+			break;
 		case Config::translate:
 			translate();
 			break;
diff --git a/src/run/workflow.h b/src/run/workflow.h
index 7e9c35ab..babccf95 100644
--- a/src/run/workflow.h
+++ b/src/run/workflow.h
@@ -8,7 +8,6 @@ struct Consumer;
 struct TextInputFile;
 
 namespace Workflow { 
-
 namespace Search {
 
 struct Options {
@@ -29,16 +28,6 @@ struct Options {
 void run(const Options &options);
 
 }
-
-namespace Cluster {
-namespace MultiStep {
-void run();
-}
-namespace MCL {
-void run();
-}
-}
-
 }
 
 #endif

From 495001cd2fe7453b1f2a9767da0ef1866c684ffe Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sat, 25 Apr 2020 10:58:04 +0200
Subject: [PATCH 07/21] add forgotten files

---
 src/cluster/cluster.h                 |   7 +
 src/cluster/cluster_registry.cpp      |  10 ++
 src/cluster/cluster_registry.h        |  43 +++++
 src/cluster/clustering_algorithms.cpp |  30 ++++
 src/cluster/mcl.cpp                   | 224 ++++++++++++++++++++++++++
 src/cluster/mcl.h                     | 129 +++++++++++++++
 src/cluster/multi_step_cluster.h      |  78 +++++++++
 7 files changed, 521 insertions(+)
 create mode 100644 src/cluster/cluster.h
 create mode 100644 src/cluster/cluster_registry.cpp
 create mode 100644 src/cluster/cluster_registry.h
 create mode 100644 src/cluster/clustering_algorithms.cpp
 create mode 100644 src/cluster/mcl.cpp
 create mode 100644 src/cluster/mcl.h
 create mode 100644 src/cluster/multi_step_cluster.h

diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h
new file mode 100644
index 00000000..2fdff00a
--- /dev/null
+++ b/src/cluster/cluster.h
@@ -0,0 +1,7 @@
+#pragma once
+#include <string>
+class ClusteringAlgorithm {
+public:
+	virtual void run() = 0;
+	static std::string get_key();
+};
diff --git a/src/cluster/cluster_registry.cpp b/src/cluster/cluster_registry.cpp
new file mode 100644
index 00000000..ddf883da
--- /dev/null
+++ b/src/cluster/cluster_registry.cpp
@@ -0,0 +1,10 @@
+#include "cluster_registry.h"
+
+using namespace std;
+
+namespace Workflow { namespace Cluster{
+	map<string, ClusteringAlgorithm*> ClusterRegistry::regMap;
+	MCL ClusterRegistry::mcl;
+	MultiStep ClusterRegistry::multiStep;
+	ClusterRegistry::StaticConstructor ClusterRegistry::_staticConstructor;
+}}
diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h
new file mode 100644
index 00000000..53653931
--- /dev/null
+++ b/src/cluster/cluster_registry.h
@@ -0,0 +1,43 @@
+#pragma once
+#include "cluster.h"
+#include "mcl.h"
+#include "multi_step_cluster.h"
+
+using namespace std;
+
+namespace Workflow { namespace Cluster{
+class ClusterRegistry{
+private:
+	ClusterRegistry(){};
+	static MCL mcl;
+	static MultiStep multiStep;
+public:
+	static map<string, ClusteringAlgorithm*> regMap;
+	static ClusteringAlgorithm* get(string key){
+		auto ca = ClusterRegistry::regMap.find(key);
+		if(ca == ClusterRegistry::regMap.end()){
+			throw std::runtime_error("Clustering algorithm not found.");
+		}
+		return ca->second;
+	}
+	static bool has(string key){
+		return ClusterRegistry::regMap.find(key) != ClusterRegistry::regMap.end();
+	}
+	static vector<string> getKeys(){
+		auto it = regMap.begin();
+		vector<string> keys;
+		while(it != regMap.end()){
+			keys.push_back(it->first);
+			it++;
+		}
+		return keys;
+	}
+	static struct StaticConstructor {
+		StaticConstructor() {
+			regMap.emplace(multiStep.get_key(), &multiStep);
+			regMap.emplace(mcl.get_key(), &mcl);
+		}
+	} _staticConstructor;
+};
+
+}}
diff --git a/src/cluster/clustering_algorithms.cpp b/src/cluster/clustering_algorithms.cpp
new file mode 100644
index 00000000..ad423373
--- /dev/null
+++ b/src/cluster/clustering_algorithms.cpp
@@ -0,0 +1,30 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "cluster.h"
+#include "cluster.h"
+
+using namespace std;
+
+namespace Workflow { namespace Cluster{ 
+	static map<string, Cluster> clustering_algorithms;
+	static{
+		clustering_algorithms.add(C
+	}
+
+}}
diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
new file mode 100644
index 00000000..b2cc8eee
--- /dev/null
+++ b/src/cluster/mcl.cpp
@@ -0,0 +1,224 @@
+#include "mcl.h"
+
+using namespace std;
+
+namespace Workflow { namespace Cluster{
+
+Eigen::SparseMatrix<float> MCL::get_gamma(Eigen::SparseMatrix<float>* m, float r){
+	vector<Eigen::Triplet<float>> data;
+	for (uint32_t k=0; k<m->outerSize(); ++k){
+		float colSum = 0.0f;
+		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+			colSum += pow(it.value(),r);
+		}
+		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+			data.emplace_back(it.row(), it.col(), pow(it.value(),r) / colSum);
+		}
+	}
+	Eigen::SparseMatrix<float> gamma(m->rows(), m->cols());
+	gamma.setFromTriplets(data.begin(), data.end());
+	return gamma.pruned(1.0, std::numeric_limits<float>::epsilon());
+}
+
+vector<unordered_set<uint32_t>> MCL::get_list(Eigen::SparseMatrix<float>* m){
+	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
+	for (uint32_t k=0; k<m->outerSize(); ++k){
+		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+			disjointSet.merge(it.row(), it.col());
+		}
+	}
+	return disjointSet.getListOfSets();
+}
+
+vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>* m){
+	uint32_t iteration = 0;
+	double diff_norm = std::numeric_limits<double>::max();
+	*m  = get_gamma(m,1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
+	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
+		Eigen::SparseMatrix<float> msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits<float>::epsilon());
+		Eigen::SparseMatrix<float> m_update = get_gamma(&msquared, 2);
+		diff_norm = (*m - m_update).norm();
+		*m = m_update.pruned(1.0, std::numeric_limits<float>::epsilon());
+		iteration++;
+	}
+	return get_list(m);
+}
+
+void MCL::run_clustering() {
+	if (config.database == "")
+		throw runtime_error("Missing parameter: database file (--db/-d)");
+	config.command = Config::makedb;
+	unique_ptr<DatabaseFile> db(DatabaseFile::auto_create_from_fasta());
+
+	// testing
+	// LazyDisjointIntegralSet<uint32_t> test(5);
+	// test.merge(1,3);
+	// test.merge(3,4);
+	// test.merge(0,2);
+	// auto los = test.getListOfSets();
+	// for(auto& set: los){
+	// 	for(auto item:set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
+	// unordered_set<string> s ( {"red","green","blue","rain", "sun", "snow"} );
+	// LazyDisjointTypeSet<string> ts(&s);
+
+	// ts.merge("red", "green");
+	// ts.merge("snow", "rain");
+	// ts.merge("sun", "rain");
+	// auto lo = ts.getListOfSets();
+	// for(auto& set: lo){
+	// 	cout<< "set :";
+	// 	for(auto item:set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
+	// 
+	// testing
+	// vector<Eigen::Triplet<float>> data;
+	// data.emplace_back(0, 0, 0.2f);
+	// data.emplace_back(0, 1, 0.25f);
+	// data.emplace_back(0, 5, 0.333f);
+	// data.emplace_back(0, 6, 0.25f);
+	// data.emplace_back(0, 9, 0.25f);
+	// data.emplace_back(1, 0, 0.20f);
+	// data.emplace_back(1, 1, 0.25f);
+	// data.emplace_back(1, 2, 0.25f);
+	// data.emplace_back(1, 4, 0.20f);
+	// data.emplace_back(2, 1, 0.25f);
+	// data.emplace_back(2, 2, 0.25f);
+	// data.emplace_back(2, 3, 0.20f);
+	// data.emplace_back(2, 4, 0.20f);
+	// data.emplace_back(3, 2, 0.25f);
+	// data.emplace_back(3, 3, 0.20f);
+	// data.emplace_back(3, 7, 0.20f);
+	// data.emplace_back(3, 8, 0.20f);
+	// data.emplace_back(3, 10, 0.20f);
+	// data.emplace_back(4, 1, 0.25f);
+	// data.emplace_back(4, 2, 0.25f);
+	// data.emplace_back(4, 4, 0.20f);
+	// data.emplace_back(4, 6, 0.25f);
+	// data.emplace_back(4, 7, 0.20f);
+	// data.emplace_back(5, 0, 0.20f);
+	// data.emplace_back(5, 5, 0.333f);
+	// data.emplace_back(5, 9, 0.25f);
+	// data.emplace_back(6, 0, 0.20f);
+	// data.emplace_back(6, 4, 0.20f);
+	// data.emplace_back(6, 6, 0.25f);
+	// data.emplace_back(6, 9, 0.25f);
+	// data.emplace_back(7, 3, 0.20f);
+	// data.emplace_back(7, 4, 0.20f);
+	// data.emplace_back(7, 7, 0.20f);
+	// data.emplace_back(7, 8, 0.20f);
+	// data.emplace_back(7, 10, 0.20f);
+	// data.emplace_back(8, 3, 0.20f);
+	// data.emplace_back(8, 7, 0.20f);
+	// data.emplace_back(8, 8, 0.20f);
+	// data.emplace_back(8, 10, 0.20f);
+	// data.emplace_back(8, 11, 0.333f);
+	// data.emplace_back(9, 0, 0.20f);
+	// data.emplace_back(9, 5, 0.333f);
+	// data.emplace_back(9, 6, 0.25f);
+	// data.emplace_back(9, 9, 0.25f);
+	// data.emplace_back(10, 3, 0.20f);
+	// data.emplace_back(10, 7, 0.20f);
+	// data.emplace_back(10, 8, 0.20f);
+	// data.emplace_back(10, 10, 0.20f);
+	// data.emplace_back(10, 11, 0.333f);
+	// data.emplace_back(11, 8, 0.20f);
+	// data.emplace_back(11, 10, 0.20f);
+	// data.emplace_back(11, 11, 0.333f);
+	// Eigen::SparseMatrix<float> tmp(12,12);
+	// tmp.setFromTriplets(data.begin(), data.end());
+	// auto cr = markov_process(&tmp);
+	// cout << "Found "<<cr.size()<<" clusters"<<endl;
+	// for(auto& set: cr){
+	// 	cout<< "set :";
+	// 	for(auto item : set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
+	// return;
+
+	const size_t seq_count = db->ref_header.sequences;
+	statistics.reset();
+	config.command = Config::blastp;
+	config.no_self_hits = true;
+	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore", "pident" };
+	config.query_cover = 80;
+	config.subject_cover = 80;
+	config.algo = 0;
+	config.index_mode = 0;
+	config.freq_sd = 0;
+	config.max_alignments = numeric_limits<uint64_t>::max();
+
+	Workflow::Search::Options opt;
+	opt.db = &(*db);
+	opt.self = true;
+	SparseMatrixStream<float> ms(db->ref_header.sequences);
+	opt.consumer = &ms;
+	opt.db_filter = nullptr;
+	Workflow::Search::run(opt);
+
+	auto componentInfo = ms.getComponents();
+	vector<vector<uint32_t>> indices = get<0>(componentInfo);
+	vector<Eigen::SparseMatrix<float>> components = get<1>(componentInfo);
+	uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 0;});
+	uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 1;});
+
+	cout << "DIAMOND done" << endl;
+	cout << "************" << endl;
+	cout << "Found " << nComponentsLt1 <<" ("<<nComponents<< " incl. singletons) disconnected components" << endl;
+
+	vector<unordered_set<uint32_t>> clustering_result;
+	float max_sparsity = 0.0f;
+	float min_sparsity = 1.0f;
+	bool full = false;
+	if( full ){
+		// TODO: According to the SIAM publication this is not valid, just for debugging
+		Eigen::SparseMatrix<float> m = ms.getMatrix();
+		for(unordered_set<uint32_t> subset : markov_process(&m)){
+			clustering_result.emplace_back(subset);
+		}
+	}
+	else {
+		// TODO: check if using dense matrices for non-sparse components improves performance
+		for(uint32_t iComponent = 0; iComponent<indices.size(); iComponent++){
+			vector<uint32_t> order = indices[iComponent];
+			if(order.size() > 1){
+				Eigen::SparseMatrix<float> m = components[iComponent];
+				float sparsity = 1.0-(1.0 * m.nonZeros()) / (m.rows()*m.cols());
+				max_sparsity = max(max_sparsity, sparsity);
+				min_sparsity = min(min_sparsity, sparsity);
+				// map back to original ids
+				for(unordered_set<uint32_t> subset : markov_process(&m)){
+					unordered_set<uint32_t> s;
+					for(uint32_t el : subset){
+						s.emplace(order[el]);
+					}
+					clustering_result.emplace_back(s);
+				}
+			}
+			else if (order.size() == 1){
+				clustering_result.emplace_back(order.begin(), order.end());
+			}
+		}
+	}
+	uint32_t nClusters = clustering_result.size();
+	uint32_t nClustersLt1 = count_if(clustering_result.begin(), clustering_result.end(), [](unordered_set<uint32_t> v){ return v.size() > 1;});
+	cout << "Found "<<nClustersLt1<< " ("<< nClusters<<" incl. singletons) clusters with min sparsity "<<min_sparsity<< " and max. sparsity " << max_sparsity << endl;
+	// for(auto& set: clustering_result){
+	// 	cout<< "set :";
+	// 	for(auto item : set){
+	// 		cout<< " " << item;
+	// 	}
+	// 	cout<<endl;
+	// }
+	// 
+	db->close();
+}
+}}
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
new file mode 100644
index 00000000..4e5c1226
--- /dev/null
+++ b/src/cluster/mcl.h
@@ -0,0 +1,129 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#pragma once
+#include <Eigen/Sparse>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <stdio.h>
+#include <memory>
+#include <fstream>
+#include <limits>
+#include "../util/system/system.h"
+#include "../util/util.h"
+#include "../basic/config.h"
+#include "../data/reference.h"
+#include "../run/workflow.h"
+#include "disjoint_set.h"
+#include "../util/io/consumer.h"
+#include "../util/algo/algo.h"
+#include "../basic/statistics.h"
+#include "../util/log_stream.h"
+#include "../dp/dp.h"
+#include "cluster.h"
+
+using namespace std;
+
+namespace Workflow { namespace Cluster{
+class MCL: public ClusteringAlgorithm {
+private: 
+	Eigen::SparseMatrix<float> get_gamma(Eigen::SparseMatrix<float>* m, float r);
+	vector<unordered_set<uint32_t>> get_list(Eigen::SparseMatrix<float>* m);
+	vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m);
+	void run_clustering();
+public:
+	void run(){
+		run_clustering();
+	}
+	static string get_key(){
+		return "mcl";
+	}
+};
+
+template <typename T>
+class SparseMatrixStream : public Consumer {
+	size_t n;
+	vector<Eigen::Triplet<T>> data;
+	LazyDisjointSet<size_t>* disjointSet;
+	virtual void consume(const char *ptr, size_t n) override {
+		size_t query, subject, count;
+		float qcov, scov, bitscore, id;
+		const char *end = ptr + n;
+		while (ptr < end) {
+			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
+			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) 
+				throw runtime_error("Cluster format error.");
+			data.emplace_back(query, subject, (qcov/100.0f) * (scov/100.0f) * (id/100.0f));
+			disjointSet->merge(query, subject);
+			ptr += count;
+		}
+	}
+public:
+	SparseMatrixStream(size_t n){
+		this->n = n;
+		disjointSet = new LazyDisjointIntegralSet<size_t>(n); 
+		// Note: this makes sure the self hits are always included, the value needs to be aligned with the measure used in consume!
+		for(uint32_t idiag=0; idiag<n; idiag++){
+			data.emplace_back(idiag, idiag, 1.0);
+		}
+	}
+	~SparseMatrixStream(){
+		delete disjointSet;
+	}
+	pair<vector<vector<uint32_t>>, vector<Eigen::SparseMatrix<T>>> getComponents(){
+		vector<unordered_set<size_t>> sets = disjointSet->getListOfSets();
+		vector<vector<Eigen::Triplet<T>>> split(sets.size());
+		unordered_map<size_t, uint32_t> indexToSetId;
+		for(uint32_t iset = 0; iset < sets.size(); iset++){
+			split.push_back(vector<Eigen::Triplet<T>>());
+			for(size_t index : sets[iset]){
+				indexToSetId.emplace(index, iset);
+			}
+		}
+		for(Eigen::Triplet<T> d : data){
+			assert(indexToSetId[d.row()] == indexToSetId[d.row()]);
+			split[indexToSetId[d.row()]].emplace_back(d.row(), d.col(), d.value());
+		}
+		vector<vector<uint32_t>> indices(sets.size());
+		vector<Eigen::SparseMatrix<T>> components(sets.size());
+		for(uint32_t iset = 0; iset < sets.size(); iset++){
+			Eigen::SparseMatrix<T> component(sets[iset].size(), sets[iset].size());
+			vector<uint32_t> order(sets[iset].begin(), sets[iset].end());
+			map<uint32_t, uint32_t> index_map;
+			uint32_t iel = 0;
+			for(uint32_t el: order){
+				index_map.emplace(el, iel++);
+			}
+			vector<Eigen::Triplet<T>> remapped(split[iset].size());
+			for(Eigen::Triplet<T> t : split[iset]){
+				remapped.emplace_back(index_map[t.row()], index_map[t.col()], t.value());
+			}
+			component.setFromTriplets(remapped.begin(), remapped.end());
+			components.emplace_back(move(component));
+			indices.emplace_back(move(order));
+		}
+		return make_pair(move(indices), move(components));
+	}
+	Eigen::SparseMatrix<T> getMatrix(){
+		Eigen::SparseMatrix<T> m(n,n);
+		m.setFromTriplets(data.begin(), data.end());
+		return m;
+	}
+};
+}}
diff --git a/src/cluster/multi_step_cluster.h b/src/cluster/multi_step_cluster.h
new file mode 100644
index 00000000..b4cc2258
--- /dev/null
+++ b/src/cluster/multi_step_cluster.h
@@ -0,0 +1,78 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#pragma once
+#include <algorithm>
+#include <stdexcept>
+#include <stdio.h>
+#include <memory>
+#include <fstream>
+#include <limits>
+#include "../util/system/system.h"
+#include "../util/util.h"
+#include "../basic/config.h"
+#include "../data/reference.h"
+#include "../run/workflow.h"
+#include "../util/io/consumer.h"
+#include "../util/algo/algo.h"
+#include "../basic/statistics.h"
+#include "../util/log_stream.h"
+#include "../dp/dp.h"
+#include "../basic/masking.h"
+#include "cluster.h"
+
+using namespace std;
+
+namespace Workflow { namespace Cluster{ 
+
+class MultiStep : public ClusteringAlgorithm {
+private:
+	vector<bool> rep_bitset(const vector<int> &centroid, const vector<bool> *superset = nullptr);
+	vector<int> cluster(DatabaseFile &db, const vector<bool> *filter);
+	void run_clustering();
+public:
+	void run(){
+		run_clustering();
+	}
+	static string get_key(){
+		return "multi-step";
+	}
+};
+
+struct Neighbors : public vector<vector<int>>, public Consumer {
+	Neighbors(size_t n):
+		vector<vector<int>>(n)
+	{}
+	virtual void consume(const char *ptr, size_t n) override {
+		int query, subject, count;
+		float qcov, scov, bitscore;
+		const char *end = ptr + n;
+		while (ptr < end) {
+			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
+			if (sscanf(ptr, "%i\t%i\t%f\t%f\t%f\n%n", &query, &subject, &qcov, &scov, &bitscore, &count) != 5)
+				throw runtime_error("Cluster format error.");
+			ptr += count;
+			//cout << query << '\t' << subject << '\t' << qcov << '\t' << scov << '\t' << endl;
+			(*this)[query].push_back(subject);
+			edges.push_back({ query, subject, (int)bitscore });
+		}
+	}
+	vector<Util::Algo::Edge> edges;
+};
+
+}}

From d49c9a4f0318f4c7bb22df311a89c33f1b544249 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sat, 25 Apr 2020 11:01:39 +0200
Subject: [PATCH 08/21] adding headers and descriptions

---
 src/cluster/cluster.h                 | 18 ++++++++++++++++
 src/cluster/cluster_registry.cpp      | 18 ++++++++++++++++
 src/cluster/cluster_registry.h        | 20 ++++++++++++++++++
 src/cluster/clustering_algorithms.cpp | 30 ---------------------------
 src/cluster/mcl.cpp                   | 18 ++++++++++++++++
 5 files changed, 74 insertions(+), 30 deletions(-)
 delete mode 100644 src/cluster/clustering_algorithms.cpp

diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h
index 2fdff00a..fcbbb875 100644
--- a/src/cluster/cluster.h
+++ b/src/cluster/cluster.h
@@ -1,3 +1,21 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
 #pragma once
 #include <string>
 class ClusteringAlgorithm {
diff --git a/src/cluster/cluster_registry.cpp b/src/cluster/cluster_registry.cpp
index ddf883da..b3345f5c 100644
--- a/src/cluster/cluster_registry.cpp
+++ b/src/cluster/cluster_registry.cpp
@@ -1,3 +1,21 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
 #include "cluster_registry.h"
 
 using namespace std;
diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h
index 53653931..7f89d20f 100644
--- a/src/cluster/cluster_registry.h
+++ b/src/cluster/cluster_registry.h
@@ -1,3 +1,21 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
 #pragma once
 #include "cluster.h"
 #include "mcl.h"
@@ -9,6 +27,7 @@ namespace Workflow { namespace Cluster{
 class ClusterRegistry{
 private:
 	ClusterRegistry(){};
+	// To include new clustering algorithms add the instantiation here and in the cluster_registry.cpp file. Then add it to the StaticConstructor below
 	static MCL mcl;
 	static MultiStep multiStep;
 public:
@@ -34,6 +53,7 @@ class ClusterRegistry{
 	}
 	static struct StaticConstructor {
 		StaticConstructor() {
+			// Add any new clustering algorithm here
 			regMap.emplace(multiStep.get_key(), &multiStep);
 			regMap.emplace(mcl.get_key(), &mcl);
 		}
diff --git a/src/cluster/clustering_algorithms.cpp b/src/cluster/clustering_algorithms.cpp
deleted file mode 100644
index ad423373..00000000
--- a/src/cluster/clustering_algorithms.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/****
-DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-****/
-
-#include "cluster.h"
-#include "cluster.h"
-
-using namespace std;
-
-namespace Workflow { namespace Cluster{ 
-	static map<string, Cluster> clustering_algorithms;
-	static{
-		clustering_algorithms.add(C
-	}
-
-}}
diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index b2cc8eee..d47a0d44 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -1,3 +1,21 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
 #include "mcl.h"
 
 using namespace std;

From c31e615c536511c8b2762fd6672e01a796fb66e3 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Mon, 4 May 2020 19:51:43 +0200
Subject: [PATCH 09/21] several improvements

- updated the cluster registry to also print a bit of info for the
  available clustering algos
- implemented a dense matrix mcl clustering and an automatic switch at
  80% sparsity
- fixed a (previously silent) bug related to putting entries into the
  matrices
---
 src/basic/config.cpp               |   2 +-
 src/cluster/cluster.h              |   3 +-
 src/cluster/cluster_registry.h     |   2 +-
 src/cluster/mcl.cpp                | 196 +++++++++++++++++++++++------
 src/cluster/mcl.h                  |  53 ++++----
 src/cluster/multi_step_cluster.cpp |   9 +-
 src/cluster/multi_step_cluster.h   |  10 +-
 7 files changed, 197 insertions(+), 78 deletions(-)

diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index fb43885c..ac4b235e 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -475,7 +475,7 @@ Config::Config(int argc, const char **argv, bool check_io)
 			header_out << "Unkown clustering algorithm: " << cluster_algo << endl;
 			header_out << "Available options are: " << endl;
 			for(string c_algo : Workflow::Cluster::ClusterRegistry::getKeys()){
-				header_out << "\t" << c_algo << endl;
+				header_out << "\t" << c_algo << "\t"<< Workflow::Cluster::ClusterRegistry::get(c_algo)->get_description() << endl;
 			}
 			throw std::runtime_error("Clustering algorithm not found.");
 		}
diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h
index fcbbb875..864cd79b 100644
--- a/src/cluster/cluster.h
+++ b/src/cluster/cluster.h
@@ -21,5 +21,6 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 class ClusteringAlgorithm {
 public:
 	virtual void run() = 0;
-	static std::string get_key();
+	virtual std::string get_key() = 0;
+	virtual std::string get_description() = 0;
 };
diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h
index 7f89d20f..d6c872da 100644
--- a/src/cluster/cluster_registry.h
+++ b/src/cluster/cluster_registry.h
@@ -33,7 +33,7 @@ class ClusterRegistry{
 public:
 	static map<string, ClusteringAlgorithm*> regMap;
 	static ClusteringAlgorithm* get(string key){
-		auto ca = ClusterRegistry::regMap.find(key);
+		map<string, ClusteringAlgorithm*>::iterator ca = ClusterRegistry::regMap.find(key);
 		if(ca == ClusterRegistry::regMap.end()){
 			throw std::runtime_error("Clustering algorithm not found.");
 		}
diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index d47a0d44..e25bf696 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -15,27 +15,76 @@ GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
-
 #include "mcl.h"
 
 using namespace std;
 
 namespace Workflow { namespace Cluster{
+string MCL::get_key() {
+	return "mcl";
+}
+string MCL::get_description() {
+	return "Markov clustering according to doi:10.1137/040608635";
+}
+
+void MCL::get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r){
+	if( r - (int) r == 0 ){
+		// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
+		*out = *in;
+		for(uint32_t i=1; i<r; i++){ 
+			(*out) = (*out)*(*in);
+		}
+	}
+	else{ 
+		throw runtime_error(" Eigen does not provide an eigenvalue solver for sparse matrices");
+	}
+	*out = out->pruned();
+}
+
+void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
+	if( r - (int) r == 0 ){
+		// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
+		*out = *in;
+		for(uint32_t i=1; i<r; i++){ 
+			(*out) *= (*in);
+		}
+	}
+	else{ 
+		Eigen::EigenSolver<Eigen::MatrixXf> solver(*in);
+		Eigen::MatrixXcf d = solver.eigenvalues().asDiagonal();
+		for(uint32_t idiag = 0; idiag<d.rows(); idiag++){
+			d(idiag, idiag) = pow(d(idiag, idiag), r);
+		}
+		Eigen::MatrixXcf V = solver.eigenvectors();
+		(*out) = (V * d * V.transpose()).real();
+	}
+}
 
-Eigen::SparseMatrix<float> MCL::get_gamma(Eigen::SparseMatrix<float>* m, float r){
+void MCL::get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r){
 	vector<Eigen::Triplet<float>> data;
-	for (uint32_t k=0; k<m->outerSize(); ++k){
+	for (uint32_t k=0; k<in->outerSize(); ++k){
 		float colSum = 0.0f;
-		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+		for (Eigen::SparseMatrix<float>::InnerIterator it(*in, k); it; ++it){
 			colSum += pow(it.value(),r);
 		}
-		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
+		for (Eigen::SparseMatrix<float>::InnerIterator it(*in, k); it; ++it){
 			data.emplace_back(it.row(), it.col(), pow(it.value(),r) / colSum);
 		}
 	}
-	Eigen::SparseMatrix<float> gamma(m->rows(), m->cols());
-	gamma.setFromTriplets(data.begin(), data.end());
-	return gamma.pruned(1.0, std::numeric_limits<float>::epsilon());
+	out->setFromTriplets(data.begin(), data.end(), [] (const float&, const float &b) { return b; });
+	*out = out->pruned(1.0, std::numeric_limits<float>::epsilon());
+}
+
+void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
+	for (uint32_t icol=0; icol<in->cols(); ++icol){
+		float colSum = 0.0f;
+		for (uint32_t irow=0; irow<in->rows(); ++irow){
+			colSum += pow((*in)(irow, icol) , r);
+		}
+		for (uint32_t irow=0; irow<in->rows(); ++irow){
+			(*out)(irow, icol) =  pow((*in)(irow, icol) , r)/colSum;
+		}
+	}
 }
 
 vector<unordered_set<uint32_t>> MCL::get_list(Eigen::SparseMatrix<float>* m){
@@ -48,21 +97,53 @@ vector<unordered_set<uint32_t>> MCL::get_list(Eigen::SparseMatrix<float>* m){
 	return disjointSet.getListOfSets();
 }
 
+vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
+	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
+	for (uint32_t icol=0; icol<m->cols(); ++icol){
+		for (uint32_t irow=0; irow<m->rows(); ++irow){
+			if( abs((*m)(irow, icol)) > 1e-7 ){
+				disjointSet.merge(irow, icol);
+			}
+		}
+	}
+	return disjointSet.getListOfSets();
+}
+
 vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>* m){
 	uint32_t iteration = 0;
 	double diff_norm = std::numeric_limits<double>::max();
-	*m  = get_gamma(m,1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
+	Eigen::SparseMatrix<float> msquared(m->rows(), m->cols());
+	Eigen::SparseMatrix<float> m_update(m->rows(), m->cols());
+	get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
+	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
+		get_exp(m, &msquared, 2);
+		get_gamma(&msquared, &m_update, 2);
+		*m -= m_update;
+		diff_norm = m->norm();
+		*m = m_update;
+		iteration++;
+	}
+	return get_list(m);
+}
+
+vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::MatrixXf* m){
+	uint32_t iteration = 0;
+	double diff_norm = std::numeric_limits<double>::max();
+	Eigen::MatrixXf msquared(m->rows(), m->cols());
+	Eigen::MatrixXf m_update(m->rows(), m->cols());
+	get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
 	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
-		Eigen::SparseMatrix<float> msquared = ((*m) * (*m)).pruned(1.0, std::numeric_limits<float>::epsilon());
-		Eigen::SparseMatrix<float> m_update = get_gamma(&msquared, 2);
-		diff_norm = (*m - m_update).norm();
-		*m = m_update.pruned(1.0, std::numeric_limits<float>::epsilon());
+		get_exp(m, &msquared, 2);
+		get_gamma(&msquared, &m_update, 2);
+		*m -= m_update;
+		diff_norm = m->norm();
+		*m = m_update;
 		iteration++;
 	}
 	return get_list(m);
 }
 
-void MCL::run_clustering() {
+void MCL::run(){
 	if (config.database == "")
 		throw runtime_error("Missing parameter: database file (--db/-d)");
 	config.command = Config::makedb;
@@ -165,7 +246,7 @@ void MCL::run_clustering() {
 	const size_t seq_count = db->ref_header.sequences;
 	statistics.reset();
 	config.command = Config::blastp;
-	config.no_self_hits = true;
+	config.no_self_hits = false;
 	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore", "pident" };
 	config.query_cover = 80;
 	config.subject_cover = 80;
@@ -182,25 +263,35 @@ void MCL::run_clustering() {
 	opt.db_filter = nullptr;
 	Workflow::Search::run(opt);
 
+	task_timer timer;
+	timer.go("Computing independent components");
 	auto componentInfo = ms.getComponents();
 	vector<vector<uint32_t>> indices = get<0>(componentInfo);
-	vector<Eigen::SparseMatrix<float>> components = get<1>(componentInfo);
+	vector<vector<Eigen::Triplet<float>>> components = get<1>(componentInfo);
 	uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 0;});
 	uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 1;});
-
-	cout << "DIAMOND done" << endl;
-	cout << "************" << endl;
+	timer.finish();
 	cout << "Found " << nComponentsLt1 <<" ("<<nComponents<< " incl. singletons) disconnected components" << endl;
 
-	vector<unordered_set<uint32_t>> clustering_result;
+	timer.go("Clustering components");
+	bool full = false;
+	vector<uint32_t> clustering_result(db->ref_header.sequences, std::numeric_limits<uint32_t>::max());
 	float max_sparsity = 0.0f;
 	float min_sparsity = 1.0f;
-	bool full = false;
+	uint32_t cluster_id = 0;
+	uint32_t nClustersEq1 = 0;
+	uint32_t n_dense_calculations = 0;
+	uint32_t n_sparse_calculations = 0;
+
 	if( full ){
 		// TODO: According to the SIAM publication this is not valid, just for debugging
 		Eigen::SparseMatrix<float> m = ms.getMatrix();
 		for(unordered_set<uint32_t> subset : markov_process(&m)){
-			clustering_result.emplace_back(subset);
+			for(uint32_t el : subset){
+				clustering_result[el] = cluster_id;
+			}
+			cluster_id++;
+			if(subset.size() == 1) nClustersEq1 ++;
 		}
 	}
 	else {
@@ -208,35 +299,60 @@ void MCL::run_clustering() {
 		for(uint32_t iComponent = 0; iComponent<indices.size(); iComponent++){
 			vector<uint32_t> order = indices[iComponent];
 			if(order.size() > 1){
-				Eigen::SparseMatrix<float> m = components[iComponent];
-				float sparsity = 1.0-(1.0 * m.nonZeros()) / (m.rows()*m.cols());
+				vector<Eigen::Triplet<float>> m = components[iComponent];
+				assert(m.size() <= order.size() * order.size());
+				float sparsity = 1.0-(1.0 * m.size()) / (order.size()*order.size());
 				max_sparsity = max(max_sparsity, sparsity);
 				min_sparsity = min(min_sparsity, sparsity);
 				// map back to original ids
-				for(unordered_set<uint32_t> subset : markov_process(&m)){
-					unordered_set<uint32_t> s;
+				vector<unordered_set<uint32_t>> list_of_sets;
+				if(sparsity >= 0.8){ //TODO: a size limit should be given as well
+					n_sparse_calculations++;
+					Eigen::SparseMatrix<float> m_sparse(order.size(), order.size());
+					m_sparse.setFromTriplets(m.begin(), m.end());
+					list_of_sets = markov_process(&m_sparse);
+				}
+				else{
+					n_dense_calculations++;
+					Eigen::MatrixXf m_dense(order.size(), order.size());
+					for(Eigen::Triplet<float> const & t : m){
+						m_dense(t.row(), t.col()) = t.value();
+					}
+					list_of_sets = markov_process(&m_dense);
+				}
+				for(unordered_set<uint32_t> subset : list_of_sets){
 					for(uint32_t el : subset){
-						s.emplace(order[el]);
+						clustering_result[order[el]] = cluster_id;
 					}
-					clustering_result.emplace_back(s);
+					cluster_id++;
+					if(subset.size() == 1) nClustersEq1 ++;
 				}
 			}
 			else if (order.size() == 1){
-				clustering_result.emplace_back(order.begin(), order.end());
+				clustering_result[order[0]] = cluster_id++;
+				nClustersEq1++;
 			}
 		}
 	}
-	uint32_t nClusters = clustering_result.size();
-	uint32_t nClustersLt1 = count_if(clustering_result.begin(), clustering_result.end(), [](unordered_set<uint32_t> v){ return v.size() > 1;});
-	cout << "Found "<<nClustersLt1<< " ("<< nClusters<<" incl. singletons) clusters with min sparsity "<<min_sparsity<< " and max. sparsity " << max_sparsity << endl;
-	// for(auto& set: clustering_result){
-	// 	cout<< "set :";
-	// 	for(auto item : set){
-	// 		cout<< " " << item;
-	// 	}
-	// 	cout<<endl;
-	// }
-	// 
+	timer.finish();
+	cout << "Found "<<cluster_id - nClustersEq1<< " ("<< cluster_id <<" incl. singletons) clusters with min sparsity "<<min_sparsity<< " and max. sparsity " << max_sparsity << " with " << n_dense_calculations << " dense, and " << n_sparse_calculations << " sparse calculations " << endl;
+
+	timer.go("Cluster output");
+	ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str());
+	vector<Letter> seq;
+	string id;
+	db->seek_direct();
+	Hsp hsp;
+	size_t n;
+	out->precision(3);
+	for (int i = 0; i < (int)db->ref_header.sequences; ++i) {
+		db->read_seq(id, seq);
+		(*out) << blast_id(id) << '\t' << clustering_result[i] << endl;		
+		id.clear();
+		seq.clear();
+	}
+	if (out != &cout) delete out;
 	db->close();
+	timer.finish();
 }
 }}
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
index 4e5c1226..511433d6 100644
--- a/src/cluster/mcl.h
+++ b/src/cluster/mcl.h
@@ -18,6 +18,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 #pragma once
 #include <Eigen/Sparse>
+#include <Eigen/Eigenvalues>
 #include <algorithm>
 #include <stdexcept>
 #include <string>
@@ -43,17 +44,18 @@ using namespace std;
 namespace Workflow { namespace Cluster{
 class MCL: public ClusteringAlgorithm {
 private: 
-	Eigen::SparseMatrix<float> get_gamma(Eigen::SparseMatrix<float>* m, float r);
+	void get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r);
+	void get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r);
+	void get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r);
+	void get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r);
 	vector<unordered_set<uint32_t>> get_list(Eigen::SparseMatrix<float>* m);
+	vector<unordered_set<uint32_t>> get_list(Eigen::MatrixXf* m);
 	vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m);
-	void run_clustering();
+	vector<unordered_set<uint32_t>> markov_process(Eigen::MatrixXf* m);
 public:
-	void run(){
-		run_clustering();
-	}
-	static string get_key(){
-		return "mcl";
-	}
+	void run();
+	string get_key();
+	string get_description();
 };
 
 template <typename T>
@@ -66,10 +68,10 @@ class SparseMatrixStream : public Consumer {
 		float qcov, scov, bitscore, id;
 		const char *end = ptr + n;
 		while (ptr < end) {
-			//if (sscanf(ptr, "%i\t%i\n%n", &query, &subject, &count) != 2)
 			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) 
 				throw runtime_error("Cluster format error.");
-			data.emplace_back(query, subject, (qcov/100.0f) * (scov/100.0f) * (id/100.0f));
+			const float value = (qcov/100.0f) * (scov/100.0f) * (id/100.0f);
+			data.emplace_back(query, subject, value);
 			disjointSet->merge(query, subject);
 			ptr += count;
 		}
@@ -78,44 +80,41 @@ class SparseMatrixStream : public Consumer {
 	SparseMatrixStream(size_t n){
 		this->n = n;
 		disjointSet = new LazyDisjointIntegralSet<size_t>(n); 
-		// Note: this makes sure the self hits are always included, the value needs to be aligned with the measure used in consume!
-		for(uint32_t idiag=0; idiag<n; idiag++){
-			data.emplace_back(idiag, idiag, 1.0);
-		}
 	}
+
 	~SparseMatrixStream(){
 		delete disjointSet;
 	}
-	pair<vector<vector<uint32_t>>, vector<Eigen::SparseMatrix<T>>> getComponents(){
+	pair<vector<vector<uint32_t>>, vector<vector<Eigen::Triplet<T>>>> getComponents(){
 		vector<unordered_set<size_t>> sets = disjointSet->getListOfSets();
 		vector<vector<Eigen::Triplet<T>>> split(sets.size());
-		unordered_map<size_t, uint32_t> indexToSetId;
+		vector<uint32_t> indexToSetId(this->n, sets.size());
 		for(uint32_t iset = 0; iset < sets.size(); iset++){
 			split.push_back(vector<Eigen::Triplet<T>>());
 			for(size_t index : sets[iset]){
-				indexToSetId.emplace(index, iset);
+				indexToSetId[index] = iset;
 			}
 		}
-		for(Eigen::Triplet<T> d : data){
-			assert(indexToSetId[d.row()] == indexToSetId[d.row()]);
-			split[indexToSetId[d.row()]].emplace_back(d.row(), d.col(), d.value());
+		for(Eigen::Triplet<T> t : data){
+			uint32_t iset = indexToSetId[t.row()];
+			assert( iset == indexToSetId[t.col()]);
+			split[iset].emplace_back(t.row(), t.col(), t.value());
 		}
 		vector<vector<uint32_t>> indices(sets.size());
-		vector<Eigen::SparseMatrix<T>> components(sets.size());
+		vector<vector<Eigen::Triplet<T>>> components(sets.size());
 		for(uint32_t iset = 0; iset < sets.size(); iset++){
-			Eigen::SparseMatrix<T> component(sets[iset].size(), sets[iset].size());
 			vector<uint32_t> order(sets[iset].begin(), sets[iset].end());
 			map<uint32_t, uint32_t> index_map;
 			uint32_t iel = 0;
-			for(uint32_t el: order){
+			for(uint32_t const & el: order){
 				index_map.emplace(el, iel++);
 			}
-			vector<Eigen::Triplet<T>> remapped(split[iset].size());
-			for(Eigen::Triplet<T> t : split[iset]){
+			vector<Eigen::Triplet<T>> remapped;
+			for(Eigen::Triplet<T> const & t : split[iset]){
 				remapped.emplace_back(index_map[t.row()], index_map[t.col()], t.value());
 			}
-			component.setFromTriplets(remapped.begin(), remapped.end());
-			components.emplace_back(move(component));
+			remapped.shrink_to_fit();
+			components.emplace_back(move(remapped));
 			indices.emplace_back(move(order));
 		}
 		return make_pair(move(indices), move(components));
diff --git a/src/cluster/multi_step_cluster.cpp b/src/cluster/multi_step_cluster.cpp
index 85f90abf..95e6ad43 100644
--- a/src/cluster/multi_step_cluster.cpp
+++ b/src/cluster/multi_step_cluster.cpp
@@ -21,6 +21,13 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 using namespace std;
 
 namespace Workflow { namespace Cluster{ 
+string MultiStep::get_key() {
+	return "multi-step";
+}
+string MultiStep::get_description(){
+	return "A greedy stepwise vortex cover algorithm";
+}
+
 vector<bool> MultiStep::rep_bitset(const vector<int> &centroid, const vector<bool> *superset) {
 	vector<bool> r(centroid.size());
 	for (int c : centroid)
@@ -54,7 +61,7 @@ vector<int> MultiStep::cluster(DatabaseFile &db, const vector<bool> *filter) {
 	return Util::Algo::greedy_vortex_cover(nb);
 }
 
-void MultiStep::run_clustering() {
+void MultiStep::run() {
 	if (config.database == "")
 		throw runtime_error("Missing parameter: database file (--db/-d)");
 	config.command = Config::makedb;
diff --git a/src/cluster/multi_step_cluster.h b/src/cluster/multi_step_cluster.h
index b4cc2258..5f5bae69 100644
--- a/src/cluster/multi_step_cluster.h
+++ b/src/cluster/multi_step_cluster.h
@@ -44,14 +44,10 @@ class MultiStep : public ClusteringAlgorithm {
 private:
 	vector<bool> rep_bitset(const vector<int> &centroid, const vector<bool> *superset = nullptr);
 	vector<int> cluster(DatabaseFile &db, const vector<bool> *filter);
-	void run_clustering();
 public:
-	void run(){
-		run_clustering();
-	}
-	static string get_key(){
-		return "multi-step";
-	}
+	void run();
+	string get_key();
+	string get_description();
 };
 
 struct Neighbors : public vector<vector<int>>, public Consumer {

From 33217d6bfdeef09212bfbd888b9e14b0eefa92fe Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Tue, 5 May 2020 11:05:07 +0200
Subject: [PATCH 10/21] added inflation and expansion parameters

it is now possible to control the inflation and expansion parameters
from the command line
---
 src/basic/config.cpp |   5 +-
 src/basic/config.h   |   3 ++
 src/cluster/mcl.cpp  | 110 +++++++++++++++++++++++++++++++++----------
 src/cluster/mcl.h    |   4 +-
 4 files changed, 95 insertions(+), 27 deletions(-)

diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index ac4b235e..e350f9d2 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -147,7 +147,10 @@ Config::Config(int argc, const char **argv, bool check_io)
 
 	Options_group cluster("Cluster options");
 	cluster.add()
-		("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo);
+		("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo)
+		("mcl-expansion", 0, "MCL expansion coefficient (default=2.0)", cluster_mcl_expansion, 2.0)
+		("mcl-inflation", 0, "MCL inflation coefficient (default=2.0)", cluster_mcl_inflation, 2.0)
+		("mcl-sparsity-switch", 0, "MCL switch to sparse matrix computation (default=0.8) ", cluster_mcl_sparsity_switch, 0.8);
 
 	Options_group aligner("Aligner options");
 	aligner.add()
diff --git a/src/basic/config.h b/src/basic/config.h
index dfc631b4..647bcecf 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -223,6 +223,9 @@ struct Config
 	int algo;
 
 	string cluster_algo;
+	double cluster_mcl_inflation;
+	double cluster_mcl_expansion;
+	double cluster_mcl_sparsity_switch;
 
 	enum { query_parallel = 0, target_parallel = 1 };
 	unsigned load_balancing;
diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index e25bf696..4a567f9e 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -42,21 +42,22 @@ void MCL::get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* ou
 }
 
 void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
-	if( r - (int) r == 0 ){
-		// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
+	// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
+	if( r - (int) r == 0){
 		*out = *in;
 		for(uint32_t i=1; i<r; i++){ 
-			(*out) *= (*in);
+			*out *= *in;
 		}
 	}
 	else{ 
+		// TODO: check whether the matrix is self-adjoint and use SelfAdjointEigenSolver instead
 		Eigen::EigenSolver<Eigen::MatrixXf> solver(*in);
 		Eigen::MatrixXcf d = solver.eigenvalues().asDiagonal();
 		for(uint32_t idiag = 0; idiag<d.rows(); idiag++){
 			d(idiag, idiag) = pow(d(idiag, idiag), r);
 		}
 		Eigen::MatrixXcf V = solver.eigenvectors();
-		(*out) = (V * d * V.transpose()).real();
+		out->noalias() = (V * d.real() * V.inverse()).real();
 	}
 }
 
@@ -76,13 +77,14 @@ void MCL::get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>*
 }
 
 void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
+	// Note that Eigen matrices are column-major, so this is the most efficient way
 	for (uint32_t icol=0; icol<in->cols(); ++icol){
 		float colSum = 0.0f;
 		for (uint32_t irow=0; irow<in->rows(); ++irow){
-			colSum += pow((*in)(irow, icol) , r);
+			colSum += pow(in->coeffRef(irow, icol) , r);
 		}
 		for (uint32_t irow=0; irow<in->rows(); ++irow){
-			(*out)(irow, icol) =  pow((*in)(irow, icol) , r)/colSum;
+			out->coeffRef(irow, icol) =  pow(in->coeffRef(irow, icol) , r) / colSum;
 		}
 	}
 }
@@ -101,7 +103,7 @@ vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
 	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
 	for (uint32_t icol=0; icol<m->cols(); ++icol){
 		for (uint32_t irow=0; irow<m->rows(); ++irow){
-			if( abs((*m)(irow, icol)) > 1e-7 ){
+			if( abs((*m)(irow, icol)) > std::numeric_limits<float>::epsilon()){
 				disjointSet.merge(irow, icol);
 			}
 		}
@@ -109,15 +111,51 @@ vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
 	return disjointSet.getListOfSets();
 }
 
-vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>* m){
+// void printMatrix(Eigen::SparseMatrix<float>* m){
+// 	printf("[");
+// 	for (uint32_t irow=0; irow<m->rows(); ++irow){
+// 		printf("[");
+// 		for (uint32_t icol=0; icol<m->cols(); ++icol){
+// 			printf("%12.7f",m->coeffRef(irow, icol));
+// 		}
+// 		if(irow<m->rows() - 1){
+// 			printf("];\n");
+// 		}
+// 		else{
+// 			printf("]");
+// 		}
+// 	}
+// 	printf("]\n\n");
+// }
+// void printMatrix(Eigen::MatrixXf* m){
+// 	printf("[");
+// 	for (uint32_t irow=0; irow<m->rows(); ++irow){
+// 		printf("[");
+// 		for (uint32_t icol=0; icol<m->cols(); ++icol){
+// 			printf("%12.7f",m->coeffRef(irow, icol));
+// 		}
+// 		if(irow<m->rows() - 1){
+// 			printf("];\n");
+// 		}
+// 		else{
+// 			printf("]");
+// 		}
+// 	}
+// 	printf("]\n\n");
+// }
+
+vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion){
+	for (uint32_t idiag=0; idiag<m->cols(); ++idiag){
+		assert(abs(m->coeffRef(idiag, idiag)) > std::numeric_limits<float>::epsilon());
+	}
 	uint32_t iteration = 0;
-	double diff_norm = std::numeric_limits<double>::max();
+	float diff_norm = std::numeric_limits<float>::max();
 	Eigen::SparseMatrix<float> msquared(m->rows(), m->cols());
 	Eigen::SparseMatrix<float> m_update(m->rows(), m->cols());
 	get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
 	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
-		get_exp(m, &msquared, 2);
-		get_gamma(&msquared, &m_update, 2);
+		get_exp(m, &msquared, expansion);
+		get_gamma(&msquared, &m_update, inflation);
 		*m -= m_update;
 		diff_norm = m->norm();
 		*m = m_update;
@@ -126,15 +164,19 @@ vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>*
 	return get_list(m);
 }
 
-vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::MatrixXf* m){
+vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::MatrixXf* m, float inflation, float expansion){
+	for (uint32_t idiag=0; idiag<m->cols(); ++idiag){
+		assert(abs(m->coeffRef(idiag, idiag)) > std::numeric_limits<float>::epsilon());
+	}
 	uint32_t iteration = 0;
-	double diff_norm = std::numeric_limits<double>::max();
+	float diff_norm = std::numeric_limits<float>::max();
 	Eigen::MatrixXf msquared(m->rows(), m->cols());
 	Eigen::MatrixXf m_update(m->rows(), m->cols());
 	get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
 	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
-		get_exp(m, &msquared, 2);
-		get_gamma(&msquared, &m_update, 2);
+		get_exp(m, &msquared, expansion);
+		if(isnan(msquared.norm())) break; // converged if expansion is not integral
+		get_gamma(&msquared, &m_update, inflation);
 		*m -= m_update;
 		diff_norm = m->norm();
 		*m = m_update;
@@ -230,9 +272,25 @@ void MCL::run(){
 	// data.emplace_back(11, 8, 0.20f);
 	// data.emplace_back(11, 10, 0.20f);
 	// data.emplace_back(11, 11, 0.333f);
-	// Eigen::SparseMatrix<float> tmp(12,12);
-	// tmp.setFromTriplets(data.begin(), data.end());
-	// auto cr = markov_process(&tmp);
+	// vector<unordered_set<uint32_t>> cr;
+	// float inf = 2.0;
+	// float exp = 2.0;
+	// if(false){
+	// 	Eigen::SparseMatrix<float> tmp(12,12);
+	// 	tmp.setFromTriplets(data.begin(), data.end());
+	// 	printMatrix(&tmp);
+	// 	cr = markov_process(&tmp, inf, exp);
+	// 	printMatrix(&tmp);
+	// }
+	// else{
+	// 	Eigen::MatrixXf tmp = Eigen::MatrixXf::Zero(12,12);
+	// 	for(Eigen::Triplet<float> const & t : data){
+	// 		tmp(t.row(), t.col()) = t.value();
+	// 	}
+	// 	printMatrix(&tmp);
+	// 	cr = markov_process(&tmp, inf, exp);
+	// 	printMatrix(&tmp);
+	// }
 	// cout << "Found "<<cr.size()<<" clusters"<<endl;
 	// for(auto& set: cr){
 	// 	cout<< "set :";
@@ -282,11 +340,13 @@ void MCL::run(){
 	uint32_t nClustersEq1 = 0;
 	uint32_t n_dense_calculations = 0;
 	uint32_t n_sparse_calculations = 0;
+	float inflation = (float) config.cluster_mcl_inflation;
+	float expansion = (float) config.cluster_mcl_expansion;
 
 	if( full ){
 		// TODO: According to the SIAM publication this is not valid, just for debugging
 		Eigen::SparseMatrix<float> m = ms.getMatrix();
-		for(unordered_set<uint32_t> subset : markov_process(&m)){
+		for(unordered_set<uint32_t> subset : markov_process(&m, inflation, expansion)){
 			for(uint32_t el : subset){
 				clustering_result[el] = cluster_id;
 			}
@@ -295,7 +355,7 @@ void MCL::run(){
 		}
 	}
 	else {
-		// TODO: check if using dense matrices for non-sparse components improves performance
+		// TODO: check when/if using dense matrices for non-sparse components improves performance
 		for(uint32_t iComponent = 0; iComponent<indices.size(); iComponent++){
 			vector<uint32_t> order = indices[iComponent];
 			if(order.size() > 1){
@@ -306,19 +366,21 @@ void MCL::run(){
 				min_sparsity = min(min_sparsity, sparsity);
 				// map back to original ids
 				vector<unordered_set<uint32_t>> list_of_sets;
-				if(sparsity >= 0.8){ //TODO: a size limit should be given as well
+
+				//TODO: a size limit for the dense matrix should control this as well
+				if(sparsity >= config.cluster_mcl_sparsity_switch && expansion - (int) expansion == 0){ 
 					n_sparse_calculations++;
 					Eigen::SparseMatrix<float> m_sparse(order.size(), order.size());
 					m_sparse.setFromTriplets(m.begin(), m.end());
-					list_of_sets = markov_process(&m_sparse);
+					list_of_sets = markov_process(&m_sparse, inflation, expansion);
 				}
 				else{
 					n_dense_calculations++;
-					Eigen::MatrixXf m_dense(order.size(), order.size());
+					Eigen::MatrixXf m_dense = Eigen::MatrixXf::Zero(order.size(), order.size());
 					for(Eigen::Triplet<float> const & t : m){
 						m_dense(t.row(), t.col()) = t.value();
 					}
-					list_of_sets = markov_process(&m_dense);
+					list_of_sets = markov_process(&m_dense, inflation, expansion);
 				}
 				for(unordered_set<uint32_t> subset : list_of_sets){
 					for(uint32_t el : subset){
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
index 511433d6..60f06c2e 100644
--- a/src/cluster/mcl.h
+++ b/src/cluster/mcl.h
@@ -50,8 +50,8 @@ class MCL: public ClusteringAlgorithm {
 	void get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r);
 	vector<unordered_set<uint32_t>> get_list(Eigen::SparseMatrix<float>* m);
 	vector<unordered_set<uint32_t>> get_list(Eigen::MatrixXf* m);
-	vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m);
-	vector<unordered_set<uint32_t>> markov_process(Eigen::MatrixXf* m);
+	vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion);
+	vector<unordered_set<uint32_t>> markov_process(Eigen::MatrixXf* m, float inflation, float expansion);
 public:
 	void run();
 	string get_key();

From 00756f13f6897c9b910b95b0336ebfe4fa19c213 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Thu, 7 May 2020 12:49:30 +0200
Subject: [PATCH 11/21] switching to binary format

---
 src/cluster/mcl.cpp          | 35 +++++++++++++++++++++++++++++------
 src/cluster/mcl.h            | 24 ++++++++++++++----------
 src/output/output_format.cpp | 19 ++++++++++++++++++-
 src/output/output_format.h   | 15 +++++++++++++++
 4 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index 4a567f9e..0f334ad1 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -28,6 +28,7 @@ string MCL::get_description() {
 }
 
 void MCL::get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r){
+	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
 	if( r - (int) r == 0 ){
 		// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
 		*out = *in;
@@ -39,29 +40,38 @@ void MCL::get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* ou
 		throw runtime_error(" Eigen does not provide an eigenvalue solver for sparse matrices");
 	}
 	*out = out->pruned();
+	sparse_exp_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 }
 
 void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
 	// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
-	if( r - (int) r == 0){
+	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
+	if(r - (int) r == 0){
 		*out = *in;
 		for(uint32_t i=1; i<r; i++){ 
-			*out *= *in;
+			(*out) *= (*in);
 		}
+		dense_int_exp_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 	}
 	else{ 
 		// TODO: check whether the matrix is self-adjoint and use SelfAdjointEigenSolver instead
+		// TODO: try and get out->noalias() = in->pow(r); to work (unsupported! http://eigen.tuxfamily.org/dox/unsupported/group__MatrixFunctions__Module.html).
 		Eigen::EigenSolver<Eigen::MatrixXf> solver(*in);
 		Eigen::MatrixXcf d = solver.eigenvalues().asDiagonal();
 		for(uint32_t idiag = 0; idiag<d.rows(); idiag++){
 			d(idiag, idiag) = pow(d(idiag, idiag), r);
 		}
 		Eigen::MatrixXcf V = solver.eigenvectors();
-		out->noalias() = (V * d.real() * V.inverse()).real();
+		double thr = 0.5 * abs(V.determinant());
+		if( thr > std::numeric_limits<float>::epsilon() ){
+			out->noalias() = (V * d.real() * V.inverse()).real();
+		}
+		dense_gen_exp_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 	}
 }
 
 void MCL::get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r){
+	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
 	vector<Eigen::Triplet<float>> data;
 	for (uint32_t k=0; k<in->outerSize(); ++k){
 		float colSum = 0.0f;
@@ -74,9 +84,11 @@ void MCL::get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>*
 	}
 	out->setFromTriplets(data.begin(), data.end(), [] (const float&, const float &b) { return b; });
 	*out = out->pruned(1.0, std::numeric_limits<float>::epsilon());
+	sparse_gamma_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 }
 
 void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
+	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
 	// Note that Eigen matrices are column-major, so this is the most efficient way
 	for (uint32_t icol=0; icol<in->cols(); ++icol){
 		float colSum = 0.0f;
@@ -87,19 +99,23 @@ void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
 			out->coeffRef(irow, icol) =  pow(in->coeffRef(irow, icol) , r) / colSum;
 		}
 	}
+	dense_gamma_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 }
 
 vector<unordered_set<uint32_t>> MCL::get_list(Eigen::SparseMatrix<float>* m){
+	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
 	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
 	for (uint32_t k=0; k<m->outerSize(); ++k){
 		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
 			disjointSet.merge(it.row(), it.col());
 		}
 	}
+	sparse_list_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 	return disjointSet.getListOfSets();
 }
 
 vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
+	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
 	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
 	for (uint32_t icol=0; icol<m->cols(); ++icol){
 		for (uint32_t irow=0; irow<m->rows(); ++irow){
@@ -108,6 +124,7 @@ vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
 			}
 		}
 	}
+	dense_list_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 	return disjointSet.getListOfSets();
 }
 
@@ -173,9 +190,8 @@ vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::MatrixXf* m, float in
 	Eigen::MatrixXf msquared(m->rows(), m->cols());
 	Eigen::MatrixXf m_update(m->rows(), m->cols());
 	get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
-	while( iteration < 100 && diff_norm > 1e-6*m->rows() ){
+	while( iteration < 100 && diff_norm > std::numeric_limits<float>::epsilon()*m->rows() ){
 		get_exp(m, &msquared, expansion);
-		if(isnan(msquared.norm())) break; // converged if expansion is not integral
 		get_gamma(&msquared, &m_update, inflation);
 		*m -= m_update;
 		diff_norm = m->norm();
@@ -305,7 +321,7 @@ void MCL::run(){
 	statistics.reset();
 	config.command = Config::blastp;
 	config.no_self_hits = false;
-	config.output_format = { "6", "qnum", "snum", "qcovhsp", "scovhsp", "bitscore", "pident" };
+	config.output_format = { "clus" };
 	config.query_cover = 80;
 	config.subject_cover = 80;
 	config.algo = 0;
@@ -368,10 +384,12 @@ void MCL::run(){
 				vector<unordered_set<uint32_t>> list_of_sets;
 
 				//TODO: a size limit for the dense matrix should control this as well
+				std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
 				if(sparsity >= config.cluster_mcl_sparsity_switch && expansion - (int) expansion == 0){ 
 					n_sparse_calculations++;
 					Eigen::SparseMatrix<float> m_sparse(order.size(), order.size());
 					m_sparse.setFromTriplets(m.begin(), m.end());
+					sparse_create_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 					list_of_sets = markov_process(&m_sparse, inflation, expansion);
 				}
 				else{
@@ -380,6 +398,7 @@ void MCL::run(){
 					for(Eigen::Triplet<float> const & t : m){
 						m_dense(t.row(), t.col()) = t.value();
 					}
+					dense_create_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
 					list_of_sets = markov_process(&m_dense, inflation, expansion);
 				}
 				for(unordered_set<uint32_t> subset : list_of_sets){
@@ -399,6 +418,10 @@ void MCL::run(){
 	timer.finish();
 	cout << "Found "<<cluster_id - nClustersEq1<< " ("<< cluster_id <<" incl. singletons) clusters with min sparsity "<<min_sparsity<< " and max. sparsity " << max_sparsity << " with " << n_dense_calculations << " dense, and " << n_sparse_calculations << " sparse calculations " << endl;
 
+	cout << "Time used for matrix creation: " << sparse_create_time + dense_create_time <<" (sparse: " << sparse_create_time << ", dense: " << dense_create_time <<")" << endl;
+	cout << "Time used for exp: " << sparse_exp_time + dense_int_exp_time + dense_gen_exp_time << " (sparse: " << sparse_exp_time << ", dense int: " << dense_int_exp_time << ", dense gen: " << dense_gen_exp_time <<")" << endl; 
+	cout << "Time used for gamma: " << sparse_gamma_time + dense_gamma_time <<" (sparse: " << sparse_gamma_time << ", dense: " << dense_gamma_time <<")" << endl;
+	cout << "Time used for listing: " << sparse_list_time + dense_list_time <<" (sparse: " << sparse_list_time << ", dense: " << dense_list_time <<")" << endl;
 	timer.go("Cluster output");
 	ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str());
 	vector<Letter> seq;
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
index 60f06c2e..6bb40b3d 100644
--- a/src/cluster/mcl.h
+++ b/src/cluster/mcl.h
@@ -52,6 +52,10 @@ class MCL: public ClusteringAlgorithm {
 	vector<unordered_set<uint32_t>> get_list(Eigen::MatrixXf* m);
 	vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion);
 	vector<unordered_set<uint32_t>> markov_process(Eigen::MatrixXf* m, float inflation, float expansion);
+	double sparse_create_time, dense_create_time = 0.0;
+	double sparse_exp_time, dense_int_exp_time, dense_gen_exp_time = 0.0;
+	double sparse_gamma_time, dense_gamma_time = 0.0;
+	double sparse_list_time, dense_list_time = 0.0;
 public:
 	void run();
 	string get_key();
@@ -62,36 +66,36 @@ template <typename T>
 class SparseMatrixStream : public Consumer {
 	size_t n;
 	vector<Eigen::Triplet<T>> data;
-	LazyDisjointSet<size_t>* disjointSet;
+	LazyDisjointSet<uint32_t>* disjointSet;
 	virtual void consume(const char *ptr, size_t n) override {
-		size_t query, subject, count;
-		float qcov, scov, bitscore, id;
 		const char *end = ptr + n;
+		uint32_t query = *(uint32_t*) ptr;
+		ptr += sizeof(uint32_t);
 		while (ptr < end) {
-			if (sscanf(ptr, "%lu\t%lu\t%f\t%f\t%f\t%f\n%ln", &query, &subject, &qcov, &scov, &bitscore, &id, &count) != 6) 
-				throw runtime_error("Cluster format error.");
-			const float value = (qcov/100.0f) * (scov/100.0f) * (id/100.0f);
+			const uint32_t subject = *(uint32_t*) ptr;
+			ptr += sizeof(uint32_t);
+			const double value = *(double*) ptr;
+			ptr += sizeof(double);
 			data.emplace_back(query, subject, value);
 			disjointSet->merge(query, subject);
-			ptr += count;
 		}
 	}
 public:
 	SparseMatrixStream(size_t n){
 		this->n = n;
-		disjointSet = new LazyDisjointIntegralSet<size_t>(n); 
+		disjointSet = new LazyDisjointIntegralSet<uint32_t>(n); 
 	}
 
 	~SparseMatrixStream(){
 		delete disjointSet;
 	}
 	pair<vector<vector<uint32_t>>, vector<vector<Eigen::Triplet<T>>>> getComponents(){
-		vector<unordered_set<size_t>> sets = disjointSet->getListOfSets();
+		vector<unordered_set<uint32_t>> sets = disjointSet->getListOfSets();
 		vector<vector<Eigen::Triplet<T>>> split(sets.size());
 		vector<uint32_t> indexToSetId(this->n, sets.size());
 		for(uint32_t iset = 0; iset < sets.size(); iset++){
 			split.push_back(vector<Eigen::Triplet<T>>());
-			for(size_t index : sets[iset]){
+			for(uint32_t index : sets[iset]){
 				indexToSetId[index] = iset;
 			}
 		}
diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp
index 7b8f8602..f34f4be2 100644
--- a/src/output/output_format.cpp
+++ b/src/output/output_format.cpp
@@ -91,6 +91,8 @@ Output_format* get_output_format()
 		return new PAF_format;
 	else if (f[0] == "bin1")
 		return new Bin1_format;
+	else if (f[0] == "clus")
+		return new Clustering_format;
 	else
 		throw std::runtime_error("Invalid output format. Allowed values: 0,5,6,100,101,102");
 }
@@ -131,4 +133,19 @@ void Bin1_format::print_match(const Hsp_context& r, const Metadata &metadata, Te
 		out.write((uint32_t)r.subject_id);
 		out.write(r.bit_score() / std::max((unsigned)r.query.source().length(), r.subject_len));
 	}
-}
\ No newline at end of file
+}
+
+void Clustering_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const {
+	out.write((uint32_t)query_num);
+}
+
+void Clustering_format::print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) {
+	if (r.query_id <= r.subject_id) {
+		out.write((uint32_t)r.subject_id);
+		// TODO: enable other clustering measures
+		double m = (double) r.query_source_range().length()*1.0 / r.query.source().length();
+		m *= (double) r.subject_range().length() * 1.0 / r.subject_len;
+		m *= (double)r.identities() * 1.0 / r.length();
+		out.write(m);
+	}
+}
diff --git a/src/output/output_format.h b/src/output/output_format.h
index 126a0997..a9291bb7 100644
--- a/src/output/output_format.h
+++ b/src/output/output_format.h
@@ -210,6 +210,21 @@ struct Bin1_format : public Output_format
 	}
 };
 
+struct Clustering_format : public Output_format
+{
+	Clustering_format():
+		Output_format(bin1)
+	{}
+	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const override;
+	virtual void print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) override;
+	virtual ~Clustering_format()
+	{ }
+	virtual Output_format* clone() const override
+	{
+		return new Clustering_format(*this);
+	}
+};
+
 Output_format* get_output_format();
 void init_output(bool have_taxon_id_lists, bool have_taxon_nodes, bool have_taxon_scientific_names);
 void print_hsp(Hsp &hsp, const TranslatedSequence &query);

From 027aeeebcb8594d9eb3b644cb03ab5333deeba29 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Tue, 19 May 2020 14:07:15 +0200
Subject: [PATCH 12/21] CR comment: enabling other similarity measures

---
 CMakeLists.txt                   |  2 ++
 src/basic/config.cpp             |  3 ++-
 src/basic/config.h               |  3 ++-
 src/cluster/cluster.h            |  2 +-
 src/cluster/cluster_registry.cpp |  2 +-
 src/cluster/cluster_registry.h   |  2 +-
 src/cluster/disjoint_set.h       | 18 ++++++++++++++++++
 src/cluster/mcl.cpp              | 16 +++++++++++-----
 src/cluster/mcl.h                |  2 +-
 src/output/output_format.cpp     | 16 +---------------
 src/output/output_format.h       |  6 +++---
 11 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a7887ee0..d91db285 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -122,6 +122,8 @@ set(OBJECTS
   src/run/tools.cpp
   src/chaining/greedy_align.cpp
   src/output/output_format.cpp
+  src/output/clustering_variables.cpp
+  src/output/clustering_format.cpp
   src/output/join_blocks.cpp
   src/data/frequent_seeds.cpp
   src/align/legacy/query_mapper.cpp
diff --git a/src/basic/config.cpp b/src/basic/config.cpp
index e350f9d2..f693ef2f 100644
--- a/src/basic/config.cpp
+++ b/src/basic/config.cpp
@@ -148,7 +148,8 @@ Config::Config(int argc, const char **argv, bool check_io)
 	Options_group cluster("Cluster options");
 	cluster.add()
 		("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo)
-		("mcl-expansion", 0, "MCL expansion coefficient (default=2.0)", cluster_mcl_expansion, 2.0)
+		("cluster-similarity", 0, "Clustering similarity measure", cluster_similarity)
+		("mcl-expansion", 0, "MCL expansion coefficient (default=2)", cluster_mcl_expansion, (uint32_t) 2)
 		("mcl-inflation", 0, "MCL inflation coefficient (default=2.0)", cluster_mcl_inflation, 2.0)
 		("mcl-sparsity-switch", 0, "MCL switch to sparse matrix computation (default=0.8) ", cluster_mcl_sparsity_switch, 0.8);
 
diff --git a/src/basic/config.h b/src/basic/config.h
index 647bcecf..c086252b 100644
--- a/src/basic/config.h
+++ b/src/basic/config.h
@@ -224,8 +224,9 @@ struct Config
 
 	string cluster_algo;
 	double cluster_mcl_inflation;
-	double cluster_mcl_expansion;
+	uint32_t cluster_mcl_expansion;
 	double cluster_mcl_sparsity_switch;
+	string	cluster_similarity;
 
 	enum { query_parallel = 0, target_parallel = 1 };
 	unsigned load_balancing;
diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h
index 864cd79b..62323f94 100644
--- a/src/cluster/cluster.h
+++ b/src/cluster/cluster.h
@@ -1,6 +1,6 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/cluster/cluster_registry.cpp b/src/cluster/cluster_registry.cpp
index b3345f5c..4c59d2bd 100644
--- a/src/cluster/cluster_registry.cpp
+++ b/src/cluster/cluster_registry.cpp
@@ -1,6 +1,6 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h
index d6c872da..a2aaa2f3 100644
--- a/src/cluster/cluster_registry.h
+++ b/src/cluster/cluster_registry.h
@@ -1,6 +1,6 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/cluster/disjoint_set.h b/src/cluster/disjoint_set.h
index 4b7acc7b..df5e5f96 100644
--- a/src/cluster/disjoint_set.h
+++ b/src/cluster/disjoint_set.h
@@ -1,3 +1,21 @@
+/****
+DIAMOND protein aligner - Markov Clustering Module
+Copyright (C) 2019 Patrick Ettenhuber <pettenhuber@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
 #include <algorithm>
 #include <stdio.h>
 #include <fstream>
diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index 0f334ad1..225a8368 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -1,6 +1,6 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -321,13 +321,14 @@ void MCL::run(){
 	statistics.reset();
 	config.command = Config::blastp;
 	config.no_self_hits = false;
-	config.output_format = { "clus" };
-	config.query_cover = 80;
-	config.subject_cover = 80;
+	string format = config.cluster_similarity;
+	if(format.empty()){
+		format = "qcovhsp/100*scovhsp/100*pident/100";
+	}
+	config.output_format = {"clus", format};
 	config.algo = 0;
 	config.index_mode = 0;
 	config.freq_sd = 0;
-	config.max_alignments = numeric_limits<uint64_t>::max();
 
 	Workflow::Search::Options opt;
 	opt.db = &(*db);
@@ -372,6 +373,11 @@ void MCL::run(){
 	}
 	else {
 		// TODO: check when/if using dense matrices for non-sparse components improves performance
+		// vector<std::thread> threads;
+		// for (size_t i = 0; i < config.threads_; ++i)
+		// 	threads.emplace_back(seed_join_worker, query_idx, ref_idx, &seedp, &range, query_seed_hits, ref_seed_hits);
+		// for (auto &t : threads)
+		// 	t.join();
 		for(uint32_t iComponent = 0; iComponent<indices.size(); iComponent++){
 			vector<uint32_t> order = indices[iComponent];
 			if(order.size() > 1){
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
index 6bb40b3d..8901b544 100644
--- a/src/cluster/mcl.h
+++ b/src/cluster/mcl.h
@@ -1,6 +1,6 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
+Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/output/output_format.cpp b/src/output/output_format.cpp
index f34f4be2..8da70715 100644
--- a/src/output/output_format.cpp
+++ b/src/output/output_format.cpp
@@ -92,7 +92,7 @@ Output_format* get_output_format()
 	else if (f[0] == "bin1")
 		return new Bin1_format;
 	else if (f[0] == "clus")
-		return new Clustering_format;
+		return new Clustering_format(&f[1]);
 	else
 		throw std::runtime_error("Invalid output format. Allowed values: 0,5,6,100,101,102");
 }
@@ -135,17 +135,3 @@ void Bin1_format::print_match(const Hsp_context& r, const Metadata &metadata, Te
 	}
 }
 
-void Clustering_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const {
-	out.write((uint32_t)query_num);
-}
-
-void Clustering_format::print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) {
-	if (r.query_id <= r.subject_id) {
-		out.write((uint32_t)r.subject_id);
-		// TODO: enable other clustering measures
-		double m = (double) r.query_source_range().length()*1.0 / r.query.source().length();
-		m *= (double) r.subject_range().length() * 1.0 / r.subject_len;
-		m *= (double)r.identities() * 1.0 / r.length();
-		out.write(m);
-	}
-}
diff --git a/src/output/output_format.h b/src/output/output_format.h
index a9291bb7..52d60013 100644
--- a/src/output/output_format.h
+++ b/src/output/output_format.h
@@ -212,9 +212,9 @@ struct Bin1_format : public Output_format
 
 struct Clustering_format : public Output_format
 {
-	Clustering_format():
-		Output_format(bin1)
-	{}
+	const string* const format;
+	Clustering_format(const string* const format): Output_format(bin1), format(format) {
+	}
 	virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const override;
 	virtual void print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) override;
 	virtual ~Clustering_format()

From a848031a6f3cd63c91f9f709015e234c444309c9 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Tue, 19 May 2020 19:18:42 +0200
Subject: [PATCH 13/21] adding forgotten files

---
 src/cluster/mcl.cpp                 |   6 +-
 src/output/clustering_format.cpp    | 163 +++++++++++++++
 src/output/clustering_variables.cpp |  44 ++++
 src/output/clustering_variables.h   | 309 ++++++++++++++++++++++++++++
 4 files changed, 521 insertions(+), 1 deletion(-)
 create mode 100644 src/output/clustering_format.cpp
 create mode 100644 src/output/clustering_variables.cpp
 create mode 100644 src/output/clustering_variables.h

diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index 225a8368..cfe346d6 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -343,6 +343,10 @@ void MCL::run(){
 	auto componentInfo = ms.getComponents();
 	vector<vector<uint32_t>> indices = get<0>(componentInfo);
 	vector<vector<Eigen::Triplet<float>>> components = get<1>(componentInfo);
+	std::vector<uint32_t> sort_order(components.size());
+	std::iota(sort_order.begin(), sort_order.end(), 0);
+	std::sort(sort_order.begin(), sort_order.end(), [&](uint32_t i, uint32_t j){return indices[i].size() > indices[j].size();});
+
 	uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 0;});
 	uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 1;});
 	timer.finish();
@@ -378,7 +382,7 @@ void MCL::run(){
 		// 	threads.emplace_back(seed_join_worker, query_idx, ref_idx, &seedp, &range, query_seed_hits, ref_seed_hits);
 		// for (auto &t : threads)
 		// 	t.join();
-		for(uint32_t iComponent = 0; iComponent<indices.size(); iComponent++){
+		for(uint32_t iComponent : sort_order){
 			vector<uint32_t> order = indices[iComponent];
 			if(order.size() > 1){
 				vector<Eigen::Triplet<float>> m = components[iComponent];
diff --git a/src/output/clustering_format.cpp b/src/output/clustering_format.cpp
new file mode 100644
index 00000000..826c16c2
--- /dev/null
+++ b/src/output/clustering_format.cpp
@@ -0,0 +1,163 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "output_format.h"
+#include "clustering_variables.h"
+
+
+class RecursiveParser{
+	const Hsp_context& r; 
+	const char * expression_to_parse;
+	char peek()
+	{
+		return *expression_to_parse;
+	}
+	char peek(uint32_t ahead)
+	{
+		return *(expression_to_parse+ahead);
+	}
+	char get()
+	{
+		return *expression_to_parse++;
+	}
+	void advance(uint32_t ahead)
+	{
+		expression_to_parse+=ahead;
+	}
+	std::string variable()
+	{
+		vector<char> v;
+		while ( (peek() >= 'a' && peek() <= 'z') ||  (peek() >= 'A' && peek() <= 'Z') || peek() == '_')
+		{
+			v.push_back(get());
+		}
+		return std::string(v.begin(), v.end());
+	}
+	uint32_t integer()
+	{
+		uint32_t result = get() - '0';
+		while (peek() >= '0' && peek() <= '9')
+		{
+			result = 10*result + get() - '0';
+		}
+		return result;
+	}
+	double number()
+	{
+		double result = integer();
+		if(peek() == '.'){
+			advance(1); // '.'
+			const char *pos = expression_to_parse;
+			double decimals = integer() * 1.0 / (expression_to_parse - pos);
+			result += decimals;
+		}
+		return result;
+	}
+
+	double factor()
+	{
+		if (peek() >= '0' && peek() <= '9') {
+			return number();
+		}
+		else if (peek() == '(') {
+			advance(1); // '('
+			double result = expression();
+			advance(1); // ')'
+			return result;
+		}
+		else if (peek() == '-') {
+			advance(1);
+			return -factor();
+		}
+		else if (peek() == 'm' && peek(1)=='a' && peek(2)=='x' && peek(3)=='('){
+			advance(4); // 'max('
+			double result1 = expression();
+			advance(1); // ','
+			double result2 = expression();
+			advance(1); // ')'
+			return std::max(result1, result2);
+		}
+		else if (peek() == 'm' && peek(1)=='i' && peek(2)=='n' && peek(3)=='('){
+			advance(4); // 'min('
+			double result1 = expression();
+			advance(1); // ','
+			double result2 = expression();
+			advance(1); // ')'
+			return std::min(result1, result2);
+		}
+		else if (peek() == 'e' && peek(1)=='x' && peek(2)=='p' && peek(3)=='('){
+			advance(4); // 'exp('
+			double result1 = expression();
+			advance(1); // ')'
+			return std::exp(result1);
+		}
+		else {
+			return VariableRegistry::get(variable())->get(r);
+		}
+	}
+
+	double term()
+	{
+		double result = factor();
+		while (peek() == '*' || peek() == '/'){
+			if (get() == '*'){
+				result *= factor();
+			}
+			else{
+				result /= factor();
+			}
+		}
+		return result;
+	}
+
+	double expression() 
+	{
+		double result = term();
+		while (peek() == '+' || peek() == '-'){
+			if (get() == '+'){
+				result += term();
+			}
+			else{
+				result -= term();
+			}
+		}
+		return result;
+	}
+
+public:
+	static const vector<std::string> variables;
+	RecursiveParser(const Hsp_context& r, const char * c): r(r), expression_to_parse(c){}
+	double evaluate(){
+		return expression();
+	}
+};
+
+
+void Clustering_format::print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const 
+{
+	out.write((uint32_t)query_num);
+}
+
+void Clustering_format::print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) 
+{
+	if (r.query_id <= r.subject_id) {
+		out.write((uint32_t)r.subject_id);
+		RecursiveParser rp(r, format->c_str());
+		out.write(rp.evaluate());
+	}
+}
diff --git a/src/output/clustering_variables.cpp b/src/output/clustering_variables.cpp
new file mode 100644
index 00000000..6011d905
--- /dev/null
+++ b/src/output/clustering_variables.cpp
@@ -0,0 +1,44 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include "clustering_variables.h"
+
+std::map<std::string, Variable*> VariableRegistry::regMap;
+QueryStart VariableRegistry::queryStart;
+QueryEnd VariableRegistry::queryEnd;
+SubjectStart VariableRegistry::subjectStart;
+SubjectEnd VariableRegistry::subjectEnd;
+EValue VariableRegistry::eValue;
+BitScore VariableRegistry::bitScore;
+Score VariableRegistry::score;
+Length VariableRegistry::length;
+PercentIdenticalMatches VariableRegistry::percentIdenticalMatches;
+NumberIdenticalMatches VariableRegistry::numberIdenticalMatches;
+NumberMismatches VariableRegistry::numberMismatches;
+NumberPositiveMatches VariableRegistry::numberPositiveMatches;
+NumberGapOpenings VariableRegistry::numberGapOpenings;
+NumberGaps VariableRegistry::numberGaps;
+PercentagePositiveMatches VariableRegistry::percentagePositiveMatches;
+QueryFrame VariableRegistry::queryFrame;
+QueryCoveragePerHsp VariableRegistry::queryCoveragePerHsp;
+SwDiff VariableRegistry::swdiff;
+Time VariableRegistry::time;
+SubjectCoveragePerHsp VariableRegistry::subjectCoveragePerHsp;
+UngappedScore VariableRegistry::ungappedScore;
+VariableRegistry::StaticConstructor VariableRegistry::_staticConstructor;
+
diff --git a/src/output/clustering_variables.h b/src/output/clustering_variables.h
new file mode 100644
index 00000000..f29f06bc
--- /dev/null
+++ b/src/output/clustering_variables.h
@@ -0,0 +1,309 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+****/
+
+#include <map>
+#include <string>
+#include "../basic/score_matrix.h"
+#include "../basic/match.h"
+
+class Variable{
+public:
+	virtual std::string get_name() = 0;
+	virtual double get(const Hsp_context& r) = 0;
+};
+
+class QueryLength: public Variable{
+public:
+	std::string get_name(){
+		return "qlen";
+	}
+	double get(const Hsp_context& r){
+		return r.query.source().length();
+	}
+};
+class SubjectLength: public Variable{
+public:
+	std::string get_name(){
+		return "slen";
+	}
+	double get(const Hsp_context& r){
+		return r.subject_len;
+	}
+};
+class QueryStart: public Variable{
+public:
+	std::string get_name(){
+		return "qstart";
+	}
+	double get(const Hsp_context& r){
+		return r.oriented_query_range().begin_ + 1;
+	}
+};
+class QueryEnd: public Variable{
+public:
+	std::string get_name(){
+		return "qend";
+	}
+	double get(const Hsp_context& r){
+		return r.oriented_query_range().end_ + 1;
+	}
+};
+class SubjectStart: public Variable{
+public:
+	std::string get_name(){
+		return "sstart";
+	}
+	double get(const Hsp_context& r){
+		return r.subject_range().begin_ + 1;
+	}
+};
+class SubjectEnd: public Variable{
+public:
+	std::string get_name(){
+		return "send";
+	}
+	double get(const Hsp_context& r){
+		return r.subject_range().end_;
+	}
+};
+class EValue: public Variable{
+public:
+	std::string get_name(){
+		return "evalue";
+	}
+	double get(const Hsp_context& r){
+		return r.evalue();
+	}
+};
+class BitScore: public Variable{
+public:
+	std::string get_name(){
+		return "bitscore";
+	}
+	double get(const Hsp_context& r){
+		return r.bit_score();
+	}
+};
+class Score: public Variable{
+public:
+	std::string get_name(){
+		return "score";
+	}
+	double get(const Hsp_context& r){
+		return r.score();
+	}
+};
+class Length: public Variable{
+public:
+	std::string get_name(){
+		return "length";
+	}
+	double get(const Hsp_context& r){
+		return r.length();
+	}
+};
+class PercentIdenticalMatches: public Variable{
+public:
+	std::string get_name(){
+		return "pident";
+	}
+	double get(const Hsp_context& r){
+		return (double)r.identities() * 100 / r.length();
+	}
+};
+class NumberIdenticalMatches: public Variable{
+public:
+	std::string get_name(){
+		return "nident";
+	}
+	double get(const Hsp_context& r){
+		return r.identities();
+	}
+};
+class NumberMismatches: public Variable{
+public:
+	std::string get_name(){
+		return "mismatch";
+	}
+	double get(const Hsp_context& r){
+		return r.mismatches();
+	}
+};
+class NumberPositiveMatches: public Variable{
+public:
+	std::string get_name(){
+		return "positive";
+	}
+	double get(const Hsp_context& r){
+		return r.positives();
+	}
+};
+class NumberGapOpenings: public Variable{
+public:
+	std::string get_name(){
+		return "gapopen";
+	}
+	double get(const Hsp_context& r){
+		return r.gap_openings();
+	}
+};
+class NumberGaps: public Variable{
+public:
+	std::string get_name(){
+		return "gaps";
+	}
+	double get(const Hsp_context& r){
+		return r.gaps();
+	}
+};
+class PercentagePositiveMatches: public Variable{
+public:
+	std::string get_name(){
+		return "ppos";
+	}
+	double get(const Hsp_context& r){
+		return (double)r.positives() * 100.0 / r.length();
+	}
+};
+class QueryFrame: public Variable{
+public:
+	std::string get_name(){
+		return "qframe";
+	}
+	double get(const Hsp_context& r){
+		return r.blast_query_frame();
+	}
+};
+class QueryCoveragePerHsp: public Variable{
+public:
+	std::string get_name(){
+		return "qcovhsp";
+	}
+	double get(const Hsp_context& r){
+		return (double)r.query_source_range().length()*100.0 / r.query.source().length();
+	}
+};
+class SwDiff: public Variable{
+public:
+	std::string get_name(){
+		return "swdiff";
+	}
+	double get(const Hsp_context& r){
+		return r.sw_score() - r.bit_score();
+	}
+};
+class Time: public Variable{
+public:
+	std::string get_name(){
+		return "time";
+	}
+	double get(const Hsp_context& r){
+		return r.time();
+	}
+};
+class SubjectCoveragePerHsp: public Variable{
+public:
+	std::string get_name(){
+		return "scovhsp";
+	}
+	double get(const Hsp_context& r){
+		return (double)r.subject_range().length() * 100.0 / r.subject_len;
+	}
+};
+class UngappedScore: public Variable{
+public:
+	std::string get_name(){
+		return "ungapped_score";
+	}
+	double get(const Hsp_context& r){
+		return score_matrix.bitscore(r.ungapped_score);
+	}
+};
+
+class VariableRegistry{
+private:
+	VariableRegistry(){};
+	// To include new clustering algorithms add the instantiation here and in the cluster_registry.cpp file. Then add it to the StaticConstructor below
+	static QueryStart queryStart;
+	static QueryEnd queryEnd;
+	static SubjectStart subjectStart;
+	static SubjectEnd subjectEnd;
+	static EValue eValue;
+	static BitScore bitScore;
+	static Score score;
+	static Length length;
+	static PercentIdenticalMatches percentIdenticalMatches;
+	static NumberIdenticalMatches numberIdenticalMatches;
+	static NumberMismatches numberMismatches;
+	static NumberPositiveMatches numberPositiveMatches;
+	static NumberGapOpenings numberGapOpenings;
+	static NumberGaps numberGaps;
+	static PercentagePositiveMatches percentagePositiveMatches;
+	static QueryFrame queryFrame;
+	static QueryCoveragePerHsp queryCoveragePerHsp;
+	static SwDiff swdiff;
+	static Time time;
+	static SubjectCoveragePerHsp subjectCoveragePerHsp;
+	static UngappedScore ungappedScore;
+public:
+	static std::map<std::string, Variable*> regMap;
+	static Variable* get(std::string key){
+		std::map<std::string, Variable*>::iterator ca = VariableRegistry::regMap.find(key);
+		if(ca == VariableRegistry::regMap.end()){
+			throw std::runtime_error(std::string("Variable not found:")+ key);
+		}
+		return ca->second;
+	}
+	static bool has(std::string key){
+		return VariableRegistry::regMap.find(key) != VariableRegistry::regMap.end();
+	}
+	static vector<std::string> getKeys(){
+		auto it = regMap.begin();
+		vector<std::string> keys;
+		while(it != regMap.end()){
+			keys.push_back(it->first);
+			it++;
+		}
+		return keys;
+	}
+	static struct StaticConstructor {
+		StaticConstructor() {
+			regMap.emplace(queryStart.get_name(), &queryStart);
+			regMap.emplace(queryEnd.get_name(), &queryEnd);
+			regMap.emplace(subjectStart.get_name(), &subjectStart);
+			regMap.emplace(subjectEnd.get_name(), &subjectEnd);
+			regMap.emplace(eValue.get_name(), &eValue);
+			regMap.emplace(bitScore.get_name(), &bitScore);
+			regMap.emplace(score.get_name(), &score);
+			regMap.emplace(length.get_name(), &length);
+			regMap.emplace(percentIdenticalMatches.get_name(), &percentIdenticalMatches);
+			regMap.emplace(numberIdenticalMatches.get_name(), &numberIdenticalMatches);
+			regMap.emplace(numberMismatches.get_name(), &numberMismatches);
+			regMap.emplace(numberPositiveMatches.get_name(), &numberPositiveMatches);
+			regMap.emplace(numberGapOpenings.get_name(), &numberGapOpenings);
+			regMap.emplace(numberGaps.get_name(), &numberGaps);
+			regMap.emplace(percentagePositiveMatches.get_name(), &percentagePositiveMatches);
+			regMap.emplace(queryFrame.get_name(), &queryFrame);
+			regMap.emplace(queryCoveragePerHsp.get_name(), &queryCoveragePerHsp);
+			regMap.emplace(swdiff.get_name(), &swdiff);
+			regMap.emplace(time.get_name(), &time);
+			regMap.emplace(subjectCoveragePerHsp.get_name(), &subjectCoveragePerHsp);
+			regMap.emplace(ungappedScore.get_name(), &ungappedScore);
+		}
+	} _staticConstructor;
+};

From 5f2ab15b995cf76e6adba5eee0b4f40d7aa4b72c Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Fri, 22 May 2020 07:46:46 +0200
Subject: [PATCH 14/21] update copyright

---
 src/cluster/cluster.h               | 3 ++-
 src/cluster/cluster_registry.cpp    | 3 ++-
 src/cluster/cluster_registry.h      | 3 ++-
 src/cluster/disjoint_set.h          | 3 ++-
 src/cluster/mcl.cpp                 | 3 ++-
 src/cluster/mcl.h                   | 3 ++-
 src/output/clustering_format.cpp    | 3 ++-
 src/output/clustering_variables.cpp | 3 ++-
 src/output/clustering_variables.h   | 3 ++-
 9 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/cluster/cluster.h b/src/cluster/cluster.h
index 62323f94..48e3021a 100644
--- a/src/cluster/cluster.h
+++ b/src/cluster/cluster.h
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/cluster/cluster_registry.cpp b/src/cluster/cluster_registry.cpp
index 4c59d2bd..ce28ec1d 100644
--- a/src/cluster/cluster_registry.cpp
+++ b/src/cluster/cluster_registry.cpp
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/cluster/cluster_registry.h b/src/cluster/cluster_registry.h
index a2aaa2f3..696116cf 100644
--- a/src/cluster/cluster_registry.h
+++ b/src/cluster/cluster_registry.h
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/cluster/disjoint_set.h b/src/cluster/disjoint_set.h
index df5e5f96..1ac9ae5f 100644
--- a/src/cluster/disjoint_set.h
+++ b/src/cluster/disjoint_set.h
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner - Markov Clustering Module
-Copyright (C) 2019 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index cfe346d6..b47c3721 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
index 8901b544..7b11ebc5 100644
--- a/src/cluster/mcl.h
+++ b/src/cluster/mcl.h
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/output/clustering_format.cpp b/src/output/clustering_format.cpp
index 826c16c2..1f68efab 100644
--- a/src/output/clustering_format.cpp
+++ b/src/output/clustering_format.cpp
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/output/clustering_variables.cpp b/src/output/clustering_variables.cpp
index 6011d905..048aada2 100644
--- a/src/output/clustering_variables.cpp
+++ b/src/output/clustering_variables.cpp
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
diff --git a/src/output/clustering_variables.h b/src/output/clustering_variables.h
index f29f06bc..2410cb09 100644
--- a/src/output/clustering_variables.h
+++ b/src/output/clustering_variables.h
@@ -1,6 +1,7 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2020 Patrick Ettenhuber <pettenhuber@gmail.com>
+Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
+Code developed by Patrick Ettenhuber <patrick.ettenhuber@qiagen.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by

From 9a77bf212ee3aa0d34bef709315d55cd5efb3138 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sun, 24 May 2020 21:33:42 +0200
Subject: [PATCH 15/21] added multithreading

---
 src/cluster/mcl.cpp | 180 ++++++++++++++++++++++++++++----------------
 src/cluster/mcl.h   |  14 +++-
 2 files changed, 125 insertions(+), 69 deletions(-)

diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index b47c3721..0224a2d7 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -21,6 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 using namespace std;
 
 namespace Workflow { namespace Cluster{
+
 string MCL::get_key() {
 	return "mcl";
 }
@@ -29,7 +30,7 @@ string MCL::get_description() {
 }
 
 void MCL::get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r){
-	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
+	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
 	if( r - (int) r == 0 ){
 		// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
 		*out = *in;
@@ -41,18 +42,18 @@ void MCL::get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* ou
 		throw runtime_error(" Eigen does not provide an eigenvalue solver for sparse matrices");
 	}
 	*out = out->pruned();
-	sparse_exp_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+	sparse_exp_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 }
 
 void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
 	// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
-	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
+	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
 	if(r - (int) r == 0){
 		*out = *in;
 		for(uint32_t i=1; i<r; i++){ 
 			(*out) *= (*in);
 		}
-		dense_int_exp_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+		dense_int_exp_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 	}
 	else{ 
 		// TODO: check whether the matrix is self-adjoint and use SelfAdjointEigenSolver instead
@@ -64,15 +65,15 @@ void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
 		}
 		Eigen::MatrixXcf V = solver.eigenvectors();
 		double thr = 0.5 * abs(V.determinant());
-		if( thr > std::numeric_limits<float>::epsilon() ){
+		if( thr > numeric_limits<float>::epsilon() ){
 			out->noalias() = (V * d.real() * V.inverse()).real();
 		}
-		dense_gen_exp_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+		dense_gen_exp_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 	}
 }
 
 void MCL::get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r){
-	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
+	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
 	vector<Eigen::Triplet<float>> data;
 	for (uint32_t k=0; k<in->outerSize(); ++k){
 		float colSum = 0.0f;
@@ -84,12 +85,12 @@ void MCL::get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>*
 		}
 	}
 	out->setFromTriplets(data.begin(), data.end(), [] (const float&, const float &b) { return b; });
-	*out = out->pruned(1.0, std::numeric_limits<float>::epsilon());
-	sparse_gamma_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+	*out = out->pruned(1.0, numeric_limits<float>::epsilon());
+	sparse_gamma_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 }
 
 void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
-	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
+	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
 	// Note that Eigen matrices are column-major, so this is the most efficient way
 	for (uint32_t icol=0; icol<in->cols(); ++icol){
 		float colSum = 0.0f;
@@ -100,32 +101,32 @@ void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
 			out->coeffRef(irow, icol) =  pow(in->coeffRef(irow, icol) , r) / colSum;
 		}
 	}
-	dense_gamma_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+	dense_gamma_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 }
 
 vector<unordered_set<uint32_t>> MCL::get_list(Eigen::SparseMatrix<float>* m){
-	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
+	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
 	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
 	for (uint32_t k=0; k<m->outerSize(); ++k){
 		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
 			disjointSet.merge(it.row(), it.col());
 		}
 	}
-	sparse_list_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+	sparse_list_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 	return disjointSet.getListOfSets();
 }
 
 vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
-	std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
+	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
 	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
 	for (uint32_t icol=0; icol<m->cols(); ++icol){
 		for (uint32_t irow=0; irow<m->rows(); ++irow){
-			if( abs((*m)(irow, icol)) > std::numeric_limits<float>::epsilon()){
+			if( abs((*m)(irow, icol)) > numeric_limits<float>::epsilon()){
 				disjointSet.merge(irow, icol);
 			}
 		}
 	}
-	dense_list_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+	dense_list_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 	return disjointSet.getListOfSets();
 }
 
@@ -164,10 +165,10 @@ vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
 
 vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion){
 	for (uint32_t idiag=0; idiag<m->cols(); ++idiag){
-		assert(abs(m->coeffRef(idiag, idiag)) > std::numeric_limits<float>::epsilon());
+		assert(abs(m->coeffRef(idiag, idiag)) > numeric_limits<float>::epsilon());
 	}
 	uint32_t iteration = 0;
-	float diff_norm = std::numeric_limits<float>::max();
+	float diff_norm = numeric_limits<float>::max();
 	Eigen::SparseMatrix<float> msquared(m->rows(), m->cols());
 	Eigen::SparseMatrix<float> m_update(m->rows(), m->cols());
 	get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
@@ -184,14 +185,14 @@ vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>*
 
 vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::MatrixXf* m, float inflation, float expansion){
 	for (uint32_t idiag=0; idiag<m->cols(); ++idiag){
-		assert(abs(m->coeffRef(idiag, idiag)) > std::numeric_limits<float>::epsilon());
+		assert(abs(m->coeffRef(idiag, idiag)) > numeric_limits<float>::epsilon());
 	}
 	uint32_t iteration = 0;
-	float diff_norm = std::numeric_limits<float>::max();
+	float diff_norm = numeric_limits<float>::max();
 	Eigen::MatrixXf msquared(m->rows(), m->cols());
 	Eigen::MatrixXf m_update(m->rows(), m->cols());
 	get_gamma(m, m, 1); // This is to get a matrix of random walks on the graph -> TODO: find out if something else is more suitable
-	while( iteration < 100 && diff_norm > std::numeric_limits<float>::epsilon()*m->rows() ){
+	while( iteration < 100 && diff_norm > numeric_limits<float>::epsilon()*m->rows() ){
 		get_exp(m, &msquared, expansion);
 		get_gamma(&msquared, &m_update, inflation);
 		*m -= m_update;
@@ -344,9 +345,9 @@ void MCL::run(){
 	auto componentInfo = ms.getComponents();
 	vector<vector<uint32_t>> indices = get<0>(componentInfo);
 	vector<vector<Eigen::Triplet<float>>> components = get<1>(componentInfo);
-	std::vector<uint32_t> sort_order(components.size());
-	std::iota(sort_order.begin(), sort_order.end(), 0);
-	std::sort(sort_order.begin(), sort_order.end(), [&](uint32_t i, uint32_t j){return indices[i].size() > indices[j].size();});
+	vector<uint32_t> sort_order(components.size());
+	iota(sort_order.begin(), sort_order.end(), 0);
+	sort(sort_order.begin(), sort_order.end(), [&](uint32_t i, uint32_t j){return indices[i].size() > indices[j].size();});
 
 	uint32_t nComponents = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 0;});
 	uint32_t nComponentsLt1 = count_if(indices.begin(), indices.end(), [](vector<uint32_t> v){ return v.size() > 1;});
@@ -355,52 +356,47 @@ void MCL::run(){
 
 	timer.go("Clustering components");
 	bool full = false;
-	vector<uint32_t> clustering_result(db->ref_header.sequences, std::numeric_limits<uint32_t>::max());
-	float max_sparsity = 0.0f;
-	float min_sparsity = 1.0f;
-	uint32_t cluster_id = 0;
-	uint32_t nClustersEq1 = 0;
-	uint32_t n_dense_calculations = 0;
-	uint32_t n_sparse_calculations = 0;
-	float inflation = (float) config.cluster_mcl_inflation;
-	float expansion = (float) config.cluster_mcl_expansion;
-
-	if( full ){
-		// TODO: According to the SIAM publication this is not valid, just for debugging
-		Eigen::SparseMatrix<float> m = ms.getMatrix();
-		for(unordered_set<uint32_t> subset : markov_process(&m, inflation, expansion)){
-			for(uint32_t el : subset){
-				clustering_result[el] = cluster_id;
-			}
-			cluster_id++;
-			if(subset.size() == 1) nClustersEq1 ++;
-		}
+	// Note, we will access the clustering_result from several threads below and a vector does not guarantee thread-safety in these situations.
+	// Note also, that the use of the disjoint_set structure guarantees that each thread will access a different part of the clustering_result
+	uint32_t* clustering_result = new uint32_t[db->ref_header.sequences];
+	for(uint64_t iseq = 0; iseq < db->ref_header.sequences; iseq++){
+		clustering_result[iseq]  = numeric_limits<uint32_t>::max();
 	}
-	else {
-		// TODO: check when/if using dense matrices for non-sparse components improves performance
-		// vector<std::thread> threads;
-		// for (size_t i = 0; i < config.threads_; ++i)
-		// 	threads.emplace_back(seed_join_worker, query_idx, ref_idx, &seedp, &range, query_seed_hits, ref_seed_hits);
-		// for (auto &t : threads)
-		// 	t.join();
-		for(uint32_t iComponent : sort_order){
+
+	uint32_t nThreads = min(config.threads_, nComponentsLt1);
+	float* max_sparsities = new float[nThreads];
+	float* min_sparsities = new float[nThreads];
+	atomic_uint32_t cluster_id(0);
+	atomic_uint32_t nClustersEq1(0);
+	atomic_uint32_t n_dense_calculations(0);
+	atomic_uint32_t n_sparse_calculations(0);
+	const float inflation = (float) config.cluster_mcl_inflation;
+	const float expansion = (float) config.cluster_mcl_expansion;
+
+	atomic_uint32_t component_counter(0);
+
+	auto mcl_clustering = [&](uint32_t iThr){
+		const uint32_t max_counter = sort_order.size();
+		uint32_t my_counter = component_counter.fetch_add(1, memory_order_relaxed);
+		while(my_counter < max_counter){
+			uint32_t iComponent = sort_order[my_counter];
 			vector<uint32_t> order = indices[iComponent];
 			if(order.size() > 1){
 				vector<Eigen::Triplet<float>> m = components[iComponent];
 				assert(m.size() <= order.size() * order.size());
 				float sparsity = 1.0-(1.0 * m.size()) / (order.size()*order.size());
-				max_sparsity = max(max_sparsity, sparsity);
-				min_sparsity = min(min_sparsity, sparsity);
+				max_sparsities[iThr] = max(max_sparsities[iThr], sparsity);
+				min_sparsities[iThr] = min(min_sparsities[iThr], sparsity);
 				// map back to original ids
 				vector<unordered_set<uint32_t>> list_of_sets;
 
 				//TODO: a size limit for the dense matrix should control this as well
-				std::chrono::high_resolution_clock::time_point t = std::chrono::high_resolution_clock::now();
+				chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
 				if(sparsity >= config.cluster_mcl_sparsity_switch && expansion - (int) expansion == 0){ 
 					n_sparse_calculations++;
 					Eigen::SparseMatrix<float> m_sparse(order.size(), order.size());
 					m_sparse.setFromTriplets(m.begin(), m.end());
-					sparse_create_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+					sparse_create_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 					list_of_sets = markov_process(&m_sparse, inflation, expansion);
 				}
 				else{
@@ -409,30 +405,83 @@ void MCL::run(){
 					for(Eigen::Triplet<float> const & t : m){
 						m_dense(t.row(), t.col()) = t.value();
 					}
-					dense_create_time += (double) std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t).count() / 1000.0;
+					dense_create_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
 					list_of_sets = markov_process(&m_dense, inflation, expansion);
 				}
 				for(unordered_set<uint32_t> subset : list_of_sets){
+					// FIXME: this does not assign stable cluster ids
+					uint32_t c_id = cluster_id.fetch_add(1, memory_order_relaxed);
 					for(uint32_t el : subset){
-						clustering_result[order[el]] = cluster_id;
+						clustering_result[order[el]] = c_id;
 					}
-					cluster_id++;
 					if(subset.size() == 1) nClustersEq1 ++;
 				}
 			}
 			else if (order.size() == 1){
-				clustering_result[order[0]] = cluster_id++;
+				uint32_t c_id = cluster_id.fetch_add(1, memory_order_relaxed);
+				clustering_result[order[0]] = c_id;
 				nClustersEq1++;
 			}
+			my_counter = component_counter.fetch_add(1, memory_order_relaxed);
 		}
+	};
+
+	vector<thread> threads;
+	for(uint32_t iThread = 0; iThread < nThreads ; iThread++ ) {
+		min_sparsities[iThread] = 1.0f;
+		max_sparsities[iThread] = 0.0f;
+		threads.emplace_back(mcl_clustering, iThread);
 	}
+
+	float min_sparsity = 1.0f;
+	float max_sparsity = 0.0f;
+	for(uint32_t iThread = 0; iThread < nThreads ; iThread++ ) {
+		threads[iThread].join();
+		min_sparsity = min(min_sparsity, min_sparsities[iThread]);
+		max_sparsity = max(max_sparsity, max_sparsities[iThread]);
+	}
+	delete[] min_sparsities;
+	delete[] max_sparsities;
+
 	timer.finish();
-	cout << "Found "<<cluster_id - nClustersEq1<< " ("<< cluster_id <<" incl. singletons) clusters with min sparsity "<<min_sparsity<< " and max. sparsity " << max_sparsity << " with " << n_dense_calculations << " dense, and " << n_sparse_calculations << " sparse calculations " << endl;
+	cout << "Found "
+		<< cluster_id - nClustersEq1
+		<< " ("<< cluster_id <<" incl. singletons) clusters with min sparsity "
+		<< min_sparsity 
+		<< " and max. sparsity " 
+		<< max_sparsity 
+		<< " with " 
+		<< n_dense_calculations 
+		<< " dense, and " 
+		<< n_sparse_calculations 
+		<< " sparse calculations " << endl;
+	cout << "Time used for matrix creation: " 
+		<< (sparse_create_time.load() + dense_create_time.load())/1000.0 
+		<<" (sparse: " 
+		<< sparse_create_time.load()/1000.0 
+		<< ", dense: " 
+		<< dense_create_time.load()/1000.0 <<")" << endl;
+	cout << "Time used for exp: " 
+		<< (sparse_exp_time.load() + dense_int_exp_time.load() + dense_gen_exp_time.load())/1000.0 
+		<< " (sparse: " 
+		<< sparse_exp_time.load()/1000.0 
+		<< ", dense int: " 
+		<< dense_int_exp_time.load()/1000.0 
+		<< ", dense gen: " 
+		<< dense_gen_exp_time.load()/1000.0 <<")" << endl; 
+	cout << "Time used for gamma: " 
+		<< (sparse_gamma_time.load() + dense_gamma_time.load())/1000.0 
+		<< " (sparse: " 
+		<< sparse_gamma_time.load()/1000.0 
+		<< ", dense: " 
+		<< dense_gamma_time.load()/1000.0 <<")" << endl;
+	cout << "Time used for listing: " 
+		<< (sparse_list_time.load() + dense_list_time.load())/1000.0 
+		<< " (sparse: " 
+		<< sparse_list_time.load()/1000.0 
+		<< ", dense: " 
+		<< dense_list_time.load()/1000.0 <<")" << endl;
 
-	cout << "Time used for matrix creation: " << sparse_create_time + dense_create_time <<" (sparse: " << sparse_create_time << ", dense: " << dense_create_time <<")" << endl;
-	cout << "Time used for exp: " << sparse_exp_time + dense_int_exp_time + dense_gen_exp_time << " (sparse: " << sparse_exp_time << ", dense int: " << dense_int_exp_time << ", dense gen: " << dense_gen_exp_time <<")" << endl; 
-	cout << "Time used for gamma: " << sparse_gamma_time + dense_gamma_time <<" (sparse: " << sparse_gamma_time << ", dense: " << dense_gamma_time <<")" << endl;
-	cout << "Time used for listing: " << sparse_list_time + dense_list_time <<" (sparse: " << sparse_list_time << ", dense: " << dense_list_time <<")" << endl;
 	timer.go("Cluster output");
 	ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str());
 	vector<Letter> seq;
@@ -449,6 +498,7 @@ void MCL::run(){
 	}
 	if (out != &cout) delete out;
 	db->close();
+	delete clustering_result;
 	timer.finish();
 }
 }}
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
index 7b11ebc5..73396135 100644
--- a/src/cluster/mcl.h
+++ b/src/cluster/mcl.h
@@ -27,6 +27,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include <memory>
 #include <fstream>
 #include <limits>
+#include <atomic>
 #include "../util/system/system.h"
 #include "../util/util.h"
 #include "../basic/config.h"
@@ -53,10 +54,15 @@ class MCL: public ClusteringAlgorithm {
 	vector<unordered_set<uint32_t>> get_list(Eigen::MatrixXf* m);
 	vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion);
 	vector<unordered_set<uint32_t>> markov_process(Eigen::MatrixXf* m, float inflation, float expansion);
-	double sparse_create_time, dense_create_time = 0.0;
-	double sparse_exp_time, dense_int_exp_time, dense_gen_exp_time = 0.0;
-	double sparse_gamma_time, dense_gamma_time = 0.0;
-	double sparse_list_time, dense_list_time = 0.0;
+	atomic_uint64_t sparse_create_time = {0};
+	atomic_uint64_t dense_create_time = {0};
+	atomic_uint64_t sparse_exp_time = {0};
+	atomic_uint64_t dense_int_exp_time = {0};
+	atomic_uint64_t dense_gen_exp_time = {0};
+	atomic_uint64_t sparse_gamma_time = {0};
+	atomic_uint64_t dense_gamma_time = {0};
+	atomic_uint64_t sparse_list_time = {0};
+	atomic_uint64_t dense_list_time = {0};
 public:
 	void run();
 	string get_key();

From c127db88040e8685f36f93c11700f8e921966ad7 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sun, 24 May 2020 21:40:29 +0200
Subject: [PATCH 16/21] also allowing for log as input

---
 src/output/clustering_format.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/output/clustering_format.cpp b/src/output/clustering_format.cpp
index 1f68efab..e2cddfe9 100644
--- a/src/output/clustering_format.cpp
+++ b/src/output/clustering_format.cpp
@@ -107,6 +107,12 @@ class RecursiveParser{
 			advance(1); // ')'
 			return std::exp(result1);
 		}
+		else if (peek() == 'l' && peek(1)=='o' && peek(2)=='g' && peek(3)=='('){
+			advance(4); // 'log('
+			double result1 = expression();
+			advance(1); // ')'
+			return std::exp(result1);
+		}
 		else {
 			return VariableRegistry::get(variable())->get(r);
 		}

From bb3d305bc249821518651e6803ca2cf0a92696d7 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Sun, 24 May 2020 21:42:25 +0200
Subject: [PATCH 17/21] correct log

---
 src/output/clustering_format.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/output/clustering_format.cpp b/src/output/clustering_format.cpp
index e2cddfe9..3fffbd81 100644
--- a/src/output/clustering_format.cpp
+++ b/src/output/clustering_format.cpp
@@ -111,7 +111,7 @@ class RecursiveParser{
 			advance(4); // 'log('
 			double result1 = expression();
 			advance(1); // ')'
-			return std::exp(result1);
+			return std::log(result1);
 		}
 		else {
 			return VariableRegistry::get(variable())->get(r);

From 8481e0d7360e40b597cead68f1b6131932648867 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Tue, 26 May 2020 09:14:24 +0200
Subject: [PATCH 18/21] change atomic types to be able to compile with clang5.0

---
 src/cluster/mcl.cpp | 12 ++++++------
 src/cluster/mcl.h   | 18 +++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index 0224a2d7..634b6e49 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -366,14 +366,14 @@ void MCL::run(){
 	uint32_t nThreads = min(config.threads_, nComponentsLt1);
 	float* max_sparsities = new float[nThreads];
 	float* min_sparsities = new float[nThreads];
-	atomic_uint32_t cluster_id(0);
-	atomic_uint32_t nClustersEq1(0);
-	atomic_uint32_t n_dense_calculations(0);
-	atomic_uint32_t n_sparse_calculations(0);
+	atomic_uint cluster_id(0);
+	atomic_uint nClustersEq1(0);
+	atomic_uint n_dense_calculations(0);
+	atomic_uint n_sparse_calculations(0);
 	const float inflation = (float) config.cluster_mcl_inflation;
 	const float expansion = (float) config.cluster_mcl_expansion;
 
-	atomic_uint32_t component_counter(0);
+	atomic_uint component_counter(0);
 
 	auto mcl_clustering = [&](uint32_t iThr){
 		const uint32_t max_counter = sort_order.size();
@@ -498,7 +498,7 @@ void MCL::run(){
 	}
 	if (out != &cout) delete out;
 	db->close();
-	delete clustering_result;
+	delete[] clustering_result;
 	timer.finish();
 }
 }}
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
index 73396135..5de3aaef 100644
--- a/src/cluster/mcl.h
+++ b/src/cluster/mcl.h
@@ -54,15 +54,15 @@ class MCL: public ClusteringAlgorithm {
 	vector<unordered_set<uint32_t>> get_list(Eigen::MatrixXf* m);
 	vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion);
 	vector<unordered_set<uint32_t>> markov_process(Eigen::MatrixXf* m, float inflation, float expansion);
-	atomic_uint64_t sparse_create_time = {0};
-	atomic_uint64_t dense_create_time = {0};
-	atomic_uint64_t sparse_exp_time = {0};
-	atomic_uint64_t dense_int_exp_time = {0};
-	atomic_uint64_t dense_gen_exp_time = {0};
-	atomic_uint64_t sparse_gamma_time = {0};
-	atomic_uint64_t dense_gamma_time = {0};
-	atomic_uint64_t sparse_list_time = {0};
-	atomic_uint64_t dense_list_time = {0};
+	atomic_ullong sparse_create_time = {0};
+	atomic_ullong dense_create_time = {0};
+	atomic_ullong sparse_exp_time = {0};
+	atomic_ullong dense_int_exp_time = {0};
+	atomic_ullong dense_gen_exp_time = {0};
+	atomic_ullong sparse_gamma_time = {0};
+	atomic_ullong dense_gamma_time = {0};
+	atomic_ullong sparse_list_time = {0};
+	atomic_ullong dense_list_time = {0};
 public:
 	void run();
 	string get_key();

From cb6dc00516e7183d9794ba3c276a97ae7f220513 Mon Sep 17 00:00:00 2001
From: Patrick Ettenhuber <pettenhuber@gmail.com>
Date: Thu, 28 May 2020 08:21:34 +0200
Subject: [PATCH 19/21] some minor improvements

- moving atomic updates out of loop
- work distribution is only governed by thread id
- printing of node labels, i.e. attractor, normal and single
---
 src/cluster/mcl.cpp | 172 ++++++++++++++++++++++++++++++--------------
 src/cluster/mcl.h   |   8 +--
 2 files changed, 123 insertions(+), 57 deletions(-)

diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index 634b6e49..fdfcc58c 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -17,6 +17,10 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
 #include "mcl.h"
+#define MASK_INVERSE 0xC0000000
+#define MASK_NORMAL_NODE 0x40000000
+#define MASK_ATTRACTOR_NODE 0x80000000
+#define MASK_SINGLE_NODE 0xC0000000
 
 using namespace std;
 
@@ -30,7 +34,9 @@ string MCL::get_description() {
 }
 
 void MCL::get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r){
+#ifdef MCL_TIMINGS
 	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
+#endif
 	if( r - (int) r == 0 ){
 		// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
 		*out = *in;
@@ -42,18 +48,24 @@ void MCL::get_exp(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* ou
 		throw runtime_error(" Eigen does not provide an eigenvalue solver for sparse matrices");
 	}
 	*out = out->pruned();
+#ifdef MCL_TIMINGS
 	sparse_exp_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
+#endif
 }
 
 void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
 	// TODO: at some r it may be more beneficial to diagnoalize in and only take the exponents of the eigenvalues
+#ifdef MCL_TIMINGS
 	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
+#endif
 	if(r - (int) r == 0){
 		*out = *in;
 		for(uint32_t i=1; i<r; i++){ 
 			(*out) *= (*in);
 		}
+#ifdef MCL_TIMINGS
 		dense_int_exp_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
+#endif
 	}
 	else{ 
 		// TODO: check whether the matrix is self-adjoint and use SelfAdjointEigenSolver instead
@@ -68,12 +80,16 @@ void MCL::get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
 		if( thr > numeric_limits<float>::epsilon() ){
 			out->noalias() = (V * d.real() * V.inverse()).real();
 		}
+#ifdef MCL_TIMINGS
 		dense_gen_exp_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
+#endif
 	}
 }
 
 void MCL::get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r){
+#ifdef MCL_TIMINGS
 	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
+#endif
 	vector<Eigen::Triplet<float>> data;
 	for (uint32_t k=0; k<in->outerSize(); ++k){
 		float colSum = 0.0f;
@@ -86,11 +102,15 @@ void MCL::get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>*
 	}
 	out->setFromTriplets(data.begin(), data.end(), [] (const float&, const float &b) { return b; });
 	*out = out->pruned(1.0, numeric_limits<float>::epsilon());
+#ifdef MCL_TIMINGS
 	sparse_gamma_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
+#endif
 }
 
 void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
+#ifdef MCL_TIMINGS
 	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
+#endif
 	// Note that Eigen matrices are column-major, so this is the most efficient way
 	for (uint32_t icol=0; icol<in->cols(); ++icol){
 		float colSum = 0.0f;
@@ -101,33 +121,9 @@ void MCL::get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r){
 			out->coeffRef(irow, icol) =  pow(in->coeffRef(irow, icol) , r) / colSum;
 		}
 	}
+#ifdef MCL_TIMINGS
 	dense_gamma_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
-}
-
-vector<unordered_set<uint32_t>> MCL::get_list(Eigen::SparseMatrix<float>* m){
-	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
-	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
-	for (uint32_t k=0; k<m->outerSize(); ++k){
-		for (Eigen::SparseMatrix<float>::InnerIterator it(*m, k); it; ++it){
-			disjointSet.merge(it.row(), it.col());
-		}
-	}
-	sparse_list_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
-	return disjointSet.getListOfSets();
-}
-
-vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
-	chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
-	LazyDisjointIntegralSet<uint32_t> disjointSet(m->cols());
-	for (uint32_t icol=0; icol<m->cols(); ++icol){
-		for (uint32_t irow=0; irow<m->rows(); ++irow){
-			if( abs((*m)(irow, icol)) > numeric_limits<float>::epsilon()){
-				disjointSet.merge(irow, icol);
-			}
-		}
-	}
-	dense_list_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
-	return disjointSet.getListOfSets();
+#endif
 }
 
 // void printMatrix(Eigen::SparseMatrix<float>* m){
@@ -163,7 +159,7 @@ vector<unordered_set<uint32_t>> MCL::get_list(Eigen::MatrixXf* m){
 // 	printf("]\n\n");
 // }
 
-vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion){
+void MCL::markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion){
 	for (uint32_t idiag=0; idiag<m->cols(); ++idiag){
 		assert(abs(m->coeffRef(idiag, idiag)) > numeric_limits<float>::epsilon());
 	}
@@ -180,10 +176,9 @@ vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::SparseMatrix<float>*
 		*m = m_update;
 		iteration++;
 	}
-	return get_list(m);
 }
 
-vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::MatrixXf* m, float inflation, float expansion){
+void MCL::markov_process(Eigen::MatrixXf* m, float inflation, float expansion){
 	for (uint32_t idiag=0; idiag<m->cols(); ++idiag){
 		assert(abs(m->coeffRef(idiag, idiag)) > numeric_limits<float>::epsilon());
 	}
@@ -200,7 +195,6 @@ vector<unordered_set<uint32_t>> MCL::markov_process(Eigen::MatrixXf* m, float in
 		*m = m_update;
 		iteration++;
 	}
-	return get_list(m);
 }
 
 void MCL::run(){
@@ -364,21 +358,27 @@ void MCL::run(){
 	}
 
 	uint32_t nThreads = min(config.threads_, nComponentsLt1);
+	// note that the disconnected components are sorted by size
+	const uint32_t max_counter = sort_order.size();
 	float* max_sparsities = new float[nThreads];
 	float* min_sparsities = new float[nThreads];
-	atomic_uint cluster_id(0);
-	atomic_uint nClustersEq1(0);
-	atomic_uint n_dense_calculations(0);
-	atomic_uint n_sparse_calculations(0);
 	const float inflation = (float) config.cluster_mcl_inflation;
 	const float expansion = (float) config.cluster_mcl_expansion;
 
+	atomic_uint n_clusters_found(0);
 	atomic_uint component_counter(0);
+	atomic_uint n_dense_calculations(0);
+	atomic_uint n_sparse_calculations(0);
+	atomic_uint nClustersEq1(0);
 
-	auto mcl_clustering = [&](uint32_t iThr){
-		const uint32_t max_counter = sort_order.size();
-		uint32_t my_counter = component_counter.fetch_add(1, memory_order_relaxed);
-		while(my_counter < max_counter){
+	auto mcl_clustering = [&](const uint32_t iThr){
+		uint32_t n_dense = 0;
+		uint32_t n_sparse = 0;
+		uint32_t n_singletons = 0;
+		uint32_t cluster_id = iThr;
+		//uint32_t my_counter = component_counter.fetch_add(1, memory_order_relaxed);
+		//while(my_counter < max_counter){
+		for(uint32_t my_counter = iThr; my_counter<max_counter; my_counter += nThreads){
 			uint32_t iComponent = sort_order[my_counter];
 			vector<uint32_t> order = indices[iComponent];
 			if(order.size() > 1){
@@ -389,41 +389,89 @@ void MCL::run(){
 				min_sparsities[iThr] = min(min_sparsities[iThr], sparsity);
 				// map back to original ids
 				vector<unordered_set<uint32_t>> list_of_sets;
+				unordered_set<uint32_t> attractors;
 
 				//TODO: a size limit for the dense matrix should control this as well
+#ifdef MCL_TIMINGS
 				chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
+#endif
 				if(sparsity >= config.cluster_mcl_sparsity_switch && expansion - (int) expansion == 0){ 
-					n_sparse_calculations++;
+					n_sparse++;
 					Eigen::SparseMatrix<float> m_sparse(order.size(), order.size());
 					m_sparse.setFromTriplets(m.begin(), m.end());
+#ifdef MCL_TIMINGS
 					sparse_create_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
-					list_of_sets = markov_process(&m_sparse, inflation, expansion);
+#endif
+					markov_process(&m_sparse, inflation, expansion);
+#ifdef MCL_TIMINGS
+					chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
+#endif
+					LazyDisjointIntegralSet<uint32_t> disjointSet(m_sparse.cols());
+					for (uint32_t k=0; k<m_sparse.outerSize(); ++k){
+						for (Eigen::SparseMatrix<float>::InnerIterator it(m_sparse, k); it; ++it){
+							assert(abs(it.value()) > numeric_limits<float>::epsilon());
+							disjointSet.merge(it.row(), it.col());
+							if(it.row() == it.col()){
+								attractors.emplace(it.row());
+							}
+						}
+					}
+					list_of_sets = disjointSet.getListOfSets();
+#ifdef MCL_TIMINGS
+					sparse_list_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
+#endif
 				}
 				else{
-					n_dense_calculations++;
+					n_dense++;
 					Eigen::MatrixXf m_dense = Eigen::MatrixXf::Zero(order.size(), order.size());
 					for(Eigen::Triplet<float> const & t : m){
 						m_dense(t.row(), t.col()) = t.value();
 					}
+#ifdef MCL_TIMINGS
 					dense_create_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
-					list_of_sets = markov_process(&m_dense, inflation, expansion);
+#endif
+					markov_process(&m_dense, inflation, expansion);
+#ifdef MCL_TIMINGS
+					chrono::high_resolution_clock::time_point t = chrono::high_resolution_clock::now();
+#endif
+					LazyDisjointIntegralSet<uint32_t> disjointSet(m_dense.cols());
+					for (uint32_t icol=0; icol<m_dense.cols(); ++icol){
+						for (uint32_t irow=0; irow<m_dense.rows(); ++irow){
+							if( abs(m_dense(irow, icol)) > numeric_limits<float>::epsilon()){
+								disjointSet.merge(irow, icol);
+								if(irow == icol){
+									attractors.emplace(irow);
+								}
+							}
+						}
+					}
+					list_of_sets = disjointSet.getListOfSets();
+#ifdef MCL_TIMINGS
+					dense_list_time += chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count();
+#endif
 				}
 				for(unordered_set<uint32_t> subset : list_of_sets){
-					// FIXME: this does not assign stable cluster ids
-					uint32_t c_id = cluster_id.fetch_add(1, memory_order_relaxed);
+					assert(cluster_id < 0x00ffffff);
 					for(uint32_t el : subset){
-						clustering_result[order[el]] = c_id;
+						const uint32_t mask = attractors.find(el) == attractors.end() ? MASK_NORMAL_NODE : MASK_ATTRACTOR_NODE;
+						clustering_result[order[el]] = mask | cluster_id;
 					}
-					if(subset.size() == 1) nClustersEq1 ++;
+					if(subset.size() == 1) n_singletons++;
+					cluster_id += nThreads;
 				}
 			}
 			else if (order.size() == 1){
-				uint32_t c_id = cluster_id.fetch_add(1, memory_order_relaxed);
-				clustering_result[order[0]] = c_id;
-				nClustersEq1++;
+				assert(cluster_id < 0x00ffffff);
+				clustering_result[order[0]] = MASK_SINGLE_NODE | cluster_id;
+				cluster_id += nThreads;
+				n_singletons++;
 			}
-			my_counter = component_counter.fetch_add(1, memory_order_relaxed);
+			//my_counter = component_counter.fetch_add(1, memory_order_relaxed);
 		}
+		n_clusters_found.fetch_add((cluster_id-iThr)/nThreads, memory_order_relaxed);
+		n_dense_calculations.fetch_add(n_dense, memory_order_relaxed);
+		n_sparse_calculations.fetch_add(n_sparse, memory_order_relaxed);
+		nClustersEq1.fetch_add(n_singletons, memory_order_relaxed);
 	};
 
 	vector<thread> threads;
@@ -445,8 +493,8 @@ void MCL::run(){
 
 	timer.finish();
 	cout << "Found "
-		<< cluster_id - nClustersEq1
-		<< " ("<< cluster_id <<" incl. singletons) clusters with min sparsity "
+		<< n_clusters_found - nClustersEq1
+		<< " ("<< n_clusters_found <<" incl. singletons) clusters with min sparsity "
 		<< min_sparsity 
 		<< " and max. sparsity " 
 		<< max_sparsity 
@@ -455,6 +503,7 @@ void MCL::run(){
 		<< " dense, and " 
 		<< n_sparse_calculations 
 		<< " sparse calculations " << endl;
+#ifdef MCL_TIMINGS
 	cout << "Time used for matrix creation: " 
 		<< (sparse_create_time.load() + dense_create_time.load())/1000.0 
 		<<" (sparse: " 
@@ -481,6 +530,7 @@ void MCL::run(){
 		<< sparse_list_time.load()/1000.0 
 		<< ", dense: " 
 		<< dense_list_time.load()/1000.0 <<")" << endl;
+#endif
 
 	timer.go("Cluster output");
 	ostream *out = config.output_file.empty() ? &cout : new ofstream(config.output_file.c_str());
@@ -492,7 +542,23 @@ void MCL::run(){
 	out->precision(3);
 	for (int i = 0; i < (int)db->ref_header.sequences; ++i) {
 		db->read_seq(id, seq);
-		(*out) << blast_id(id) << '\t' << clustering_result[i] << endl;		
+		const uint32_t res = clustering_result[i];
+		(*out) << blast_id(id) << '\t' << (res & (~MASK_INVERSE)) << '\t';
+		switch(res & MASK_INVERSE){
+			case MASK_SINGLE_NODE:
+				(*out) << 's';
+				break;
+			case MASK_ATTRACTOR_NODE:
+				(*out) << 'a';
+				break;
+			case MASK_NORMAL_NODE:
+				(*out) << 'n';
+				break;
+			default:
+				(*out) << (res & MASK_INVERSE) <<endl;
+				throw runtime_error("Unknown node label");
+		}
+		(*out) << endl;
 		id.clear();
 		seq.clear();
 	}
diff --git a/src/cluster/mcl.h b/src/cluster/mcl.h
index 5de3aaef..ec0e178d 100644
--- a/src/cluster/mcl.h
+++ b/src/cluster/mcl.h
@@ -50,10 +50,9 @@ class MCL: public ClusteringAlgorithm {
 	void get_exp(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r);
 	void get_gamma(Eigen::SparseMatrix<float>* in, Eigen::SparseMatrix<float>* out, float r);
 	void get_gamma(Eigen::MatrixXf* in, Eigen::MatrixXf* out, float r);
-	vector<unordered_set<uint32_t>> get_list(Eigen::SparseMatrix<float>* m);
-	vector<unordered_set<uint32_t>> get_list(Eigen::MatrixXf* m);
-	vector<unordered_set<uint32_t>> markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion);
-	vector<unordered_set<uint32_t>> markov_process(Eigen::MatrixXf* m, float inflation, float expansion);
+	void markov_process(Eigen::SparseMatrix<float>* m, float inflation, float expansion);
+	void markov_process(Eigen::MatrixXf* m, float inflation, float expansion);
+#ifdef MCL_TIMINGS
 	atomic_ullong sparse_create_time = {0};
 	atomic_ullong dense_create_time = {0};
 	atomic_ullong sparse_exp_time = {0};
@@ -63,6 +62,7 @@ class MCL: public ClusteringAlgorithm {
 	atomic_ullong dense_gamma_time = {0};
 	atomic_ullong sparse_list_time = {0};
 	atomic_ullong dense_list_time = {0};
+#endif
 public:
 	void run();
 	string get_key();

From 026c9ef52ed3374ddddee0459b19f705dd1d9646 Mon Sep 17 00:00:00 2001
From: Benjamin Buchfink <buchfink@gmail.com>
Date: Fri, 5 Jun 2020 11:48:21 +0200
Subject: [PATCH 20/21] Fixed include.

---
 src/cluster/mcl.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/cluster/mcl.cpp b/src/cluster/mcl.cpp
index fdfcc58c..a6696278 100644
--- a/src/cluster/mcl.cpp
+++ b/src/cluster/mcl.cpp
@@ -16,6 +16,8 @@ GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ****/
+
+#include <numeric>
 #include "mcl.h"
 #define MASK_INVERSE 0xC0000000
 #define MASK_NORMAL_NODE 0x40000000

From 2e4bb1082f63a4a2890cf150a40d8f6267b3cd89 Mon Sep 17 00:00:00 2001
From: Benjamin Buchfink <buchfink@gmail.com>
Date: Fri, 5 Jun 2020 19:03:39 +0200
Subject: [PATCH 21/21] Fixed ifdef.

---
 src/tools/benchmark.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tools/benchmark.cpp b/src/tools/benchmark.cpp
index 5337cd9b..46a2d247 100644
--- a/src/tools/benchmark.cpp
+++ b/src/tools/benchmark.cpp
@@ -80,7 +80,7 @@ void benchmark_ungapped(const sequence &s1, const sequence &s2)
 	cout << "Scalar ungapped extension:\t" << (double)time_span.count() / (n*64) * 1000 << " ps/Cell" << endl;
 }
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__) && defined(__SSE4_1__)
 void benchmark_ssse3_shuffle(const sequence &s1, const sequence &s2)
 {
 	static const size_t n = 100000000llu;
@@ -262,7 +262,7 @@ void benchmark() {
 	benchmark_hamming(s1, s2);
 #endif
 	benchmark_ungapped(ss1, ss2);
-#ifdef __SSSE3__
+#if defined(__SSSE3__) && defined(__SSE4_1__)
 	benchmark_ssse3_shuffle(s1, s2);
 #endif
 #ifdef __SSE4_1__