Skip to content

Commit

Permalink
Merged w/ master.
Browse files Browse the repository at this point in the history
  • Loading branch information
bbuchfink committed Jun 7, 2020
2 parents 2dcb98b + 2e4bb10 commit e7bb7e9
Show file tree
Hide file tree
Showing 20 changed files with 1,734 additions and 63 deletions.
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ set(OBJECTS
src/run/tools.cpp
src/chaining/greedy_align.cpp
src/output/output_format.cpp
src/output/clustering_variables.cpp
src/output/clustering_format.cpp
src/output/join_blocks.cpp
src/data/frequent_seeds.cpp
src/align/legacy/query_mapper.cpp
Expand Down Expand Up @@ -171,7 +173,6 @@ set(OBJECTS
src/data/seed_array.cpp
src/output/paf_format.cpp
src/util/system/system.cpp
src/run/cluster.cpp
src/util/algo/greedy_vortex_cover.cpp
src/util/sequence/sequence.cpp
src/tools/tsv_record.cpp
Expand All @@ -192,6 +193,9 @@ set(OBJECTS
src/align/gapped.cpp
src/align/culling.cpp
src/cluster/medoid.cpp
src/cluster/cluster_registry.cpp
src/cluster/multi_step_cluster.cpp
src/cluster/mcl.cpp
src/align/output.cpp
src/tools/roc.cpp
src/test/data.cpp
Expand Down
20 changes: 19 additions & 1 deletion src/basic/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "shape_config.h"
#include "../util/io/temp_file.h"
#include "../basic/match.h"
#include "../cluster/cluster_registry.h"
#include "../basic/translate.h"
#include "../dp/dp.h"
#include "masking.h"
Expand Down Expand Up @@ -148,6 +149,14 @@ Config::Config(int argc, const char **argv, bool check_io)
makedb.add()
("in", 0, "input reference file in FASTA format", input_ref_file);

Options_group cluster("Cluster options");
cluster.add()
("cluster-algo", 0, "Clustering algorithm (\"multi-step\", \"mcl\")", cluster_algo)
("cluster-similarity", 0, "Clustering similarity measure", cluster_similarity)
("mcl-expansion", 0, "MCL expansion coefficient (default=2)", cluster_mcl_expansion, (uint32_t) 2)
("mcl-inflation", 0, "MCL inflation coefficient (default=2.0)", cluster_mcl_inflation, 2.0)
("mcl-sparsity-switch", 0, "MCL switch to sparse matrix computation (default=0.8) ", cluster_mcl_sparsity_switch, 0.8);

Options_group aligner("Aligner options");
aligner.add()
("query", 'q', "input query file", query_file)
Expand Down Expand Up @@ -332,7 +341,7 @@ Config::Config(int argc, const char **argv, bool check_io)
("self", 0, "", self)
("trace-pt-fetch-size", 0, "", trace_pt_fetch_size, (size_t)10e9);

parser.add(general).add(makedb).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options);
parser.add(general).add(makedb).add(cluster).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options);
parser.store(argc, argv, command);

if (long_reads) {
Expand Down Expand Up @@ -475,6 +484,15 @@ Config::Config(int argc, const char **argv, bool check_io)
throw std::runtime_error("Custom scoring matrices require setting the --gapopen and --gapextend options.");
score_matrix = Score_matrix(matrix_file, lambda, K, gap_open, gap_extend);
}
if(command == Config::cluster && !Workflow::Cluster::ClusterRegistry::has(cluster_algo)){
ostream &header_out = command == Config::help ? cout : cerr;
header_out << "Unkown clustering algorithm: " << cluster_algo << endl;
header_out << "Available options are: " << endl;
for(string c_algo : Workflow::Cluster::ClusterRegistry::getKeys()){
header_out << "\t" << c_algo << "\t"<< Workflow::Cluster::ClusterRegistry::get(c_algo)->get_description() << endl;
}
throw std::runtime_error("Clustering algorithm not found.");
}
message_stream << "Scoring parameters: " << score_matrix << endl;
if (masking == 1)
Masking::instance = unique_ptr<Masking>(new Masking(score_matrix));
Expand Down
6 changes: 6 additions & 0 deletions src/basic/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,12 @@ struct Config
enum { double_indexed = 0, query_indexed = 1, subject_indexed = 2 };
int algo;

string cluster_algo;
double cluster_mcl_inflation;
uint32_t cluster_mcl_expansion;
double cluster_mcl_sparsity_switch;
string cluster_similarity;

enum { query_parallel = 0, target_parallel = 1 };
unsigned load_balancing;

Expand Down
3 changes: 2 additions & 1 deletion src/basic/masking.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#pragma once
#include <memory>
#include "value.h"
#include "score_matrix.h"
Expand All @@ -41,4 +42,4 @@ struct Masking
Letter mask_table_x_[size], mask_table_bit_[size];
};

size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
27 changes: 27 additions & 0 deletions src/cluster/cluster.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/****
DIAMOND protein aligner
Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
Code developed by Patrick Ettenhuber <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#pragma once
#include <string>
/**
 * Abstract interface that every clustering algorithm implements.
 *
 * Instances are handled through ClusteringAlgorithm pointers (for example the
 * registry stores ClusteringAlgorithm* values keyed by get_key()).
 */
class ClusteringAlgorithm {
public:
	/**
	 * Virtual destructor so that destroying a concrete algorithm through a
	 * ClusteringAlgorithm pointer runs the derived destructor instead of
	 * being undefined behaviour.
	 */
	virtual ~ClusteringAlgorithm() = default;

	/** Entry point of the algorithm; inputs and outputs are defined by the implementation. */
	virtual void run() = 0;

	/** Returns the key that identifies this algorithm (used as the lookup key by the registry). */
	virtual std::string get_key() = 0;

	/** Returns a description of the algorithm, shown to the user when listing available algorithms. */
	virtual std::string get_description() = 0;
};
29 changes: 29 additions & 0 deletions src/cluster/cluster_registry.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/****
DIAMOND protein aligner
Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
Code developed by Patrick Ettenhuber <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#include "cluster_registry.h"

using namespace std;

namespace Workflow { namespace Cluster{
// Out-of-class definitions of ClusterRegistry's static data members.
//
// The order of these definitions matters: objects with static storage duration
// defined in one translation unit are initialized in definition order, and the
// StaticConstructor constructor (declared in cluster_registry.h) calls
// get_key() on multiStep and mcl and inserts their addresses into regMap.
// Therefore regMap, mcl and multiStep must all be defined before
// _staticConstructor. Keep new algorithm instances above the last line.
map<string, ClusteringAlgorithm*> ClusterRegistry::regMap;
MCL ClusterRegistry::mcl;
MultiStep ClusterRegistry::multiStep;
// Constructing this object populates regMap (see StaticConstructor).
ClusterRegistry::StaticConstructor ClusterRegistry::_staticConstructor;
}}
64 changes: 64 additions & 0 deletions src/cluster/cluster_registry.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/****
DIAMOND protein aligner
Copyright (C) 2020 QIAGEN A/S (Aarhus, Denmark)
Code developed by Patrick Ettenhuber <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#pragma once
#include <map>
#include <stdexcept>
#include <string>
#include <vector>
#include "cluster.h"
#include "mcl.h"
#include "multi_step_cluster.h"

using namespace std;

namespace Workflow { namespace Cluster{
class ClusterRegistry{
private:
ClusterRegistry(){};
// To include new clustering algorithms add the instantiation here and in the cluster_registry.cpp file. Then add it to the StaticConstructor below
static MCL mcl;
static MultiStep multiStep;
public:
static map<string, ClusteringAlgorithm*> regMap;
static ClusteringAlgorithm* get(string key){
map<string, ClusteringAlgorithm*>::iterator ca = ClusterRegistry::regMap.find(key);
if(ca == ClusterRegistry::regMap.end()){
throw std::runtime_error("Clustering algorithm not found.");
}
return ca->second;
}
static bool has(string key){
return ClusterRegistry::regMap.find(key) != ClusterRegistry::regMap.end();
}
static vector<string> getKeys(){
auto it = regMap.begin();
vector<string> keys;
while(it != regMap.end()){
keys.push_back(it->first);
it++;
}
return keys;
}
static struct StaticConstructor {
StaticConstructor() {
// Add any new clustering algorithm here
regMap.emplace(multiStep.get_key(), &multiStep);
regMap.emplace(mcl.get_key(), &mcl);
}
} _staticConstructor;
};

}}
Loading

0 comments on commit e7bb7e9

Please sign in to comment.