diff --git a/README b/README
index 62852c8..ade7e0d 100644
--- a/README
+++ b/README
@@ -8,68 +8,71 @@ CXX=g++-7 make
see: https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite
-
Linux/Unix compilation:
make
-Usage: bin/meshclust2 --id 0.x [OPTIONS] *.fasta
-
---id The most important parameter, --id, controls the identity cutoff of the sequences.
- Needs to be between 0 and 1.
- If it is not specified, an identity of 0.9 is used.
-
---kmer decides the size of the kmers. It is by default automatically decided by average sequence length,
- but if provided, MeShClust can speed up a little by not having to find the largest sequence length.
- Increasing kmer size can increase accuracy, but increases memory consumption.
-
---mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation}
- changes the mutation generation algorithm. By default, "single" is used, utilizing only
- single point mutations. On low identity data sets, "both", which includes single mutations
- and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations,
- disallowing single point mutations. Other options include "all", which includes single,
- block, and nontypical mutations translocation and reversion.
-
---feat determines the combinations of features to be used. By default, "fast" allows 9 fast combinations
- to be selected from. "slow" adds 2 slower features which include logarithm based features,
- and "extraslow" includes 33 total features used in a previous study.
---min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs
- will be used. Recall that features include pairwise combinations of the "feat" option.
+If you find this tool helpful, please cite:
---max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly,
- so a very large maximum is not advised.
+James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.
---sample selects the total number of sequences used for both training and testing.
- 300 is the default value. Each sequence generates 10 synthetic mutants.
- That is, --sample 300 provides 3000 training pairs and 3000 testing pairs.
-
---min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. Shouldn't need
- to be set normally, as lower identites take much longer, especially with single mutations only.
-
---threads sets the number of threads to be used. By default OpenMP uses the number of available cores
- on your machine, but this parameter overwrites that.
-
---quiet (no arguments) removes the progress bars from output
-
---output specifies the output file, in CD-HIT's CLSTR format, described below:
- A '>Cluster ' followed by an increasing index designates a cluster.
- Otherwise, the sequence is printed out.
- A '*' at the end of a sequence designates the center of the cluster.
- An example of a small data set:
-
- >Cluster 0
- 0 993nt, >seq128 template_6... *
- >Cluster 1
- 0 1043nt, >seq235 template_10...
- 1 1000nt, >seq216 template_10... *
- 2 1015nt, >seq237 template_10...
+Usage: bin/meshclust2 --id 0.x [OPTIONS] *.fasta
---delta decides how many clusters are looked around in the final clustering stage.
- Increasing it creates more accuracy, but takes more time. Default value is 5.
+--id The most important parameter, --id, controls the identity cutoff of the sequences.
+ Needs to be between 0 and 1.
+ If it is not specified, an identity of 0.9 is used.
+
+--kmer decides the size of the kmers. It is by default automatically decided by average sequence
+ length, but if provided, MeShClust can speed up a little by not having to find the largest
+ sequence length. Increasing kmer size can increase accuracy, but increases memory consumption.
+
+--mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation}
+ changes the mutation generation algorithm. By default, "single" is used, utilizing only
+ single point mutations. On low identity data sets, "both", which includes single mutations
+ and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations,
+ disallowing single point mutations. Other options include "all", which includes single,
+ block, and nontypical mutations translocation and reversion.
+
+--feat determines the combinations of features to be used. By default, "slow" allows 11
+ combinations to be selected from. "fast" removes 2 slower features from "slow"
+ which include logarithm based features, and "extraslow" includes 33 total features
+ used in a previous study.
+
+--min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs
+ will be used. Recall that features include pairwise combinations of the "feat" option.
+
+--max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly,
+ so a very large maximum is not advised.
+
+--sample selects the total number of sequences used for both training and testing.
+ 300 is the default value. Each sequence generates 10 synthetic mutants.
+ That is, --sample 300 provides 3000 training pairs and 3000 testing pairs.
+
+--min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated.
+ Shouldn't need to be set normally, as lower identites take much longer,
+ especially with single mutations only.
+
+--threads sets the number of threads to be used. By default OpenMP uses the number of available cores
+ on your machine, but this parameter overwrites that.
+
+--output specifies the output file, in CD-HIT's CLSTR format, described below:
+ A '>Cluster ' followed by an increasing index designates a cluster.
+ Otherwise, the sequence is printed out.
+ A '*' at the end of a sequence designates the center of the cluster.
+ An example of a small data set:
+ >Cluster 0
+ 0 993nt, >seq128 template_6... *
+ >Cluster 1
+ 0 1043nt, >seq235 template_10...
+ 1 1000nt, >seq216 template_10... *
+ 2 1015nt, >seq237 template_10...
+
+--delta decides how many clusters are looked around in the final clustering stage.
+ Increasing it creates more accuracy, but takes more time. Default value is 5.
--iterations specifies how many iterations in the final stage of merging are done until convergence.
- Default value is 15.
+ Default value is 15.
diff --git a/src/cluster/Makefile b/src/cluster/Makefile
index 817559a..9186210 100644
--- a/src/cluster/Makefile
+++ b/src/cluster/Makefile
@@ -1,5 +1,5 @@
TARGET ?= meshclust2
-VERSION ?= 2.0.0
+VERSION ?= 2.1.0
CXX ?= g++
ifeq ($(debug),yes)
CXXFLAGS += -ggdb -DDEBUG -fno-omit-frame-pointer -fopenmp
diff --git a/src/cluster/src/ClusterFactory.cpp b/src/cluster/src/ClusterFactory.cpp
index 741a325..c6ce7ae 100644
--- a/src/cluster/src/ClusterFactory.cpp
+++ b/src/cluster/src/ClusterFactory.cpp
@@ -332,7 +332,7 @@ void mean_shift_update(vector
> &part, int j, const Trainer& trn, i
cerr << "mean shift: NULL" << endl;
}
} else {
- cout << "GOOD: EMPTY" << endl;
+ //cout << "GOOD: EMPTY" << endl;
}
delete top;
delete temp;
diff --git a/src/cluster/src/Predictor.cpp b/src/cluster/src/Predictor.cpp
index f61c389..3f19ba9 100644
--- a/src/cluster/src/Predictor.cpp
+++ b/src/cluster/src/Predictor.cpp
@@ -356,16 +356,16 @@ void Predictor::train(const vector *> &points, const vector
size_t counter = 0;
// struct timespec start, stop;
// clock_gettime(CLOCK_MONOTONIC, &start);
- Progress prog(f_points_tr.size(), "Generating training data");
+ Progress prog1(f_points_tr.size(), "Generating training");
#pragma omp parallel for
for (size_t i = 0; i < f_points_tr.size(); i++) {
auto p = f_points_tr[i];
mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id);
mutate_seqs(p, 5, pos_buf, neg_buf, min_id, 100 * id, _id);
#pragma omp critical
- prog++;
+ prog1++;
}
- prog.end();
+ prog1.end();
// clock_gettime(CLOCK_MONOTONIC, &stop);
// printf("took %lu\n", stop.tv_sec - start.tv_sec);
@@ -389,7 +389,7 @@ void Predictor::train(const vector *> &points, const vector
}
pos_buf.clear();
neg_buf.clear();
- Progress prog2(f_points_test.size(), "Generating test data");
+ Progress prog2(f_points_test.size(), "Generating testing");
#pragma omp parallel for
for (size_t i = 0; i < f_points_test.size(); i++) {
auto p = f_points_test[i];
@@ -743,7 +743,7 @@ void Predictor::train_class(Feature* feat)
feat->finalize();
abs_best_acc = best_class_acc;
used_list.push_back(best_idx);
-// oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl;
+ oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl;
oss << "Accuracy: " << best_class_acc << endl;
possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end());
}
diff --git a/src/cluster/src/Progress.cpp b/src/cluster/src/Progress.cpp
index d5ace11..e16ef06 100644
--- a/src/cluster/src/Progress.cpp
+++ b/src/cluster/src/Progress.cpp
@@ -2,12 +2,6 @@
#include
#include
-bool Progress::is_quiet = false;
-
-void Progress::set_quiet(bool is_quiet_)
-{
- is_quiet = is_quiet_;
-}
Progress::Progress(long num, std::string prefix_)
{
pmax = num;
@@ -21,9 +15,6 @@ Progress::Progress(long num, std::string prefix_)
void Progress::print()
{
- if (is_quiet) {
- return;
- }
std::ostringstream oss;
double prog = (double)pcur / pmax;
oss << prefix << " [";
diff --git a/src/cluster/src/Progress.h b/src/cluster/src/Progress.h
index f60472b..f59d948 100644
--- a/src/cluster/src/Progress.h
+++ b/src/cluster/src/Progress.h
@@ -10,8 +10,6 @@
class Progress {
public:
- static void set_quiet(bool is_quiet_=true);
- static bool is_quiet;
Progress(long num, std::string prefix_);
~Progress() { end(); }
void end();
diff --git a/src/cluster/src/Runner.cpp b/src/cluster/src/Runner.cpp
index 9a142fa..8a367d7 100644
--- a/src/cluster/src/Runner.cpp
+++ b/src/cluster/src/Runner.cpp
@@ -30,7 +30,11 @@ Runner::Runner(int argc, char **argv)
// align = true;
// }
if (sample_size == 0) {
- sample_size = 300;
+ if (similarity < 0.6) {
+ sample_size = 1000;
+ } else {
+ sample_size = 300;
+ }
}
srand(10);
}
@@ -94,8 +98,76 @@ void usage(std::string progname)
#else
std::cout << " without OpenMP";
#endif
- std::cout << std::endl;
- std::cout << "See README for detailed options" << std::endl << std::endl;
+ std::cout << std::endl << std::endl;
+
+ std::string raw = R"(--id The most important parameter, --id, controls the identity cutoff of the sequences.
+ Needs to be between 0 and 1.
+ If it is not specified, an identity of 0.9 is used.
+
+--kmer decides the size of the kmers. It is by default automatically decided by average sequence
+ length, but if provided, MeShClust can speed up a little by not having to find the largest
+ sequence length. Increasing kmer size can increase accuracy, but increases memory consumption.
+
+--mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation}
+ changes the mutation generation algorithm. By default, "single" is used, utilizing only
+ single point mutations. On low identity data sets, "both", which includes single mutations
+ and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations,
+ disallowing single point mutations. Other options include "all", which includes single,
+ block, and nontypical mutations translocation and reversion.
+
+--feat determines the combinations of features to be used. By default, "slow" allows 11
+ combinations to be selected from. "fast" removes 2 slower features from "slow"
+ which include logarithm based features, and "extraslow" includes 33 total features
+ used in a previous study.
+
+--min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs
+ will be used. Recall that features include pairwise combinations of the "feat" option.
+
+--max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly,
+ so a very large maximum is not advised.
+
+--sample selects the total number of sequences used for both training and testing.
+ 300 is the default value. Each sequence generates 10 synthetic mutants.
+ That is, --sample 300 provides 3000 training pairs and 3000 testing pairs.
+
+--min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated.
+ Shouldn't need to be set normally, as lower identites take much longer,
+ especially with single mutations only.
+
+--threads sets the number of threads to be used. By default OpenMP uses the number of available cores
+ on your machine, but this parameter overwrites that.
+
+--output specifies the output file, in CD-HIT's CLSTR format, described below:
+ A '>Cluster ' followed by an increasing index designates a cluster.
+ Otherwise, the sequence is printed out.
+ A '*' at the end of a sequence designates the center of the cluster.
+ An example of a small data set:
+
+ >Cluster 0
+ 0 993nt, >seq128 template_6... *
+ >Cluster 1
+ 0 1043nt, >seq235 template_10...
+ 1 1000nt, >seq216 template_10... *
+ 2 1015nt, >seq237 template_10...
+
+--delta decides how many clusters are looked around in the final clustering stage.
+ Increasing it creates more accuracy, but takes more time. Default value is 5.
+
+--iterations specifies how many iterations in the final stage of merging are done until convergence.
+ Default value is 15.
+
+
+
+If the argument is not listed here, it is interpreted as an input (FASTA format) file.
+
+
+If you find this tool helpful, please cite:
+
+James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.
+
+)";
+
+ std::cout << raw << endl;
}
@@ -222,8 +294,6 @@ void Runner::get_opts(int argc, char **argv)
}
i++;
- } else if (arg == "-q" || arg == "--quiet") {
- Progress::set_quiet(true);
} else if ((arg == "-t" || arg == "--threads") && i + 1 < argc) {
try {
std::string opt = argv[i+1];
diff --git a/src/cluster/src/Runner.h b/src/cluster/src/Runner.h
index 6e04ebf..9cc8c04 100644
--- a/src/cluster/src/Runner.h
+++ b/src/cluster/src/Runner.h
@@ -34,7 +34,7 @@ class Runner {
int min_n_feat = 3;
int max_n_feat = 5;
int mut_type = HandleSeq::SINGLE;
- uint64_t feat_type = PRED_FEAT_FAST;
+ uint64_t feat_type = PRED_FEAT_FAST | PRED_FEAT_DIV;
double min_id = 0.35;
std::vector files;
string output = "output.clstr";
diff --git a/src/cluster/src/Trainer.cpp b/src/cluster/src/Trainer.cpp
index 471a048..432d624 100644
--- a/src/cluster/src/Trainer.cpp
+++ b/src/cluster/src/Trainer.cpp
@@ -596,6 +596,7 @@ void Trainer::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int m
{
if (k != 0) {
+ std::cout << "Splitting data" << endl;
uintmax_t _id = points.size();
Predictor pred(k, cutoff, PRED_MODE_CLASS, feat_type,
mut_type, min_n_feat, max_n_feat, min_id);