From 3a5eaadb012faa1b836f5a5a192c8b9bdb71925d Mon Sep 17 00:00:00 2001 From: Benjamin James Date: Thu, 15 Nov 2018 17:03:38 -0600 Subject: [PATCH] v2.1.0 --- README | 107 +++++++++++++++-------------- src/cluster/Makefile | 2 +- src/cluster/src/ClusterFactory.cpp | 2 +- src/cluster/src/Predictor.cpp | 10 +-- src/cluster/src/Progress.cpp | 9 --- src/cluster/src/Progress.h | 2 - src/cluster/src/Runner.cpp | 80 +++++++++++++++++++-- src/cluster/src/Runner.h | 2 +- src/cluster/src/Trainer.cpp | 1 + 9 files changed, 139 insertions(+), 76 deletions(-) diff --git a/README b/README index 62852c8..ade7e0d 100644 --- a/README +++ b/README @@ -8,68 +8,71 @@ CXX=g++-7 make see: https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite - Linux/Unix compilation: make -Usage: bin/meshclust2 --id 0.x [OPTIONS] *.fasta - ---id The most important parameter, --id, controls the identity cutoff of the sequences. - Needs to be between 0 and 1. - If it is not specified, an identity of 0.9 is used. - ---kmer decides the size of the kmers. It is by default automatically decided by average sequence length, - but if provided, MeShClust can speed up a little by not having to find the largest sequence length. - Increasing kmer size can increase accuracy, but increases memory consumption. - ---mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} - changes the mutation generation algorithm. By default, "single" is used, utilizing only - single point mutations. On low identity data sets, "both", which includes single mutations - and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations, - disallowing single point mutations. Other options include "all", which includes single, - block, and nontypical mutations translocation and reversion. - ---feat determines the combinations of features to be used. By default, "fast" allows 9 fast combinations - to be selected from. 
"slow" adds 2 slower features which include logarithm based features, - and "extraslow" includes 33 total features used in a previous study. ---min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs - will be used. Recall that features include pairwise combinations of the "feat" option. +If you find this tool helpful, please cite: ---max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly, - so a very large maximum is not advised. +James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278. ---sample selects the total number of sequences used for both training and testing. - 300 is the default value. Each sequence generates 10 synthetic mutants. - That is, --sample 300 provides 3000 training pairs and 3000 testing pairs. - ---min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. Shouldn't need - to be set normally, as lower identites take much longer, especially with single mutations only. - ---threads sets the number of threads to be used. By default OpenMP uses the number of available cores - on your machine, but this parameter overwrites that. - ---quiet (no arguments) removes the progress bars from output - ---output specifies the output file, in CD-HIT's CLSTR format, described below: - A '>Cluster ' followed by an increasing index designates a cluster. - Otherwise, the sequence is printed out. - A '*' at the end of a sequence designates the center of the cluster. - An example of a small data set: - - >Cluster 0 - 0 993nt, >seq128 template_6... * - >Cluster 1 - 0 1043nt, >seq235 template_10... - 1 1000nt, >seq216 template_10... * - 2 1015nt, >seq237 template_10... +Usage: bin/meshclust2 --id 0.x [OPTIONS] *.fasta ---delta decides how many clusters are looked around in the final clustering stage. - Increasing it creates more accuracy, but takes more time. 
Default value is 5. +--id The most important parameter, --id, controls the identity cutoff of the sequences. + Needs to be between 0 and 1. + If it is not specified, an identity of 0.9 is used. + +--kmer decides the size of the kmers. It is by default automatically decided by average sequence + length, but if provided, MeShClust can speed up a little by not having to find the largest + sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. + +--mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} + changes the mutation generation algorithm. By default, "single" is used, utilizing only + single point mutations. On low identity data sets, "both", which includes single mutations + and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations, + disallowing single point mutations. Other options include "all", which includes single, + block, and nontypical mutations translocation and reversion. + +--feat determines the combinations of features to be used. By default, "slow" allows 11 + combinations to be selected from. "fast" removes 2 slower features from "slow" + which include logarithm based features, and "extraslow" includes 33 total features + used in a previous study. + +--min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs + will be used. Recall that features include pairwise combinations of the "feat" option. + +--max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly, + so a very large maximum is not advised. + +--sample selects the total number of sequences used for both training and testing. + 300 is the default value. Each sequence generates 10 synthetic mutants. + That is, --sample 300 provides 3000 training pairs and 3000 testing pairs. + +--min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. 
+ Shouldn't need to be set normally, as lower identities take much longer, + especially with single mutations only. + +--threads sets the number of threads to be used. By default OpenMP uses the number of available cores + on your machine, but this parameter overwrites that. + +--output specifies the output file, in CD-HIT's CLSTR format, described below: + A '>Cluster ' followed by an increasing index designates a cluster. + Otherwise, the sequence is printed out. + A '*' at the end of a sequence designates the center of the cluster. + An example of a small data set: + >Cluster 0 + 0 993nt, >seq128 template_6... * + >Cluster 1 + 0 1043nt, >seq235 template_10... + 1 1000nt, >seq216 template_10... * + 2 1015nt, >seq237 template_10... + +--delta decides how many clusters are looked around in the final clustering stage. + Increasing it creates more accuracy, but takes more time. Default value is 5. --iterations specifies how many iterations in the final stage of merging are done until convergence. - Default value is 15. + Default value is 15. 
diff --git a/src/cluster/Makefile b/src/cluster/Makefile index 817559a..9186210 100644 --- a/src/cluster/Makefile +++ b/src/cluster/Makefile @@ -1,5 +1,5 @@ TARGET ?= meshclust2 -VERSION ?= 2.0.0 +VERSION ?= 2.1.0 CXX ?= g++ ifeq ($(debug),yes) CXXFLAGS += -ggdb -DDEBUG -fno-omit-frame-pointer -fopenmp diff --git a/src/cluster/src/ClusterFactory.cpp b/src/cluster/src/ClusterFactory.cpp index 741a325..c6ce7ae 100644 --- a/src/cluster/src/ClusterFactory.cpp +++ b/src/cluster/src/ClusterFactory.cpp @@ -332,7 +332,7 @@ void mean_shift_update(vector > &part, int j, const Trainer& trn, i cerr << "mean shift: NULL" << endl; } } else { - cout << "GOOD: EMPTY" << endl; + //cout << "GOOD: EMPTY" << endl; } delete top; delete temp; diff --git a/src/cluster/src/Predictor.cpp b/src/cluster/src/Predictor.cpp index f61c389..3f19ba9 100644 --- a/src/cluster/src/Predictor.cpp +++ b/src/cluster/src/Predictor.cpp @@ -356,16 +356,16 @@ void Predictor::train(const vector *> &points, const vector size_t counter = 0; // struct timespec start, stop; // clock_gettime(CLOCK_MONOTONIC, &start); - Progress prog(f_points_tr.size(), "Generating training data"); + Progress prog1(f_points_tr.size(), "Generating training"); #pragma omp parallel for for (size_t i = 0; i < f_points_tr.size(); i++) { auto p = f_points_tr[i]; mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); mutate_seqs(p, 5, pos_buf, neg_buf, min_id, 100 * id, _id); #pragma omp critical - prog++; + prog1++; } - prog.end(); + prog1.end(); // clock_gettime(CLOCK_MONOTONIC, &stop); // printf("took %lu\n", stop.tv_sec - start.tv_sec); @@ -389,7 +389,7 @@ void Predictor::train(const vector *> &points, const vector } pos_buf.clear(); neg_buf.clear(); - Progress prog2(f_points_test.size(), "Generating test data"); + Progress prog2(f_points_test.size(), "Generating testing"); #pragma omp parallel for for (size_t i = 0; i < f_points_test.size(); i++) { auto p = f_points_test[i]; @@ -743,7 +743,7 @@ void Predictor::train_class(Feature* 
feat) feat->finalize(); abs_best_acc = best_class_acc; used_list.push_back(best_idx); -// oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl; + oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl; oss << "Accuracy: " << best_class_acc << endl; possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end()); } diff --git a/src/cluster/src/Progress.cpp b/src/cluster/src/Progress.cpp index d5ace11..e16ef06 100644 --- a/src/cluster/src/Progress.cpp +++ b/src/cluster/src/Progress.cpp @@ -2,12 +2,6 @@ #include #include -bool Progress::is_quiet = false; - -void Progress::set_quiet(bool is_quiet_) -{ - is_quiet = is_quiet_; -} Progress::Progress(long num, std::string prefix_) { pmax = num; @@ -21,9 +15,6 @@ Progress::Progress(long num, std::string prefix_) void Progress::print() { - if (is_quiet) { - return; - } std::ostringstream oss; double prog = (double)pcur / pmax; oss << prefix << " ["; diff --git a/src/cluster/src/Progress.h b/src/cluster/src/Progress.h index f60472b..f59d948 100644 --- a/src/cluster/src/Progress.h +++ b/src/cluster/src/Progress.h @@ -10,8 +10,6 @@ class Progress { public: - static void set_quiet(bool is_quiet_=true); - static bool is_quiet; Progress(long num, std::string prefix_); ~Progress() { end(); } void end(); diff --git a/src/cluster/src/Runner.cpp b/src/cluster/src/Runner.cpp index 9a142fa..8a367d7 100644 --- a/src/cluster/src/Runner.cpp +++ b/src/cluster/src/Runner.cpp @@ -30,7 +30,11 @@ Runner::Runner(int argc, char **argv) // align = true; // } if (sample_size == 0) { - sample_size = 300; + if (similarity < 0.6) { + sample_size = 1000; + } else { + sample_size = 300; + } } srand(10); } @@ -94,8 +98,76 @@ void usage(std::string progname) #else std::cout << " without OpenMP"; #endif - std::cout << std::endl; - std::cout << "See README for detailed options" << std::endl << std::endl; + std::cout << 
std::endl << std::endl; + + std::string raw = R"(--id The most important parameter, --id, controls the identity cutoff of the sequences. + Needs to be between 0 and 1. + If it is not specified, an identity of 0.9 is used. + +--kmer decides the size of the kmers. It is by default automatically decided by average sequence + length, but if provided, MeShClust can speed up a little by not having to find the largest + sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. + +--mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} + changes the mutation generation algorithm. By default, "single" is used, utilizing only + single point mutations. On low identity data sets, "both", which includes single mutations + and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations, + disallowing single point mutations. Other options include "all", which includes single, + block, and nontypical mutations translocation and reversion. + +--feat determines the combinations of features to be used. By default, "slow" allows 11 + combinations to be selected from. "fast" removes 2 slower features from "slow" + which include logarithm based features, and "extraslow" includes 33 total features + used in a previous study. + +--min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs + will be used. Recall that features include pairwise combinations of the "feat" option. + +--max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly, + so a very large maximum is not advised. + +--sample selects the total number of sequences used for both training and testing. + 300 is the default value. Each sequence generates 10 synthetic mutants. + That is, --sample 300 provides 3000 training pairs and 3000 testing pairs. 
+ +--min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. + Shouldn't need to be set normally, as lower identites take much longer, + especially with single mutations only. + +--threads sets the number of threads to be used. By default OpenMP uses the number of available cores + on your machine, but this parameter overwrites that. + +--output specifies the output file, in CD-HIT's CLSTR format, described below: + A '>Cluster ' followed by an increasing index designates a cluster. + Otherwise, the sequence is printed out. + A '*' at the end of a sequence designates the center of the cluster. + An example of a small data set: + + >Cluster 0 + 0 993nt, >seq128 template_6... * + >Cluster 1 + 0 1043nt, >seq235 template_10... + 1 1000nt, >seq216 template_10... * + 2 1015nt, >seq237 template_10... + +--delta decides how many clusters are looked around in the final clustering stage. + Increasing it creates more accuracy, but takes more time. Default value is 5. + +--iterations specifies how many iterations in the final stage of merging are done until convergence. + Default value is 15. + + + +If the argument is not listed here, it is interpreted as an input (FASTA format) file. + + +If you find this tool helpful, please cite: + +James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278. 
+ +)"; + + std::cout << raw << endl; } @@ -222,8 +294,6 @@ void Runner::get_opts(int argc, char **argv) } i++; - } else if (arg == "-q" || arg == "--quiet") { - Progress::set_quiet(true); } else if ((arg == "-t" || arg == "--threads") && i + 1 < argc) { try { std::string opt = argv[i+1]; diff --git a/src/cluster/src/Runner.h b/src/cluster/src/Runner.h index 6e04ebf..9cc8c04 100644 --- a/src/cluster/src/Runner.h +++ b/src/cluster/src/Runner.h @@ -34,7 +34,7 @@ class Runner { int min_n_feat = 3; int max_n_feat = 5; int mut_type = HandleSeq::SINGLE; - uint64_t feat_type = PRED_FEAT_FAST; + uint64_t feat_type = PRED_FEAT_FAST | PRED_FEAT_DIV; double min_id = 0.35; std::vector files; string output = "output.clstr"; diff --git a/src/cluster/src/Trainer.cpp b/src/cluster/src/Trainer.cpp index 471a048..432d624 100644 --- a/src/cluster/src/Trainer.cpp +++ b/src/cluster/src/Trainer.cpp @@ -596,6 +596,7 @@ void Trainer::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int m { if (k != 0) { + std::cout << "Splitting data" << endl; uintmax_t _id = points.size(); Predictor pred(k, cutoff, PRED_MODE_CLASS, feat_type, mut_type, min_n_feat, max_n_feat, min_id);