-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3809c60
commit 55da02d
Showing
95 changed files
with
5,372 additions
and
4,394 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
bin/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
cmake_minimum_required(VERSION 3.1)
project(MeshClust2)

# Directory-wide header search paths, relative to this CMakeLists.txt.
# The prediction sources of this project live in src/predict (see the Predict
# library), so that is the directory that must be listed here; the former
# entry "src/prediction" does not match any source directory used by this file.
include_directories(
  src/exception
  src/nonltr
  src/utility
  src/cluster
  src/predict
  src/clutil
  src/fastcar
)

# Send executables and libraries to <source>/bin.
# NOTE(review): CMAKE_BINARY_DIR is normally managed by CMake itself; it is
# overridden here only to feed the two legacy output-path variables below.
# Confirm nothing else relies on the real build-tree location.
set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
|
||
|
||
# Fastcar library, built from the FC_Runner source under src/fastcar.
# PROJECT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the path valid when
# this project is consumed through add_subdirectory() from a parent build.
add_library(Fastcar
  ${PROJECT_SOURCE_DIR}/src/fastcar/FC_Runner.cpp
)
|
||
# ClusterUtil library, built from the sources under src/clutil.
# PROJECT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the paths valid when
# this project is consumed through add_subdirectory() from a parent build.
add_library(ClusterUtil
  ${PROJECT_SOURCE_DIR}/src/clutil/DivergencePoint.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Histogram.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Loader.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/SingleFileLoader.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Progress.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Datatype.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Clock.cpp
)
|
||
# Predict library, built from the sources under src/predict.
# PROJECT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the paths valid when
# this project is consumed through add_subdirectory() from a parent build.
add_library(Predict
  ${PROJECT_SOURCE_DIR}/src/predict/Feature.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/GLM.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/HandleSeq.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/Matrix.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/MultiMute.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/Predictor.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/SingMute.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/FeatureSelector.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/GreedySelector.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/BestFirstSelector.cpp
)
|
||
# Cluster library, built from the sources under src/cluster.
# PROJECT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the paths valid when
# this project is consumed through add_subdirectory() from a parent build.
add_library(Cluster
  ${PROJECT_SOURCE_DIR}/src/cluster/ClusterFactory.cpp
  ${PROJECT_SOURCE_DIR}/src/cluster/CRunner.cpp
  ${PROJECT_SOURCE_DIR}/src/cluster/Trainer.cpp
  ${PROJECT_SOURCE_DIR}/src/cluster/bvec.cpp
  ${PROJECT_SOURCE_DIR}/src/cluster/bvec_iterator.cpp
)
|
||
# Exception library, built from the exception classes under src/exception.
# PROJECT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the paths valid when
# this project is consumed through add_subdirectory() from a parent build.
add_library(Exception
  ${PROJECT_SOURCE_DIR}/src/exception/FileDoesNotExistException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidInputException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidOperationException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidOrderOfOperationsException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidScoreException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidStateException.cpp
)
|
||
# Nonltr library, built from the sources under src/nonltr.
# PROJECT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the paths valid when
# this project is consumed through add_subdirectory() from a parent build.
add_library(Nonltr
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromDetectorMaxima.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromListMaker.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/Chromosome.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromosomeOneDigit.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitDna.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitProtein.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromosomeRandom.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/DetectorMaxima.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/HMM.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/LocationList.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/LocationListCollection.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/Scanner.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/Scorer.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/TableBuilder.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/Trainer.cpp
)
|
||
# Utility library, built from the sources under src/utility.
# PROJECT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the paths valid when
# this project is consumed through add_subdirectory() from a parent build.
add_library(Utility
  ${PROJECT_SOURCE_DIR}/src/utility/EmptyLocation.cpp
  ${PROJECT_SOURCE_DIR}/src/utility/GlobAlignE.cpp
  ${PROJECT_SOURCE_DIR}/src/utility/Location.cpp
  ${PROJECT_SOURCE_DIR}/src/utility/Util.cpp
)
|
||
# Add the directory containing this CMakeLists.txt as a PUBLIC include
# directory on every library target, so both the library itself and anything
# that links against it get that directory on the include path.
foreach(lib_target Exception Nonltr Utility Cluster Fastcar ClusterUtil Predict)
  target_include_directories(${lib_target} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
|
||
# Headers from src/nonltr collected into the HEADER_FILES list variable.
set(HEADER_FILES
  "${CMAKE_SOURCE_DIR}/src/nonltr/KmerHashTable.h"
  "${CMAKE_SOURCE_DIR}/src/nonltr/EnrichmentMarkovView.h"
  "${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.h"
)
|
||
# Compiler selection.
# NOTE(review): assigning CMAKE_CXX_COMPILER after project() is discouraged by
# CMake; the usual way is -DCMAKE_CXX_COMPILER=... or the CXX environment
# variable on the first configure. Kept as-is so existing builds do not change.
set(CMAKE_CXX_COMPILER g++)

# C++11 for targets created after this point (CMAKE_CXX_STANDARD only
# initializes the CXX_STANDARD property when a target is created) ...
set(CMAKE_CXX_STANDARD 11)
# ... and explicitly for the libraries, which were all created earlier in this
# file and would otherwise be compiled with the compiler's default standard.
set_target_properties(
  Fastcar ClusterUtil Predict Cluster Exception Nonltr Utility
  PROPERTIES CXX_STANDARD 11
)

# Append the project flags instead of overwriting, so flags supplied by the
# user (-DCMAKE_CXX_FLAGS=... or the CXXFLAGS environment variable) are kept.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -g -O3 -march=native")
|
||
# Per-library VERSION preprocessor definition. PRIVATE: the macro is defined
# only while compiling the library's own sources, not for targets linking it.
target_compile_definitions(Cluster
  PRIVATE
    VERSION="2.3.0"
)
target_compile_definitions(Fastcar
  PRIVATE
    VERSION="0.7.1"
)
|
||
# Command-line programs, one entry-point source file each.
# PROJECT_SOURCE_DIR (instead of CMAKE_SOURCE_DIR) keeps the paths valid when
# this project is consumed through add_subdirectory() from a parent build.
add_executable(Red ${PROJECT_SOURCE_DIR}/src/nonltr/RepeatsDetector.cpp)
add_executable(meshclust2 ${PROJECT_SOURCE_DIR}/src/cluster/meshclust2.cpp)
add_executable(fastcar ${PROJECT_SOURCE_DIR}/src/fastcar/fastcar.cpp)
|
||
# Link dependencies between the targets.
# - Only library targets are listed: header files are not link inputs, so the
#   header paths formerly appended to every call have been dropped.
# - Explicit keywords replace the legacy keyword-less signature. Libraries use
#   PUBLIC so their dependencies keep propagating to whatever links them (the
#   same transitive behaviour as before); executables use PRIVATE because
#   nothing links against an executable.
target_link_libraries(Red PRIVATE Exception Nonltr Utility)
target_link_libraries(Utility PUBLIC Exception)
target_link_libraries(Nonltr PUBLIC Utility Exception)
target_link_libraries(ClusterUtil PUBLIC Nonltr)
target_link_libraries(Predict PUBLIC ClusterUtil Nonltr)
target_link_libraries(meshclust2 PRIVATE Cluster Nonltr ClusterUtil Predict)
target_link_libraries(fastcar PRIVATE Nonltr ClusterUtil Fastcar Predict)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
## MeShClust2 | ||
Release version - 2.3.0 | ||
|
||
### Requirements | ||
g++ 4.9.1 or later, requires Homebrew on Mac OS X | ||
For compilation using g++ (Homebrew) and CMake on Mac OS X, see [this link](https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite). | ||
|
||
### Linux/Unix compilation | ||
> mkdir bin && cd bin | ||
> cmake .. | ||
> make | ||
### Citation | ||
If you find this tool helpful, please cite: | ||
|
||
[James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.](https://doi.org/10.1101/451278) | ||
|
||
### Usage | ||
|
||
Usage: meshclust2 --id 0.x [OPTIONS] *.fasta | ||
|
||
--id The most important parameter, --id, controls the identity cutoff of the sequences. | ||
Needs to be between 0 and 1. | ||
If it is not specified, an identity of 0.9 is used. | ||
|
||
--kmer decides the size of the kmers. It is by default automatically decided by average sequence | ||
length, but if provided, MeShClust can speed up a little by not having to find the largest | ||
sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. | ||
|
||
--dump Run until the classifier is trained, and then dump the weights to the file, | ||
default 'weights.txt'. Can be used with --recover to recover the weights | ||
instead of re-training. | ||
|
||
--recover Recover weights for the classifier trained by a previous run which used --dump to dump | ||
the weights. | ||
|
||
--list Instead of specifying files as extra arguments, provide a text file with | ||
a list of files. Pipes or process substitutions can be used, such as "--list <(ls *.fasta)". | ||
|
||
--no-train-list Same as --list, but these files are not passed to the classifier, | ||
e.g. unassembled genomes | ||
|
||
--mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} | ||
changes the mutation generation algorithm. By default, "both" is used, utilizing | ||
single point and block mutations. On higher identity data sets, "single", which includes only single point mutations, | ||
is preferable. The option "nonsingle-typical" uses only block mutations, | ||
disallowing single point mutations. Other options include "all", which includes single, | ||
block, and the nontypical mutations (translocation and reversion). | ||
|
||
--feat determines the combinations of features to be used. By default, "slow" allows 11 | ||
combinations to be selected from. "fast" removes 2 slower features from "slow" | ||
which include logarithm based features. | ||
|
||
--single-file Using this option, (no value is needed), each file is treated as a single sequence. | ||
If multiple sequences in a file are encountered, they are joined with 50 Ns, | ||
and the k-mers are not counted in that region. | ||
However, to be most accurate, it is advised to not use these sequences in the | ||
training step (for mutations) and instead 1) train using un-joined sequences and | ||
use --dump to dump to a file, and 2) use --recover with --single-file for the | ||
file list. | ||
|
||
--sample selects the total number of sequences used for both training and testing. | ||
2000 is the default value. That is, --sample 2000 provides 2000 training | ||
pairs and 2000 testing pairs. | ||
|
||
--num-templates selects the number of "template" sequences from which to mutate. | ||
For example, if 300 (the default) templates are requested, and the number of | ||
"samples" is requested to be 2000 (the default), 300 sequences will be read in | ||
and mutated 2000/300 times each to create 2000 semi-synthetic pairs. | ||
|
||
--min-feat (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs | ||
will be used. Recall that features include pairwise combinations of the "feat" option. | ||
|
||
--max-feat (default 4) sets the maximum feature pairs to be used. Diminishing returns appear quickly, | ||
so a very large maximum (>10) is not advised. | ||
|
||
--min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. | ||
Shouldn't need to be set normally, as lower identities take much longer, | ||
especially with single mutations only. | ||
|
||
--datatype (8,16,32,64) Decides the integer size of the histograms. If not provided, | ||
all sequences are read in and counted to ensure the largest k-mer does not | ||
overflow. If the provided integer size is too small, the k-mer counts will overflow. | ||
|
||
--threads sets the number of threads to be used. By default OpenMP uses the number of available cores | ||
on your machine, but this parameter overrides that. | ||
|
||
--output specifies the output file, in CD-HIT's CLSTR format, described below: | ||
A '>Cluster ' followed by an increasing index designates a cluster. | ||
Otherwise, the sequence is printed out. | ||
A '*' at the end of a sequence designates the center of the cluster. | ||
An example of a small data set: | ||
|
||
>Cluster 0 | ||
0 993nt, >seq128 template_6... * | ||
>Cluster 1 | ||
0 1043nt, >seq235 template_10... | ||
1 1000nt, >seq216 template_10... * | ||
2 1015nt, >seq237 template_10... | ||
|
||
--delta decides how many clusters are looked around in the final clustering stage. | ||
Increasing it improves accuracy, but takes more time. Default value is 5. | ||
|
||
--iterations specifies how many iterations in the final stage of merging are done until convergence. | ||
Default value is 15. | ||
|
||
If the argument is not listed here, it is interpreted as an input (FASTA format) file. | ||
|
||
|
||
### License | ||
|
||
Academic use: The software is provided as-is under the GNU GPLv3. | ||
Any restrictions to use for-profit or non-academics: License needed. |
Oops, something went wrong.