Release 2.3

BioinformaticsToolsmith · Apr 24, 2019 · 55da02d · 55da02d
1 parent 3809c60
commit 55da02d
Show file tree

Hide file tree

Showing 95 changed files with 5,372 additions and 4,394 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+bin/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,111 @@
+# Build script for the MeShClust2 project; needs CMake 3.1 or newer.
+cmake_minimum_required (VERSION 3.1)
+project (MeshClust2)
+
+# Directory-wide header search path for the modules under src/.
+# src/predict is where the Predict library's sources are listed from, so it is
+# searched too; it is appended last so lookups through the earlier entries
+# resolve exactly as before.
+# NOTE(review): no source in this file lives under src/prediction -- confirm
+# whether that entry is a leftover and can be dropped.
+include_directories(src/exception src/nonltr src/utility src/cluster src/prediction src/clutil src/fastcar src/predict)
+
+# Place executables and libraries in <source root>/bin. The dedicated output
+# variables are used instead of redefining CMAKE_BINARY_DIR (CMake's own record
+# of the build tree) and the superseded EXECUTABLE_OUTPUT_PATH /
+# LIBRARY_OUTPUT_PATH variables. They must be set before the targets are
+# created, which is the case here.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)
+
+
+# Fastcar library: one translation unit. No STATIC/SHARED keyword is given,
+# so the library type follows CMake's default.
+add_library(Fastcar "${CMAKE_SOURCE_DIR}/src/fastcar/FC_Runner.cpp")
+
+# ClusterUtil library: the sources under src/clutil.
+add_library(ClusterUtil
+  "${CMAKE_SOURCE_DIR}/src/clutil/DivergencePoint.cpp"
+  "${CMAKE_SOURCE_DIR}/src/clutil/Histogram.cpp"
+  "${CMAKE_SOURCE_DIR}/src/clutil/Loader.cpp"
+  "${CMAKE_SOURCE_DIR}/src/clutil/SingleFileLoader.cpp"
+  "${CMAKE_SOURCE_DIR}/src/clutil/Progress.cpp"
+  "${CMAKE_SOURCE_DIR}/src/clutil/Datatype.cpp"
+  "${CMAKE_SOURCE_DIR}/src/clutil/Clock.cpp"
+)
+
+# Predict library: the sources under src/predict.
+add_library(Predict
+  "${CMAKE_SOURCE_DIR}/src/predict/Feature.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/GLM.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/HandleSeq.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/Matrix.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/MultiMute.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/Predictor.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/SingMute.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/FeatureSelector.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/GreedySelector.cpp"
+  "${CMAKE_SOURCE_DIR}/src/predict/BestFirstSelector.cpp"
+)
+
+# Cluster library: the sources under src/cluster, excluding meshclust2.cpp,
+# which is compiled into its own executable.
+add_library(Cluster
+  "${CMAKE_SOURCE_DIR}/src/cluster/ClusterFactory.cpp"
+  "${CMAKE_SOURCE_DIR}/src/cluster/CRunner.cpp"
+  "${CMAKE_SOURCE_DIR}/src/cluster/Trainer.cpp"
+  "${CMAKE_SOURCE_DIR}/src/cluster/bvec.cpp"
+  "${CMAKE_SOURCE_DIR}/src/cluster/bvec_iterator.cpp"
+)
+
+# Exception library: the exception classes under src/exception.
+add_library(Exception
+  "${CMAKE_SOURCE_DIR}/src/exception/FileDoesNotExistException.cpp"
+  "${CMAKE_SOURCE_DIR}/src/exception/InvalidInputException.cpp"
+  "${CMAKE_SOURCE_DIR}/src/exception/InvalidOperationException.cpp"
+  "${CMAKE_SOURCE_DIR}/src/exception/InvalidOrderOfOperationsException.cpp"
+  "${CMAKE_SOURCE_DIR}/src/exception/InvalidScoreException.cpp"
+  "${CMAKE_SOURCE_DIR}/src/exception/InvalidStateException.cpp"
+)
+
+# Nonltr library: the sources under src/nonltr, excluding RepeatsDetector.cpp,
+# which is compiled into the Red executable.
+add_library(Nonltr
+  "${CMAKE_SOURCE_DIR}/src/nonltr/ChromDetectorMaxima.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/ChromListMaker.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/Chromosome.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigit.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitDna.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitProtein.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeRandom.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/DetectorMaxima.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/HMM.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/LocationList.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/LocationListCollection.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/Scanner.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/Scorer.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.cpp"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/Trainer.cpp"
+)
+
+# Utility library: the sources under src/utility.
+add_library(Utility
+  "${CMAKE_SOURCE_DIR}/src/utility/EmptyLocation.cpp"
+  "${CMAKE_SOURCE_DIR}/src/utility/GlobAlignE.cpp"
+  "${CMAKE_SOURCE_DIR}/src/utility/Location.cpp"
+  "${CMAKE_SOURCE_DIR}/src/utility/Util.cpp"
+)
+
+# Each library publishes the directory holding this CMakeLists.txt as a PUBLIC
+# include directory, so it applies to the library and to whatever links it.
+foreach (lib_target Exception Nonltr Utility Cluster Fastcar ClusterUtil Predict)
+  target_include_directories(${lib_target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
+endforeach ()
+
+# HEADER_FILES: list of three headers located under src/nonltr.
+set (HEADER_FILES
+  "${CMAKE_SOURCE_DIR}/src/nonltr/KmerHashTable.h"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/EnrichmentMarkovView.h"
+  "${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.h"
+)
+
+# Compiler selection.
+# NOTE(review): this assignment comes after project(), i.e. after CMake has
+# already probed a compiler. It is kept to preserve current behavior; consider
+# passing -DCMAKE_CXX_COMPILER=g++ on the cmake command line instead.
+set (CMAKE_CXX_COMPILER g++)
+
+# CMAKE_CXX_STANDARD only seeds the CXX_STANDARD property of targets created
+# after this line (the executables). The libraries were declared earlier, so
+# the property is set on them explicitly to build everything as C++11.
+set (CMAKE_CXX_STANDARD 11)
+set_target_properties(Fastcar ClusterUtil Predict Cluster Exception Nonltr Utility
+  PROPERTIES CXX_STANDARD 11)
+
+# Append the project's flags to whatever the user or environment supplied
+# instead of discarding it. -march=native ties the binaries to the build
+# host's CPU, so it can be switched off with -DMESHCLUST_NATIVE_ARCH=OFF;
+# the default (ON) keeps the previous flag set.
+option (MESHCLUST_NATIVE_ARCH "Tune the generated code for the build host (-march=native)" ON)
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -g -O3")
+if (MESHCLUST_NATIVE_ARCH)
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+endif ()
+
+# Version strings exposed to the code as the VERSION preprocessor macro (a
+# string literal). PRIVATE limits each definition to that library's sources.
+target_compile_definitions(Cluster PRIVATE "VERSION=\"2.3.0\"")
+target_compile_definitions(Fastcar PRIVATE "VERSION=\"0.7.1\"")
+
+# Command-line programs, each built from a single source file.
+add_executable(Red "${CMAKE_SOURCE_DIR}/src/nonltr/RepeatsDetector.cpp")
+add_executable(meshclust2 "${CMAKE_SOURCE_DIR}/src/cluster/meshclust2.cpp")
+add_executable(fastcar "${CMAKE_SOURCE_DIR}/src/fastcar/fastcar.cpp")
+
+# Link dependencies between the project's targets.
+# Only targets are listed: header files are not link inputs, and CMake would
+# forward their full paths to the link command line.
+# Libraries use PUBLIC so their own dependencies keep propagating to whatever
+# links them, matching the keyword-less signature used previously.
+# Executables use PRIVATE because nothing links against them.
+target_link_libraries(Red PRIVATE Exception Nonltr Utility)
+target_link_libraries(Utility PUBLIC Exception)
+target_link_libraries(Nonltr PUBLIC Utility Exception)
+target_link_libraries(ClusterUtil PUBLIC Nonltr)
+target_link_libraries(Predict PUBLIC ClusterUtil Nonltr)
+target_link_libraries(meshclust2 PRIVATE Cluster Nonltr ClusterUtil Predict)
+target_link_libraries(fastcar PRIVATE Nonltr ClusterUtil Fastcar Predict)
diff --git a/Makefile b/Makefile
diff --git a/README b/README
diff --git a/README.md b/README.md
@@ -0,0 +1,113 @@
+## MeShClust2
+Release version - 2.3.0
+
+### Requirements
+g++ 4.9.1 or later. On Mac OS X, g++ must be installed through Homebrew.
+For compiling with Homebrew's g++ and CMake on Mac OS X, see [this link](https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite).
+
+### Linux/Unix compilation
+```
+mkdir bin && cd bin
+cmake ..
+make
+```
+
+### Citation
+If you find this tool helpful, please cite:
+
+[James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.](https://doi.org/10.1101/451278)
+
+### Usage
+
+  Usage: meshclust2 --id 0.x [OPTIONS] *.fasta
+
+  --id          The most important parameter, --id, controls the identity cutoff of the sequences.
+                Needs to be between 0 and 1.
+                If it is not specified, an identity of 0.9 is used.
+
+  --kmer        decides the size of the kmers. It is by default automatically decided by average sequence
+                length, but if provided, MeShClust can speed up a little by not having to find the largest
+                sequence length. Increasing kmer size can increase accuracy, but increases memory consumption.
+
+  --dump       Run until the classifier is trained, and then dump the weights to the file,
+               default 'weights.txt'. Can be used with --recover to recover the weights
+               instead of re-training.
+
+  --recover    Recover weights for the classifier trained by a previous run which used --dump to dump
+               the weights.
+
+  --list       Instead of specifying files as extra arguments, provide a text file with
+               a list of files. Can use pipes or process substitutions such as "--list <(ls *.fasta) "
+
+  --no-train-list    Same as --list, but these files are not passed to the classifier,
+                     e.g. unassembled genomes
+
+  --mut-type   {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation}
+               changes the mutation generation algorithm. By default, "both" is used, utilizing
+               single point and block mutations. On higher identity data sets, "single", which includes only single point mutations,
+               is preferable. The option "nonsingle-typical" uses only block mutations,
+               disallowing single point mutations. Other options include "all", which includes single,
+               block, and nontypical mutations (translocation and reversion).
+
+  --feat       determines the combinations of features to be used. By default, "slow" allows 11
+               combinations to be selected from. "fast" removes 2 slower features from "slow"
+               which include logarithm based features.
+
+  --single-file  With this option (no value is needed), each file is treated as a single sequence.
+                 If multiple sequences in a file are encountered, they are joined with 50 Ns,
+                 and the k-mers are not counted in that region.
+                 However, to be most accurate, it is advised to not use these sequences in the
+                 training step (for mutations) and instead 1) train using un-joined sequences and
+                 use --dump to dump to a file, and 2) use --recover with --single-file for the
+                 file list.
+
+  --sample     selects the total number of sequences used for both training and testing.
+               2000 is the default value. That is, --sample 2000 provides 2000 training
+               pairs and 2000 testing pairs.
+
+  --num-templates   selects the number of "template" sequences from which to mutate.
+               For example, if 300 (the default) templates are requested, and the number of
+               "samples" is requested to be 2000 (the default), 300 sequences will be read in
+               and mutated 2000/300 times each to create 2000 semi-synthetic pairs.
+
+  --min-feat   (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs
+               will be used. Recall that features include pairwise combinations of the "feat" option.
+
+  --max-feat   (default 4) sets the maximum feature pairs to be used. Diminishing returns appear quickly,
+               so a very large maximum (>10) is not advised.
+
+  --min-id     (default 0.35) sets the lower bound for mutation identity scores to be calculated.
+               Shouldn't normally need to be set, as lower identities take much longer,
+               especially with single mutations only.
+
+  --datatype   (8,16,32,64) Decides the integer size of the histograms. If not provided,
+               all sequences are read in and counted to ensure the largest k-mer does not
+               overflow. If the provided integer size is too small, the counts will overflow.
+
+  --threads    sets the number of threads to be used. By default OpenMP uses the number of available cores
+               on your machine, but this parameter overrides that.
+
+  --output     specifies the output file, in CD-HIT's CLSTR format, described below:
+               A '>Cluster ' followed by an increasing index designates a cluster.
+               Otherwise, the sequence is printed out.
+               A '*' at the end of a sequence designates the center of the cluster.
+               An example of a small data set:
+
+               >Cluster 0
+               0       993nt, >seq128 template_6... *
+               >Cluster 1
+               0       1043nt, >seq235 template_10...
+               1       1000nt, >seq216 template_10... *
+               2       1015nt, >seq237 template_10...
+
+  --delta      decides how many clusters are looked around in the final clustering stage.
+               Increasing it improves accuracy but takes more time. The default value is 5.
+
+  --iterations specifies how many iterations in the final stage of merging are done until convergence.
+               Default value is 15.
+
+  If the argument is not listed here, it is interpreted as an input (FASTA format) file.
+
+
+### License
+
+Academic use: The software is provided as-is under the GNU GPLv3.
+Any restrictions to use for-profit or non-academics: License needed.