Skip to content

Commit

Permalink
Release 2.3
Browse files Browse the repository at this point in the history
  • Loading branch information
benjamin-james committed Apr 24, 2019
1 parent 3809c60 commit 55da02d
Show file tree
Hide file tree
Showing 95 changed files with 5,372 additions and 4,394 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
bin/
111 changes: 111 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Top-level build script for MeshClust2: project declaration, shared include
# path, language standard and output location for every target defined below.
cmake_minimum_required (VERSION 3.1)
project (MeshClust2)

# CMAKE_CXX_STANDARD only initializes the CXX_STANDARD property of targets
# created *after* it is set, so it must be defined before the first
# add_library()/add_executable() call for the libraries to be built as C++11.
set (CMAKE_CXX_STANDARD 11)

# Header search path shared by every target in this directory.
# The predictor sources live in src/predict (see the Predict library sources),
# not src/prediction.
include_directories(src/exception src/nonltr src/utility src/cluster src/predict src/clutil src/fastcar)

# Place executables and libraries in <source>/bin. CMAKE_BINARY_DIR is left
# alone because CMake itself uses it as the top of the build tree; the
# CMAKE_*_OUTPUT_DIRECTORY variables replace the deprecated
# EXECUTABLE_OUTPUT_PATH / LIBRARY_OUTPUT_PATH.
set(MESHCLUST_OUTPUT_DIR ${CMAKE_SOURCE_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${MESHCLUST_OUTPUT_DIR})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${MESHCLUST_OUTPUT_DIR})
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${MESHCLUST_OUTPUT_DIR})


# Fastcar library (linked into the fastcar executable).
# Sources are anchored at PROJECT_SOURCE_DIR so they still resolve when this
# project is pulled into a larger build with add_subdirectory().
add_library(Fastcar
  ${PROJECT_SOURCE_DIR}/src/fastcar/FC_Runner.cpp
)

# ClusterUtil library, built from the sources in src/clutil.
# Sources are anchored at PROJECT_SOURCE_DIR so they still resolve when this
# project is pulled into a larger build with add_subdirectory().
add_library(ClusterUtil
  ${PROJECT_SOURCE_DIR}/src/clutil/DivergencePoint.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Histogram.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Loader.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/SingleFileLoader.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Progress.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Datatype.cpp
  ${PROJECT_SOURCE_DIR}/src/clutil/Clock.cpp
)

# Predict library, built from the sources in src/predict.
# Sources are anchored at PROJECT_SOURCE_DIR so they still resolve when this
# project is pulled into a larger build with add_subdirectory().
add_library(Predict
  ${PROJECT_SOURCE_DIR}/src/predict/Feature.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/GLM.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/HandleSeq.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/Matrix.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/MultiMute.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/Predictor.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/SingMute.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/FeatureSelector.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/GreedySelector.cpp
  ${PROJECT_SOURCE_DIR}/src/predict/BestFirstSelector.cpp
)

# Cluster library (linked into the meshclust2 executable), built from the
# sources in src/cluster.
# Sources are anchored at PROJECT_SOURCE_DIR so they still resolve when this
# project is pulled into a larger build with add_subdirectory().
add_library(Cluster
  ${PROJECT_SOURCE_DIR}/src/cluster/ClusterFactory.cpp
  ${PROJECT_SOURCE_DIR}/src/cluster/CRunner.cpp
  ${PROJECT_SOURCE_DIR}/src/cluster/Trainer.cpp
  ${PROJECT_SOURCE_DIR}/src/cluster/bvec.cpp
  ${PROJECT_SOURCE_DIR}/src/cluster/bvec_iterator.cpp
)

# Exception library: the exception classes under src/exception.
# Sources are anchored at PROJECT_SOURCE_DIR so they still resolve when this
# project is pulled into a larger build with add_subdirectory().
add_library(Exception
  ${PROJECT_SOURCE_DIR}/src/exception/FileDoesNotExistException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidInputException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidOperationException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidOrderOfOperationsException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidScoreException.cpp
  ${PROJECT_SOURCE_DIR}/src/exception/InvalidStateException.cpp
)

# Nonltr library, built from the sources in src/nonltr. It is linked into all
# three executables (Red, meshclust2 and fastcar).
# Sources are anchored at PROJECT_SOURCE_DIR so they still resolve when this
# project is pulled into a larger build with add_subdirectory().
add_library(Nonltr
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromDetectorMaxima.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromListMaker.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/Chromosome.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromosomeOneDigit.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitDna.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitProtein.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/ChromosomeRandom.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/DetectorMaxima.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/HMM.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/LocationList.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/LocationListCollection.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/Scanner.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/Scorer.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/TableBuilder.cpp
  ${PROJECT_SOURCE_DIR}/src/nonltr/Trainer.cpp
)

# Utility library, built from the sources in src/utility.
# Sources are anchored at PROJECT_SOURCE_DIR so they still resolve when this
# project is pulled into a larger build with add_subdirectory().
add_library(Utility
  ${PROJECT_SOURCE_DIR}/src/utility/EmptyLocation.cpp
  ${PROJECT_SOURCE_DIR}/src/utility/GlobAlignE.cpp
  ${PROJECT_SOURCE_DIR}/src/utility/Location.cpp
  ${PROJECT_SOURCE_DIR}/src/utility/Util.cpp
)

# Publish the directory holding this CMakeLists.txt as a PUBLIC include
# directory of every library: it is used when compiling the library itself and
# is propagated to any target that links against it.
foreach(lib_target Exception Nonltr Utility Cluster Fastcar ClusterUtil Predict)
  target_include_directories(${lib_target} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()

# HEADER_FILES: list of three headers from src/nonltr, given as paths under
# the top-level source directory.
set (HEADER_FILES)
foreach (nonltr_header KmerHashTable.h EnrichmentMarkovView.h TableBuilder.h)
  list (APPEND HEADER_FILES ${CMAKE_SOURCE_DIR}/src/nonltr/${nonltr_header})
endforeach ()

# Compiler selection, language standard and global compile flags.
#
# NOTE(review): CMAKE_CXX_COMPILER is overridden after project() has already
# detected a compiler. The supported way to choose g++ is
# `cmake -DCMAKE_CXX_COMPILER=g++ ..` (or the CXX environment variable) on the
# first configure; kept here to preserve the existing behavior.
set (CMAKE_CXX_COMPILER g++)

# Only targets created after this point pick up CMAKE_CXX_STANDARD.
# STANDARD_REQUIRED makes configuration fail instead of silently falling back
# to an older standard when the compiler cannot provide C++11.
set (CMAKE_CXX_STANDARD 11)
set (CMAKE_CXX_STANDARD_REQUIRED ON)

# -march=native produces binaries tuned for the build machine only; allow it to
# be switched off (e.g. when building for distribution). Default keeps the
# original behavior.
option (MESHCLUST_NATIVE_ARCH "Optimize for the host CPU with -march=native" ON)

# Append to, rather than replace, any flags supplied by the user
# (-DCMAKE_CXX_FLAGS=... or the CXXFLAGS environment variable).
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -g -O3")
if (MESHCLUST_NATIVE_ARCH)
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
endif ()

# Define the VERSION macro as a string literal for the two libraries that use
# it. Entries in a target's COMPILE_DEFINITIONS property apply only when
# compiling that target's own sources (the PRIVATE scope).
set_property(TARGET Cluster APPEND PROPERTY COMPILE_DEFINITIONS VERSION="2.3.0")
set_property(TARGET Fastcar APPEND PROPERTY COMPILE_DEFINITIONS VERSION="0.7.1")

# Command-line programs, each built from a single source file.
# Sources are anchored at PROJECT_SOURCE_DIR so they still resolve when this
# project is pulled into a larger build with add_subdirectory().
add_executable(Red ${PROJECT_SOURCE_DIR}/src/nonltr/RepeatsDetector.cpp)
add_executable(meshclust2 ${PROJECT_SOURCE_DIR}/src/cluster/meshclust2.cpp)
add_executable(fastcar ${PROJECT_SOURCE_DIR}/src/fastcar/fastcar.cpp)

# Link dependencies.
#
# Header files are not link items: target_link_libraries() treats a full path
# as a library file to hand to the linker, so the .h paths that used to be
# listed here have been dropped. Header changes are still tracked by CMake's
# normal dependency scanning of the sources that include them.
#
# Library-to-library edges are PUBLIC so they remain transitive, exactly as
# with the keyword-less signature used previously.
target_link_libraries(Utility PUBLIC Exception)
target_link_libraries(Nonltr PUBLIC Utility Exception)
target_link_libraries(ClusterUtil PUBLIC Nonltr)
target_link_libraries(Predict PUBLIC ClusterUtil Nonltr)

# Executables are leaf targets, so their dependencies are PRIVATE.
target_link_libraries(Red PRIVATE Exception Nonltr Utility)
target_link_libraries(meshclust2 PRIVATE Cluster Nonltr ClusterUtil Predict)
target_link_libraries(fastcar PRIVATE Nonltr ClusterUtil Fastcar Predict)
19 changes: 0 additions & 19 deletions Makefile

This file was deleted.

85 changes: 0 additions & 85 deletions README

This file was deleted.

113 changes: 113 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
## MeShClust2
Release version - 2.3.0

### Requirements
g++ 4.9.1 or later, requires Homebrew on Mac OS X
For compilation using g++ (Homebrew) and CMake on Mac OS X, see [this link](https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite).

### Linux/Unix compilation
```
mkdir bin && cd bin
cmake ..
make
```

### Citation
If you find this tool helpful, please cite:

[James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.](https://doi.org/10.1101/451278)

### Usage

Usage: meshclust2 --id 0.x [OPTIONS] *.fasta

--id The most important parameter, --id, controls the identity cutoff of the sequences.
Needs to be between 0 and 1.
If it is not specified, an identity of 0.9 is used.

--kmer decides the size of the kmers. It is by default automatically decided by average sequence
length, but if provided, MeShClust can speed up a little by not having to find the largest
sequence length. Increasing kmer size can increase accuracy, but increases memory consumption.

--dump Run until the classifier is trained, and then dump the weights to the file,
default 'weights.txt'. Can be used with --recover to recover the weights
instead of re-training.

--recover Recover weights for the classifier trained by a previous run which used --dump to dump
the weights.

--list Instead of specifying files as extra arguments, provide a text file with
a list of files. Can use pipes or process substitutions such as "--list <(ls *.fasta) "

--no-train-list Same as --list, but these files are not passed to the classifier,
e.g. unassembled genomes

--mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation}
changes the mutation generation algorithm. By default, "both" is used, utilizing
single point and block mutations. On higher identity data sets, "single", which includes only single point mutations,
is preferable. The option "nonsingle-typical" uses only block mutations,
disallowing single point mutations. Other options include "all", which includes single,
block, and the nontypical mutations (translocation and reversion).

--feat determines the combinations of features to be used. By default, "slow" allows 11
combinations to be selected from. "fast" removes 2 slower features from "slow"
which include logarithm based features.

--single-file Using this option, (no value is needed), each file is treated as a single sequence.
If multiple sequences in a file are encountered, they are joined with 50 Ns,
and the k-mers are not counted in that region.
However, to be most accurate, it is advised to not use these sequences in the
training step (for mutations) and instead 1) train using un-joined sequences and
use --dump to dump to a file, and 2) use --recover with --single-file for the
file list.

--sample selects the total number of sequences used for both training and testing.
2000 is the default value. That is, --sample 2000 provides 2000 training
pairs and 2000 testing pairs.

--num-templates selects the number of "template" sequences from which to mutate.
For example, if 300 (the default) templates are requested, and the number of
"samples" is requested to be 2000 (the default), 300 sequences will be read in
and mutated 2000/300 times each to create 2000 semi-synthetic pairs.

--min-feat (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs
will be used. Recall that features include pairwise combinations of the "feat" option.

--max-feat (default 4) sets the maximum feature pairs to be used. Diminishing returns appear quickly,
so a very large maximum (>10) is not advised.

--min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated.
Shouldn't need to be set normally, as lower identities take much longer,
especially with single mutations only.

--datatype (8,16,32,64) Decides the integer size of the histograms. If not provided,
all sequences are read in and counted to ensure the largest k-mer does not
overflow. If the provided integer size is too small, the counts will overflow.

--threads sets the number of threads to be used. By default OpenMP uses the number of available cores
on your machine, but this parameter overrides that.

--output specifies the output file, in CD-HIT's CLSTR format, described below:
A '>Cluster ' followed by an increasing index designates a cluster.
Otherwise, the sequence is printed out.
A '*' at the end of a sequence designates the center of the cluster.
An example of a small data set:

>Cluster 0
0 993nt, >seq128 template_6... *
>Cluster 1
0 1043nt, >seq235 template_10...
1 1000nt, >seq216 template_10... *
2 1015nt, >seq237 template_10...

--delta decides how many clusters are looked around in the final clustering stage.
Increasing it improves accuracy, but takes more time. Default value is 5.

--iterations specifies how many iterations in the final stage of merging are done until convergence.
Default value is 15.

If the argument is not listed here, it is interpreted as an input (FASTA format) file.


### License

Academic use: The software is provided as-is under the GNU GPLv3.
Any restrictions to use for-profit or non-academics: License needed.
Loading

0 comments on commit 55da02d

Please sign in to comment.