From 55da02d0f31a74cb9aff754c01db9edc2e0399bb Mon Sep 17 00:00:00 2001 From: Benjamin James Date: Wed, 24 Apr 2019 09:32:58 -0500 Subject: [PATCH] Release 2.3 --- .gitignore | 1 + CMakeLists.txt | 111 ++ Makefile | 19 - README | 85 -- README.md | 113 ++ src/Makefile | 175 --- src/cluster/{src/Runner.cpp => CRunner.cpp} | 376 +++++-- src/cluster/{src/Runner.h => CRunner.h} | 33 +- src/cluster/{src => }/Center.h | 2 +- src/cluster/{src => }/ClusterFactory.cpp | 251 +---- src/cluster/{src => }/ClusterFactory.h | 7 +- src/cluster/Makefile | 31 - src/cluster/Trainer.cpp | 198 ++++ src/cluster/Trainer.h | 46 + src/cluster/{src => }/bvec.cpp | 85 +- src/cluster/{src => }/bvec.h | 3 +- src/cluster/{src => }/bvec_iterator.cpp | 6 + src/cluster/{src => }/bvec_iterator.h | 0 src/cluster/{src/main.cpp => meshclust2.cpp} | 2 +- src/cluster/src/Loader.cpp | 111 -- src/cluster/src/LogTable.cpp | 41 - src/cluster/src/LogTable.h | 20 - src/cluster/src/Mat.h | 73 -- src/cluster/src/NearestNeighbor.h | 52 - src/cluster/src/Predictor.cpp | 841 --------------- src/cluster/src/Progress.cpp | 65 -- src/cluster/src/Random.h | 22 - src/cluster/src/SingMute.cpp | 116 -- src/cluster/src/SingleFeature.cpp | 50 - src/cluster/src/SingleFeature.h | 26 - src/cluster/src/SingleMute.cpp | 221 ---- src/cluster/src/SingleMute.h | 89 -- src/cluster/src/Trainer.cpp | 930 ---------------- src/cluster/src/Trainer.h | 67 -- src/cluster/src/needleman_wunsch.cpp | 153 --- src/cluster/src/needleman_wunsch.h | 43 - src/clutil/Clock.cpp | 19 + src/clutil/Clock.h | 16 + src/clutil/Datatype.cpp | 19 + src/clutil/Datatype.h | 17 + .../src => clutil}/DivergencePoint.cpp | 4 +- src/{cluster/src => clutil}/DivergencePoint.h | 1 + src/{cluster/src => clutil}/Histogram.cpp | 0 src/{cluster/src => clutil}/Histogram.h | 0 src/clutil/LCG.h | 51 + src/clutil/Loader.cpp | 223 ++++ src/{cluster/src => clutil}/Loader.h | 24 +- src/{cluster/src => clutil}/Point.h | 39 +- src/clutil/Progress.cpp | 79 ++ src/{cluster/src => 
clutil}/Progress.h | 8 +- src/clutil/Random.h | 61 ++ .../src => clutil}/SingleFileLoader.cpp | 39 + .../src => clutil}/SingleFileLoader.h | 3 +- src/fastcar/FC_Runner.cpp | 635 +++++++++++ src/fastcar/FC_Runner.h | 53 + src/fastcar/fastcar.cpp | 12 + src/nonltr/ChromListMaker.cpp | 117 ++- src/nonltr/ChromListMaker.h | 16 +- src/nonltr/Chromosome.cpp | 252 ++++- src/nonltr/Chromosome.h | 17 +- src/nonltr/ChromosomeOneDigit.cpp | 271 ++--- src/nonltr/ChromosomeOneDigit.h | 24 +- src/nonltr/ChromosomeOneDigitDna.cpp | 154 +++ src/nonltr/ChromosomeOneDigitDna.h | 36 + src/nonltr/ChromosomeOneDigitProtein.cpp | 64 ++ src/nonltr/ChromosomeOneDigitProtein.h | 28 + src/nonltr/KmerHashTable.cpp | 35 +- src/nonltr/KmerHashTable.h | 1 + src/{ => nonltr}/RepeatsDetector.cpp | 158 +-- src/nonltr/TableBuilder.cpp | 2 +- src/nonltr/Trainer.cpp | 10 +- src/predict/BestFirstSelector.cpp | 258 +++++ src/predict/BestFirstSelector.h | 25 + src/{cluster/src => predict}/Feature.cpp | 553 ++++++++-- src/{cluster/src => predict}/Feature.h | 62 +- src/predict/FeatureSelector.cpp | 110 ++ src/predict/FeatureSelector.h | 27 + src/{cluster/src => predict}/GLM.cpp | 9 +- src/{cluster/src => predict}/GLM.h | 2 + src/predict/GreedySelector.cpp | 154 +++ src/predict/GreedySelector.h | 23 + src/{cluster/src => predict}/HandleSeq.cpp | 20 +- src/{cluster/src => predict}/HandleSeq.h | 13 +- src/{cluster/src => predict}/Matrix.cpp | 10 +- src/{cluster/src => predict}/Matrix.h | 4 +- src/{cluster/src => predict}/MultiMute.cpp | 72 +- src/{cluster/src => predict}/MultiMute.h | 9 +- src/predict/Predictor.cpp | 992 ++++++++++++++++++ src/{cluster/src => predict}/Predictor.h | 21 +- src/predict/SingMute.cpp | 162 +++ src/{cluster/src => predict}/SingMute.h | 14 +- src/utility/AffineId.cpp | 212 ---- src/utility/AffineId.h | 50 - src/utility/Util.cpp | 8 +- src/utility/Util.h | 4 + 95 files changed, 5372 insertions(+), 4394 deletions(-) create mode 100644 .gitignore create mode 100644 CMakeLists.txt 
delete mode 100644 Makefile delete mode 100644 README create mode 100644 README.md delete mode 100644 src/Makefile rename src/cluster/{src/Runner.cpp => CRunner.cpp} (52%) rename src/cluster/{src/Runner.h => CRunner.h} (51%) rename src/cluster/{src => }/Center.h (97%) rename src/cluster/{src => }/ClusterFactory.cpp (79%) rename src/cluster/{src => }/ClusterFactory.h (94%) delete mode 100644 src/cluster/Makefile create mode 100644 src/cluster/Trainer.cpp create mode 100644 src/cluster/Trainer.h rename src/cluster/{src => }/bvec.cpp (70%) rename src/cluster/{src => }/bvec.h (96%) rename src/cluster/{src => }/bvec_iterator.cpp (88%) rename src/cluster/{src => }/bvec_iterator.h (100%) rename src/cluster/{src/main.cpp => meshclust2.cpp} (87%) delete mode 100644 src/cluster/src/Loader.cpp delete mode 100644 src/cluster/src/LogTable.cpp delete mode 100644 src/cluster/src/LogTable.h delete mode 100644 src/cluster/src/Mat.h delete mode 100644 src/cluster/src/NearestNeighbor.h delete mode 100644 src/cluster/src/Predictor.cpp delete mode 100644 src/cluster/src/Progress.cpp delete mode 100644 src/cluster/src/Random.h delete mode 100644 src/cluster/src/SingMute.cpp delete mode 100644 src/cluster/src/SingleFeature.cpp delete mode 100644 src/cluster/src/SingleFeature.h delete mode 100644 src/cluster/src/SingleMute.cpp delete mode 100644 src/cluster/src/SingleMute.h delete mode 100644 src/cluster/src/Trainer.cpp delete mode 100644 src/cluster/src/Trainer.h delete mode 100644 src/cluster/src/needleman_wunsch.cpp delete mode 100644 src/cluster/src/needleman_wunsch.h create mode 100644 src/clutil/Clock.cpp create mode 100644 src/clutil/Clock.h create mode 100644 src/clutil/Datatype.cpp create mode 100644 src/clutil/Datatype.h rename src/{cluster/src => clutil}/DivergencePoint.cpp (98%) rename src/{cluster/src => clutil}/DivergencePoint.h (98%) rename src/{cluster/src => clutil}/Histogram.cpp (100%) rename src/{cluster/src => clutil}/Histogram.h (100%) create mode 100644 
src/clutil/LCG.h create mode 100644 src/clutil/Loader.cpp rename src/{cluster/src => clutil}/Loader.h (70%) rename src/{cluster/src => clutil}/Point.h (76%) create mode 100644 src/clutil/Progress.cpp rename src/{cluster/src => clutil}/Progress.h (75%) create mode 100644 src/clutil/Random.h rename src/{cluster/src => clutil}/SingleFileLoader.cpp (67%) rename src/{cluster/src => clutil}/SingleFileLoader.h (85%) create mode 100644 src/fastcar/FC_Runner.cpp create mode 100644 src/fastcar/FC_Runner.h create mode 100644 src/fastcar/fastcar.cpp create mode 100644 src/nonltr/ChromosomeOneDigitDna.cpp create mode 100644 src/nonltr/ChromosomeOneDigitDna.h create mode 100644 src/nonltr/ChromosomeOneDigitProtein.cpp create mode 100644 src/nonltr/ChromosomeOneDigitProtein.h rename src/{ => nonltr}/RepeatsDetector.cpp (96%) create mode 100644 src/predict/BestFirstSelector.cpp create mode 100644 src/predict/BestFirstSelector.h rename src/{cluster/src => predict}/Feature.cpp (81%) rename src/{cluster/src => predict}/Feature.h (90%) create mode 100644 src/predict/FeatureSelector.cpp create mode 100644 src/predict/FeatureSelector.h rename src/{cluster/src => predict}/GLM.cpp (89%) rename src/{cluster/src => predict}/GLM.h (91%) create mode 100644 src/predict/GreedySelector.cpp create mode 100644 src/predict/GreedySelector.h rename src/{cluster/src => predict}/HandleSeq.cpp (89%) rename src/{cluster/src => predict}/HandleSeq.h (83%) rename src/{cluster/src => predict}/Matrix.cpp (97%) rename src/{cluster/src => predict}/Matrix.h (90%) rename src/{cluster/src => predict}/MultiMute.cpp (90%) rename src/{cluster/src => predict}/MultiMute.h (95%) create mode 100644 src/predict/Predictor.cpp rename src/{cluster/src => predict}/Predictor.h (79%) create mode 100644 src/predict/SingMute.cpp rename src/{cluster/src => predict}/SingMute.h (72%) delete mode 100644 src/utility/AffineId.cpp delete mode 100644 src/utility/AffineId.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 
0000000..e660fd9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +bin/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..37abd71 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,111 @@ +cmake_minimum_required (VERSION 3.1) +project (MeshClust2) + +include_directories(src/exception src/nonltr src/utility src/cluster src/prediction src/clutil src/fastcar) +set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin) +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}) + + +add_library(Fastcar + ${CMAKE_SOURCE_DIR}/src/fastcar/FC_Runner.cpp +) + +add_library(ClusterUtil + ${CMAKE_SOURCE_DIR}/src/clutil/DivergencePoint.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Histogram.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Loader.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/SingleFileLoader.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Progress.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Datatype.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Clock.cpp +) + +add_library(Predict + ${CMAKE_SOURCE_DIR}/src/predict/Feature.cpp + ${CMAKE_SOURCE_DIR}/src/predict/GLM.cpp + ${CMAKE_SOURCE_DIR}/src/predict/HandleSeq.cpp + ${CMAKE_SOURCE_DIR}/src/predict/Matrix.cpp + ${CMAKE_SOURCE_DIR}/src/predict/MultiMute.cpp + ${CMAKE_SOURCE_DIR}/src/predict/Predictor.cpp + ${CMAKE_SOURCE_DIR}/src/predict/SingMute.cpp + ${CMAKE_SOURCE_DIR}/src/predict/FeatureSelector.cpp + ${CMAKE_SOURCE_DIR}/src/predict/GreedySelector.cpp + ${CMAKE_SOURCE_DIR}/src/predict/BestFirstSelector.cpp +) + +add_library(Cluster + ${CMAKE_SOURCE_DIR}/src/cluster/ClusterFactory.cpp + ${CMAKE_SOURCE_DIR}/src/cluster/CRunner.cpp + ${CMAKE_SOURCE_DIR}/src/cluster/Trainer.cpp + ${CMAKE_SOURCE_DIR}/src/cluster/bvec.cpp + ${CMAKE_SOURCE_DIR}/src/cluster/bvec_iterator.cpp + +) + +add_library(Exception + ${CMAKE_SOURCE_DIR}/src/exception/FileDoesNotExistException.cpp + ${CMAKE_SOURCE_DIR}/src/exception/InvalidInputException.cpp + ${CMAKE_SOURCE_DIR}/src/exception/InvalidOperationException.cpp + 
${CMAKE_SOURCE_DIR}/src/exception/InvalidOrderOfOperationsException.cpp + ${CMAKE_SOURCE_DIR}/src/exception/InvalidScoreException.cpp + ${CMAKE_SOURCE_DIR}/src/exception/InvalidStateException.cpp +) + +add_library(Nonltr + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromDetectorMaxima.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromListMaker.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/Chromosome.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigit.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitDna.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitProtein.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeRandom.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/DetectorMaxima.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/HMM.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/LocationList.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/LocationListCollection.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/Scanner.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/Scorer.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/Trainer.cpp +) + +add_library(Utility + ${CMAKE_SOURCE_DIR}/src/utility/EmptyLocation.cpp + ${CMAKE_SOURCE_DIR}/src/utility/GlobAlignE.cpp + ${CMAKE_SOURCE_DIR}/src/utility/Location.cpp + ${CMAKE_SOURCE_DIR}/src/utility/Util.cpp +) + +target_include_directories(Exception PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Nonltr PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Utility PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Cluster PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Fastcar PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(ClusterUtil PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Predict PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +set (HEADER_FILES + ${CMAKE_SOURCE_DIR}/src/nonltr/KmerHashTable.h + ${CMAKE_SOURCE_DIR}/src/nonltr/EnrichmentMarkovView.h + ${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.h +) + +set (CMAKE_CXX_COMPILER g++) +set (CMAKE_CXX_STANDARD 11) +set (CMAKE_CXX_FLAGS "-fopenmp 
-g -O3 -march=native") + +target_compile_definitions(Cluster PRIVATE VERSION="2.3.0") +target_compile_definitions(Fastcar PRIVATE VERSION="0.7.1") + +add_executable(Red ${CMAKE_SOURCE_DIR}/src/nonltr/RepeatsDetector.cpp ) +add_executable(meshclust2 ${CMAKE_SOURCE_DIR}/src/cluster/meshclust2.cpp) +add_executable(fastcar ${CMAKE_SOURCE_DIR}/src/fastcar/fastcar.cpp) + +target_link_libraries(Red Exception Nonltr Utility ${HEADER_FILES}) +target_link_libraries(Utility Exception ${HEADER_FILES}) +target_link_libraries(Nonltr Utility Exception ${HEADER_FILES}) +target_link_libraries(ClusterUtil Nonltr ${HEADER_FILES}) +target_link_libraries(Predict ClusterUtil Nonltr ${HEADER_FILES}) +target_link_libraries(meshclust2 Cluster Nonltr ClusterUtil Predict ${HEADER_FILES}) +target_link_libraries(fastcar Nonltr ClusterUtil Fastcar Predict ${HEADER_FILES}) diff --git a/Makefile b/Makefile deleted file mode 100644 index 2e611c1..0000000 --- a/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -all: bin/Red.o bin/meshclust2 - -bin/Red.o: - mkdir -p bin - mkdir -p bin/exception - mkdir -p bin/nonltr - mkdir -p bin/utility - $(MAKE) -C src -bin/meshclust2: bin/Red.o - $(MAKE) -C src/cluster - cp src/cluster/meshclust2 bin - -clean: - $(MAKE) clean -C src - $(MAKE) clean -C src/cluster - $(RM) -r bin - -rebuild: clean all -.PHONY: all clean diff --git a/README b/README deleted file mode 100644 index 1388af3..0000000 --- a/README +++ /dev/null @@ -1,85 +0,0 @@ -MeShClust2 -Release version - -Requirements: g++ 4.9.1 or later, requires Homebrew on Mac OS X - -Compilation using g++ (homebrew) and GNU Make on Mac OS X -CXX=g++-7 make - -see: https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite - -Linux/Unix compilation: -make - - -If you find this tool helpful, please cite: - -James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278. 
- - -Usage: bin/meshclust2 --id 0.x [OPTIONS] *.fasta - ---id The most important parameter, --id, controls the identity cutoff of the sequences. - Needs to be between 0 and 1. - If it is not specified, an identity of 0.9 is used. - ---kmer decides the size of the kmers. It is by default automatically decided by average sequence - length, but if provided, MeShClust can speed up a little by not having to find the largest - sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. - ---mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} - changes the mutation generation algorithm. By default, "single" is used, utilizing only - single point mutations. On low identity data sets, "both", which includes single mutations - and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations, - disallowing single point mutations. Other options include "all", which includes single, - block, and nontypical mutations translocation and reversion. - ---feat determines the combinations of features to be used. By default, "slow" allows 11 - combinations to be selected from. "fast" removes 2 slower features from "slow" - which include logarithm based features, and "extraslow" includes 33 total features - used in a previous study. - ---min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs - will be used. Recall that features include pairwise combinations of the "feat" option. - ---max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly, - so a very large maximum is not advised. - ---sample selects the total number of sequences used for both training and testing. - 300 is the default value. Each sequence generates 10 synthetic mutants. - That is, --sample 300 provides 3000 training pairs and 3000 testing pairs. 
- ---min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. - Shouldn't need to be set normally, as lower identites take much longer, - especially with single mutations only. - ---threads sets the number of threads to be used. By default OpenMP uses the number of available cores - on your machine, but this parameter overwrites that. - ---output specifies the output file, in CD-HIT's CLSTR format, described below: - A '>Cluster ' followed by an increasing index designates a cluster. - Otherwise, the sequence is printed out. - A '*' at the end of a sequence designates the center of the cluster. - An example of a small data set: - >Cluster 0 - 0 993nt, >seq128 template_6... * - >Cluster 1 - 0 1043nt, >seq235 template_10... - 1 1000nt, >seq216 template_10... * - 2 1015nt, >seq237 template_10... - ---delta decides how many clusters are looked around in the final clustering stage. - Increasing it creates more accuracy, but takes more time. Default value is 5. - ---iterations specifies how many iterations in the final stage of merging are done until convergence. - Default value is 15. - - - -If the argument is not listed here, it is interpreted as an input (FASTA format) file. - - -License - -Academic use: The software is provided as-is under the GNU GPLv3. -Any restrictions to use for-profit or non-academics: License needed. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e099f63 --- /dev/null +++ b/README.md @@ -0,0 +1,113 @@ +## MeShClust2 +Release version - 2.3.0 + +### Requirements +g++ 4.9.1 or later, requires Homebrew on Mac OS X +Compilation using g++ (homebrew) and CMake on Mac OS X see [this link](https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite) + +### Linux/Unix compilation +> mkdir bin && cd bin +> cmake .. +> make + +### Citation +If you find this tool helpful, please cite: + +[James, Benjamin T. et al. 
(2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.](https://doi.org/10.1101/451278) + +### Usage + + Usage: meshclust2 --id 0.x [OPTIONS] *.fasta + + --id The most important parameter, --id, controls the identity cutoff of the sequences. + Needs to be between 0 and 1. + If it is not specified, an identity of 0.9 is used. + + --kmer decides the size of the kmers. It is by default automatically decided by average sequence + length, but if provided, MeShClust can speed up a little by not having to find the largest + sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. + + --dump Run until the classifier is trained, and then dump the weights to the file, + default 'weights.txt'. Can be used with --recover to recover the weights + instead of re-training. + + --recover Recover weights for the classifier trained by a previous run which used --dump to dump + the weights. + + --list Instead of specifying files as extra arguments, provide a text file with + a list of files. Can use pipes or process substitutions such as "--list <(ls *.fasta) " + + --no-train-list Same as --list, but these files are not passed to the classifier, + e.g. unassembled genomes + + --mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} + changes the mutation generation algorithm. By default, "both" is used, utilizing + single point and block mutations. On higher identity data sets, "single", which includes only single point mutations, + is preferable. The option "nonsingle-typical" uses only block mutations, + disallowing single point mutations. Other options include "all", which includes single, + block, and nontypical mutations translocation and reversion. + + --feat determines the combinations of features to be used. By default, "slow" allows 11 + combinations to be selected from. 
"fast" removes 2 slower features from "slow" + which include logarithm based features. + + --single-file Using this option, (no value is needed), each file is treated as a single sequence. + If multiple sequences in a file are encountered, they are joined with 50 Ns, + and the k-mers are not counted in that region. + However, to be most accurate, it is advised to not use these sequences in the + training step (for mutations) and instead 1) train using un-joined sequences and + use --dump to dump to a file, and 2) use --recover with --single-file for the + file list. + + --sample selects the total number of sequences used for both training and testing. + 2000 is the default value. That is, --sample 2000 provides 2000 training + pairs and 2000 testing pairs. + + --num-templates selects the number of "template" sequences from which to mutate. + For example, if 300 (the default) templates are requested, and the number of + "samples" is requested to be 2000 (the default), 300 sequences will be read in + and mutated 2000/300 times each to create 2000 semi-synthetic pairs. + + --min-feat (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs + will be used. Recall that features include pairwise combinations of the "feat" option. + + --max-feat (default 4) sets the maximum feature pairs to be used. Diminishing returns appear quickly, + so a very large maximum (>10) is not advised. + + --min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. + Shouldn't need to be set normally, as lower identities take much longer, + especially with single mutations only. + + --datatype (8,16,32,64) Decides the integer size of the histograms. If not provided, + all sequences are read in and counted to ensure the largest k-mer count does not + overflow. If the provided integer size is too small, the counts will overflow. + + --threads sets the number of threads to be used. 
By default OpenMP uses the number of available cores + on your machine, but this parameter overrides that. + + --output specifies the output file, in CD-HIT's CLSTR format, described below: + A '>Cluster ' followed by an increasing index designates a cluster. + Otherwise, the sequence is printed out. + A '*' at the end of a sequence designates the center of the cluster. + An example of a small data set: + + >Cluster 0 + 0 993nt, >seq128 template_6... * + >Cluster 1 + 0 1043nt, >seq235 template_10... + 1 1000nt, >seq216 template_10... * + 2 1015nt, >seq237 template_10... + + --delta decides how many clusters are looked around in the final clustering stage. + Increasing it improves accuracy, but takes more time. Default value is 5. + + --iterations specifies how many iterations in the final stage of merging are done until convergence. + Default value is 15. + + If the argument is not listed here, it is interpreted as an input (FASTA format) file. + + +### License + +Academic use: The software is provided as-is under the GNU GPLv3. +Any restrictions to use for-profit or non-academics: License needed. 
diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index 3013ed0..0000000 --- a/src/Makefile +++ /dev/null @@ -1,175 +0,0 @@ -# CXX = /usr/bin/c++ -CXX ?= g++ - -CXXFLAGS = -O3 -g -fmessage-length=0 -Wall -march=native -std=c++11 - -# -# Objects -# - -ORed = ../bin/Red.o - -# Exception -OInvalidInputException = ../bin/exception/InvalidInputException.o -OInvalidStateException = ../bin/exception/InvalidStateException.o -OFileDoesNotExistException = ../bin/exception/FileDoesNotExistException.o -OInvalidOrderOfOperationsException = ../bin/exception/InvalidOrderOfOperationsException.o -OInvalidScoreException = ../bin/exception/InvalidScoreException.o -OInvalidOperationException = ../bin/exception/InvalidOperationException.o - -# Utility -OUtil = ../bin/utility/Util.o -OLocation = ../bin/utility/Location.o -OEmptyLocation = ../bin/utility/EmptyLocation.o -OLCSLen = ../bin/utility/LCSLen.o -OAffineId = ../bin/utility/AffineId.o -OGlobAlignE = ../bin/utility/GlobAlignE.o - -# Non TR -OChromosome = ../bin/nonltr/Chromosome.o -OChromosomeOneDigit = ../bin/nonltr/ChromosomeOneDigit.o -OChromosomeRandom = ../bin/nonltr/ChromosomeRandom.o -OChromListMaker = ../bin/nonltr/ChromListMaker.o -OTableBuilder = ../bin/nonltr/TableBuilder.o -OScorer = ../bin/nonltr/Scorer.o -ODetectorMaxima = ../bin/nonltr/DetectorMaxima.o -OChromDetectorMaxima = ../bin/nonltr/ChromDetectorMaxima.o -OHMM = ../bin/nonltr/HMM.o -OScanner = ../bin/nonltr/Scanner.o -OTrainer = ../bin/nonltr/Trainer.o -OLocationList = ../bin/nonltr/LocationList.o -OLocationListCollection = ../bin/nonltr/LocationListCollection.o - -OBJS = $(ORed) $(OInvalidInputException) $(OInvalidStateException) $(OFileDoesNotExistException) $(OInvalidOrderOfOperationsException) $(OInvalidOperationException) $(OInvalidScoreException) $(OUtil) $(OLocation) $(OEmptyLocation) $(OChromosome) $(OChromosomeOneDigit) $(OChromosomeRandom) $(OChromListMaker) $(OTableBuilder) $(OScorer) $(ODetectorMaxima) $(OChromDetector) 
$(OChromDetectorMaxima) $(OHMM) $(OScanner) $(OTrainer) $(OLocationList) $(OLocationListCollection) $(OLCSLen) $(OAffineId) $(OGlobAlignE) - -# -# Target -# - -TRed = ../bin/Red - -# -# Make RepeatsDetector -# - -$(TRed): $(OBJS) - $(CXX) -o $(TRed) $(OBJS) - -# -# RepeatsDetector -# - -$(ORed): RepeatsDetector.cpp nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/TableBuilder.h nonltr/HMM.h nonltr/Scanner.h nonltr/Trainer.h utility/Util.h - $(CXX) $(CXXFLAGS) -c RepeatsDetector.cpp -o $(ORed) - -# -# Exception -# -$(OInvalidInputException): exception/InvalidInputException.cpp exception/InvalidInputException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidInputException.cpp -o $(OInvalidInputException) - -$(OInvalidStateException): exception/InvalidStateException.cpp exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidStateException.cpp -o $(OInvalidStateException) - -$(OFileDoesNotExistException): exception/FileDoesNotExistException.cpp exception/FileDoesNotExistException.h - $(CXX) $(CXXFLAGS) -c exception/FileDoesNotExistException.cpp -o $(OFileDoesNotExistException) - -$(OInvalidOrderOfOperationsException): exception/InvalidOrderOfOperationsException.cpp exception/InvalidOrderOfOperationsException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidOrderOfOperationsException.cpp -o $(OInvalidOrderOfOperationsException) - -$(OInvalidScoreException): exception/InvalidScoreException.cpp exception/InvalidScoreException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidScoreException.cpp -o $(OInvalidScoreException) - -$(OInvalidOperationException): exception/InvalidOperationException.cpp exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidOperationException.cpp -o $(OInvalidOperationException) - -# -# Utility -# - -$(OUtil): utility/Util.cpp utility/Util.h utility/Location.h exception/FileDoesNotExistException.h - $(CXX) $(CXXFLAGS) -c utility/Util.cpp -o $(OUtil) - -$(OLocation): utility/Location.cpp utility/Location.h 
utility/ILocation.h exception/InvalidInputException.h utility/Util.h - $(CXX) $(CXXFLAGS) -c utility/Location.cpp -o $(OLocation) - -$(OEmptyLocation): utility/EmptyLocation.cpp utility/EmptyLocation.h utility/ILocation.h exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c utility/EmptyLocation.cpp -o $(OEmptyLocation) - -$(OLCSLen): utility/LCSLen.cpp utility/LCSLen.h - $(CXX) $(CXXFLAGS) -c utility/LCSLen.cpp -o $(OLCSLen) - -$(OAffineId): utility/AffineId.cpp utility/AffineId.h - $(CXX) $(CXXFLAGS) -c utility/AffineId.cpp -o $(OAffineId) - -$(OGlobAlignE): utility/GlobAlignE.cpp utility/GlobAlignE.h - $(CXX) $(CXXFLAGS) -c utility/GlobAlignE.cpp -o $(OGlobAlignE) -# -# Non LTR -# - -$(OChromosome): nonltr/Chromosome.cpp nonltr/Chromosome.h nonltr/IChromosome.h utility/Util.h exception/InvalidInputException.h exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c nonltr/Chromosome.cpp -o $(OChromosome) - -$(OChromosomeOneDigit): nonltr/ChromosomeOneDigit.cpp nonltr/ChromosomeOneDigit.h nonltr/Chromosome.h exception/InvalidInputException.h - $(CXX) $(CXXFLAGS) -c nonltr/ChromosomeOneDigit.cpp -o $(OChromosomeOneDigit) - -$(OChromosomeRandom): nonltr/ChromosomeRandom.cpp nonltr/ChromosomeRandom.h nonltr/IChromosome.h exception/InvalidInputException.h exception/InvalidStateException.h utility/Util.h - $(CXX) $(CXXFLAGS) -c nonltr/ChromosomeRandom.cpp -o $(OChromosomeRandom) - -$(OTableBuilder): nonltr/TableBuilder.cpp nonltr/TableBuilder.h utility/Util.h nonltr/ChromosomeOneDigit.h nonltr/ITableView.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/EnrichmentMarkovView.h nonltr/EnrichmentMarkovView.cpp exception/InvalidStateException.h nonltr/ChromListMaker.h nonltr/IChromosome.h - $(CXX) $(CXXFLAGS) -c nonltr/TableBuilder.cpp -o $(OTableBuilder) - -$(OScorer): nonltr/Scorer.cpp nonltr/Scorer.h nonltr/ChromosomeOneDigit.h utility/Util.h exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c nonltr/Scorer.cpp -o $(OScorer) - 
-$(ODetectorMaxima): nonltr/DetectorMaxima.cpp nonltr/DetectorMaxima.h utility/ILocation.h exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c nonltr/DetectorMaxima.cpp -o $(ODetectorMaxima) - -$(OChromDetectorMaxima): nonltr/ChromDetectorMaxima.cpp nonltr/ChromDetectorMaxima.h nonltr/DetectorMaxima.h nonltr/ChromosomeOneDigit.h utility/Util.h utility/ILocation.h utility/Location.h - $(CXX) $(CXXFLAGS) -c nonltr/ChromDetectorMaxima.cpp -o $(OChromDetectorMaxima) - -$(OHMM): nonltr/HMM.cpp nonltr/HMM.h utility/ILocation.h exception/InvalidStateException.h exception/InvalidInputException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c nonltr/HMM.cpp -o $(OHMM) - -$(OScanner): nonltr/Scanner.cpp nonltr/Scanner.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h nonltr/HMM.h nonltr/ITableView.h nonltr/Scorer.h utility/Util.h utility/ILocation.h exception/InvalidInputException.h exception/InvalidStateException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c nonltr/Scanner.cpp -o $(OScanner) - -$(OTrainer): nonltr/Trainer.cpp nonltr/Trainer.h nonltr/TableBuilder.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/HMM.h nonltr/ChromDetectorMaxima.h nonltr/Scorer.h nonltr/ChromListMaker.h utility/Util.h nonltr/LocationListCollection.h - $(CXX) $(CXXFLAGS) -c nonltr/Trainer.cpp -o $(OTrainer) - -$(OChromListMaker): nonltr/ChromListMaker.cpp nonltr/ChromListMaker.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h utility/Util.h - $(CXX) $(CXXFLAGS) -c nonltr/ChromListMaker.cpp -o $(OChromListMaker) - -$(OCluster): nonltr/Cluster.cpp nonltr/Cluster.h utility/Util.h exception/InvalidStateException.h exception/InvalidInputException.h - $(CXX) $(CXXFLAGS) -c nonltr/Cluster.cpp -o $(OCluster) - -$(OLocationList): nonltr/LocationList.cpp nonltr/LocationList.h utility/ILocation.h utility/Location.h exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c 
nonltr/LocationList.cpp -o $(OLocationList) - -$(OLocationListCollection): nonltr/LocationListCollection.cpp nonltr/LocationListCollection.h utility/Location.h exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c nonltr/LocationListCollection.cpp -o $(OLocationListCollection) - - -# -# Make binary directories -# - -red: $(TRed) - -# -# Make Red -# - -bin: - mkdir ../bin - mkdir ../bin/exception - mkdir ../bin/utility - mkdir ../bin/nonltr - -# -# Make clean -# - -clean: - rm -f ../bin/*.o ../bin/exception/*.o ../bin/ms/*.o ../bin/nonltr/*.o ../bin/test/*.o ../bin/utility/*.o ../bin/tr/*.o *.o $(TRed) diff --git a/src/cluster/src/Runner.cpp b/src/cluster/CRunner.cpp similarity index 52% rename from src/cluster/src/Runner.cpp rename to src/cluster/CRunner.cpp index 8a367d7..16514fd 100644 --- a/src/cluster/src/Runner.cpp +++ b/src/cluster/CRunner.cpp @@ -9,77 +9,117 @@ #include #include #include -#include "../../nonltr/ChromListMaker.h" -#include "../../utility/AffineId.h" -#include "Runner.h" +#include "../nonltr/ChromListMaker.h" +#include "../clutil/Datatype.h" +#include "../clutil/Loader.h" +#include "../clutil/Clock.h" +#include "CRunner.h" #include "Trainer.h" #include "ClusterFactory.h" #include "bvec.h" -#include "Progress.h" +#include "../clutil/Progress.h" +#include "../predict/Predictor.h" #ifdef _OPENMP #include #endif + +/* + * Constructor for runner. + * Gets the options + * If --recover is passed, set the K using that + * Otherwise, if the K wasn't set, find the K by iterating through all sequences + */ Runner::Runner(int argc, char **argv) { get_opts(argc, argv); - if (k == -1) { - auto pr = find_k(); - k = pr.first; - } - // if (similarity < 0.6) { - // align = true; - // } - if (sample_size == 0) { - if (similarity < 0.6) { - sample_size = 1000; - } else { - sample_size = 300; - } + if (pred64 != NULL) { + k = pred64->get_k(); + } else if (k == -1) { + k = find_k(); } srand(10); } + +/* + * Main entry point for MeShClust2. 
+ * Sets datatype and identity if --recover was used + * If datatype wasn't set, run through and find the max histogram + * Based on the max histogram, call do_run with the smallest possible + * histogram that will fit all sequences + */ int Runner::run() { - largest_count = 0; - Progress progress(files.size(), "Reading in sequences"); - for (auto i = 0; i < files.size(); i++) { - auto f = files.at(i); - ChromListMaker maker(f); - auto chromList = maker.makeChromOneDigitList(); - - progress++; + if (pred64 != NULL) { + Datatype::set(pred64->get_datatype()); + similarity = pred64->get_id(); + } else if (Datatype::get() == "") { + largest_count = 0; + Progress progress(all_files.size(), "Reading in sequences"); +#pragma omp parallel for + for (auto i = 0; i < all_files.size(); i++) { + auto f = all_files.at(i); + ChromListMaker maker(f, is_single_file); + auto chromList = maker.makeChromOneDigitDnaList(); + + // cout << "Reading in sequences from " << f << "..." << endl; - uint64_t local_largest_count = 0; -#pragma omp parallel for reduction(max:local_largest_count) - for (int i = 0; i < chromList->size(); i++) { - std::vector values; - KmerHashTable table(k, 1); - ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(i)); - fill_table(table, chrom, values); - uint64_t l_count = *std::max_element(values.begin(), values.end()); - if (l_count > local_largest_count) { - local_largest_count = l_count; + uint64_t local_largest_count = 0; +//#pragma omp parallel for reduction(max:local_largest_count) + for (int i = 0; i < chromList->size(); i++) { + std::vector values; + KmerHashTable table(k, 1); + ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(i)); + fill_table(table, chrom, values); + uint64_t l_count = *std::max_element(values.begin(), values.end()); + if (l_count > local_largest_count) { + local_largest_count = l_count; + // #pragma omp critical + // cout << "local largest count reset to " << local_largest_count << endl; + } + } + + #pragma omp critical + { + 
if (local_largest_count > largest_count) { + largest_count = local_largest_count; + // #pragma omp critical + // cout << "largest count updated to " << largest_count << endl; + } + progress++; } } - if (local_largest_count > largest_count) { - largest_count = local_largest_count; - } + progress.end(); + cout << "Largest count: " << largest_count << endl; } - progress.end(); - + if (Datatype::get() != "") { + std::string type = Datatype::get(); + if (type == "uint8_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint16_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint32_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint64_t") { + largest_count = std::numeric_limits::max(); + } + } if (largest_count <= std::numeric_limits::max()) { cout << "Using 8 bit histograms" << endl; + Datatype::set("uint8_t"); return do_run(); } else if (largest_count <= std::numeric_limits::max()) { cout << "Using 16 bit histograms" << endl; + Datatype::set("uint16_t"); return do_run(); } else if (largest_count <= std::numeric_limits::max()){ cout << "Using 32 bit histograms" << endl; + Datatype::set("uint32_t"); return do_run(); } else if (largest_count <= std::numeric_limits::max()) { cout << "Using 64 bit histograms" << endl; + Datatype::set("uint64_t"); return do_run(); } else { throw "Too big sequence"; @@ -108,32 +148,61 @@ void usage(std::string progname) length, but if provided, MeShClust can speed up a little by not having to find the largest sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. +--dump Run until the classifier is trained, and then dump the weights to the file, + default 'weights.txt'. Can be used with --recover to recover the weights + instead of re-training. + +--recover Recover weights for the classifier trained by a previous run which used --dump to dump + the weights. 
+ +--list Instead of specifying files as extra arguments, provide a text file with + a list of files. Can use pipes or process substitutions such as "--list <(ls *.fasta) " + +--no-train-list Same as --list, but these files are not passed to the classifier, + e.g. unassembled genomes + --mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} - changes the mutation generation algorithm. By default, "single" is used, utilizing only - single point mutations. On low identity data sets, "both", which includes single mutations - and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations, + changes the mutation generation algorithm. By default, "both" is used, utilizing + single point and block mutations. On higher identity data sets, "single", which includes only single point mutations, + is preferable. The option "nonsingle-typical" uses only block mutations, disallowing single point mutations. Other options include "all", which includes single, block, and nontypical mutations translocation and reversion. --feat determines the combinations of features to be used. By default, "slow" allows 11 combinations to be selected from. "fast" removes 2 slower features from "slow" - which include logarithm based features, and "extraslow" includes 33 total features - used in a previous study. - ---min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs - will be used. Recall that features include pairwise combinations of the "feat" option. + which include logarithm based features. ---max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly, - so a very large maximum is not advised. +--single-file Using this option, (no value is needed), each file is treated as a single sequence. + If multiple sequences in a file are encountered, they are joined with 50 Ns, + and the k-mers are not counted in that region. 
+ However, to be most accurate, it is advised to not use these sequences in the + training step (for mutations) and instead 1) train using un-joined sequences and + use --dump to dump to a file, and 2) use --recover with --single-file for the + file list. --sample selects the total number of sequences used for both training and testing. - 300 is the default value. Each sequence generates 10 synthetic mutants. - That is, --sample 300 provides 3000 training pairs and 3000 testing pairs. + 2000 is the default value. That is, --sample 2000 provides 2000 training + pairs and 2000 testing pairs. + +--num-templates selects the number of "template" sequences from which to mutate. + For example, if 300 (the default) templates are requested, and the number of + "samples" is requested to be 2000 (the default), 300 sequences will be read in + and mutated 2000/300 times each to create 2000 semi-synthetic pairs. + +--min-feat (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs + will be used. Recall that features include pairwise combinations of the "feat" option. + +--max-feat (default 4) sets the maximum feature pairs to be used. Diminishing returns appears quickly, + so a very large maximum (>10) is not advised. --min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. Shouldn't need to be set normally, as lower identites take much longer, especially with single mutations only. +--datatype (8,16,32,64) Decides the integer size of the histograms. If not provided, + all sequences are read in and counted to ensure the largest k-mer does not + overflow. If the provided k-mer is too small, it will overflow. + --threads sets the number of threads to be used. By default OpenMP uses the number of available cores on your machine, but this parameter overwrites that. 
@@ -187,6 +256,45 @@ void Runner::get_opts(int argc, char **argv) exit(EXIT_FAILURE); } i++; + } else if (arg == "--single-file") { + is_single_file = true; + } else if ((arg == "--list" || arg == "-l") && i + 1 < argc) { + std::ifstream in(argv[++i]); + std::string line; + while (getline(in, line)) { + files.push_back(line); + } + } else if ((arg == "--no-train-list" || arg == "--notrain-list") && i + 1 < argc) { + std::ifstream in(argv[++i]); + std::string line; + while (getline(in, line)) { + notrain_files.push_back(line); + } + } else if (arg == "--dump") { + if (i + 1 < argc && argv[i+1][0] != '-') { + dump_str = argv[++i]; + } + dump = true; + } else if ((arg == "--datatype") && i + 1 < argc) { + std::string val = argv[++i]; + if (val == "uint8_t" || val == "8" || val == "uint8") { + Datatype::set("uint8_t"); + } else if (val == "uint16_t" || val == "16" || val == "uint16") { + Datatype::set("uint16_t"); + } else if (val == "uint32_t" || val == "32" || val == "uint32") { + Datatype::set("uint32_t"); + } else if (val == "uint64_t" || val == "64" || val == "uint64") { + Datatype::set("uint64_t"); + } else { + cerr << "Histogram data type must have a valid data type or size: one of 8, 16, 32, 64" << endl; + exit(EXIT_FAILURE); + } + } else if ((arg == "-r" || arg == "--recover") && i + 1 < argc) { + dump_str = argv[++i]; + recover = true; + pred64 = new Predictor(dump_str); + similarity = pred64->get_id(); + k = pred64->get_k(); } else if (arg == "--min-id" && i + 1 < argc) { try { std::string opt = argv[i+1]; @@ -199,6 +307,8 @@ void Runner::get_opts(int argc, char **argv) exit(EXIT_FAILURE); } i++; + } else if ((arg == "-b" || arg == "--bias") && i + 1 < argc) { + bias = std::stod(argv[++i]); } else if ((arg == "-k" || arg == "--kmer") && i + 1 < argc) { k = strtol(argv[i+1], NULL, 10); if (errno) { @@ -212,26 +322,26 @@ void Runner::get_opts(int argc, char **argv) } else if ((arg == "-o" || arg == "--output") && i + 1 < argc) { output = string(argv[i+1]); 
i++; + } else if ((arg == "--num-templates") && i + 1 < argc) { + n_templates = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (n_templates <= 0) { + fprintf(stderr, "Number of templates must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; } else if ((arg == "-s" || arg == "--sample") && i + 1 < argc) { - sample_size = strtol(argv[i+1], NULL, 10); + total_sample_size = strtol(argv[i+1], NULL, 10); if (errno) { perror(argv[i+1]); exit(EXIT_FAILURE); - } else if (sample_size <= 0) { + } else if (total_sample_size <= 0) { fprintf(stderr, "Sample size must be greater than 0.\n"); exit(EXIT_FAILURE); } i++; - // } else if ((arg == "-p" || arg == "--pivot") && i + 1 < argc) { - // pivots = strtol(argv[i+1], NULL, 10); - // if (errno) { - // perror(argv[i+1]); - // exit(EXIT_FAILURE); - // } else if (sample_size <= 0) { - // fprintf(stderr, "Points per pivot must be greater than 0.\n"); - // exit(EXIT_FAILURE); - // } - // i++; } else if ((arg == "--mut-type") && i + 1 < argc) { std::string opt = argv[i+1]; if (opt == "all") { @@ -342,6 +452,20 @@ void Runner::get_opts(int argc, char **argv) } } } + set file_list(files.begin(), files.end()); + set notrain_file_list; + set all_files_list(files.begin(), files.end()); + for (std::string val : notrain_files) { + if (file_list.find(val) == file_list.end()) { + notrain_file_list.insert(val); + all_files_list.insert(val); + } + } + files.assign(file_list.begin(), file_list.end()); + notrain_files.assign(notrain_file_list.begin(), + notrain_file_list.end()); + all_files.assign(all_files_list.begin(), + all_files_list.end()); if (files.empty()) { usage(*argv); exit(EXIT_FAILURE); @@ -352,53 +476,104 @@ void Runner::get_opts(int argc, char **argv) } } -pair Runner::find_k() +int Runner::find_k() { unsigned long long count = 0, length = 0, largest_count = 0; - uint64_t longest_seq = 0; - uintmax_t num_sequences = 0; - for (auto f : files) { - ChromListMaker maker(f); - 
auto chromList = maker.makeChromOneDigitList(); +#pragma omp parallel for + for (size_t i = 0; i < all_files.size(); i++) { +// cout << "Processing " << f << endl; + ChromListMaker maker(all_files.at(i), is_single_file); + auto chromList = maker.makeChromList(); unsigned long long l = 0; for (int i = 0; i < chromList->size(); i++) { - ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(i)); - auto sz = chrom->size(); + Chromosome *chrom = dynamic_cast(chromList->at(i)); + auto sz = chrom->getEffectiveSize(); l += sz; - if (sz > longest_seq) { - longest_seq = sz; - } - num_sequences++; - } l /= chromList->size(); +#pragma omp atomic length += l; } length /= files.size(); int newk = ceil(log(length) / log(4)) - 1; cout << "avg length: " << length << endl; cout << "Recommended K: " << newk << endl; - return make_pair(newk, longest_seq); + return newk; } +template +void get_points(std::vector files, std::vector*> &points, int k, uintmax_t &_id, bool is_single_file, bool set_seq=true, int chunk_size=10000) +{ + if (files.empty()) { + return; + } + auto sort_func = [](Point*a, Point*b) { + return a->get_length() < b->get_length(); + }; + auto sort_hdr = [](Point*a, Point*b) { + return a->get_header() < b->get_header(); + }; + int n_threads = omp_get_max_threads(); + std::ostringstream oss; + oss << "Counting " << k << "-mers"; + Progress prog(files.size(), oss.str()); + #pragma omp parallel for + for (size_t i = 0; i < files.size(); i++) { + ChromListMaker maker(files.at(i), is_single_file); + auto chromList = maker.makeChromOneDigitDnaList(); + for (auto elt : *chromList) { + ChromosomeOneDigitDna* chrom = dynamic_cast(elt); + Point* pt = Loader::get_point(chrom, _id, k); +#pragma omp critical + points.push_back(pt); + } +#pragma omp critical + prog++; + } + prog.end(); + std::string warning = Loader::get_warning(); + if (warning != "") { + cout << warning << endl; + } + std::sort(points.begin(), points.end(), sort_hdr); + std::sort(points.begin(), points.end(), 
sort_func); + // for (auto seq : points) { + // cout << "SEQ " << seq->get_header() << endl; + // } -double global_mat[4][4] = {{1, -1, -1, -1}, - {-1, 1, -1, -1}, - {-1, -1, 1, -1}, - {-1, -1, -1, 1}}; -double global_sigma = -2; -double global_epsilon = -1; +} +/* + * Main launching point for the algorithm + * + * Reads in all points + * Trains the model using a previous model if provided + * Else trains the model from scratch + * Initializes bvec search structure + * Runs mean shift + */ template int Runner::do_run() { using pvec = vector *>; using pmap = map*, pvec*>; - - ClusterFactory factory(k); - auto points = factory.build_points(files, [&](nonltr::ChromosomeOneDigit *p){ return factory.get_divergence_point(p); }); - Trainer tr(points, sample_size, largest_count, similarity, pivots, global_mat, global_sigma, global_epsilon, align ? 0 : k); - tr.train(min_n_feat, max_n_feat, feat_type, mut_type, min_id); + uintmax_t s_id = 0; + Predictor::set_bias(bias); + + pvec points; + get_points(files, points, k, s_id, is_single_file); + Clock::stamp("read_in_points"); + Trainer tr(points, total_sample_size, largest_count, similarity, n_templates, k); + if (recover) { + tr.train(dump_str); + } else { + // If we are working in low-identity space, get more room + if (similarity < 0.6) { + min_id = 0.2; + } + tr.train(min_n_feat, max_n_feat, feat_type, mut_type, min_id, dump ? 
dump_str : ""); + } + get_points(notrain_files, points, k, s_id, is_single_file, false); vector lengths; for (Point* p : points) { if (!align) { @@ -416,28 +591,7 @@ int Runner::do_run() bv.insert(p); } bv.insert_finalize(); -// cout << "bv size: " << bv.report() << endl; - // Point* mid = points[points.size()/2]; - // auto rng = bv.get_range(mid->get_length() * 0.99, - // mid->get_length() / 0.99); - // auto begin = bv.iter(rng.first); - // auto end = bv.iter(rng.second); - // size_t before = bv.report(); - // for (int i = 0; i < 1; i++) { - // bool is_min = false; - // Point* p = tr.get_close(mid, begin, end, is_min); - // size_t after = bv.report(); - // if (is_min) { - // string expr = (after + 1 == before) ? "true" : "false"; - // if (expr == "false") { - // throw expr; - // } - // cout << expr << endl; - // cout << "is min" << endl; - // } else { - // cout << "is not min" << endl; - // } - // } + ClusterFactory factory(k); factory.MS(bv, bandwidth, similarity, tr, output, iterations, delta); return 0; } diff --git a/src/cluster/src/Runner.h b/src/cluster/CRunner.h similarity index 51% rename from src/cluster/src/Runner.h rename to src/cluster/CRunner.h index 9cc8c04..1ad6abb 100644 --- a/src/cluster/src/Runner.h +++ b/src/cluster/CRunner.h @@ -4,14 +4,14 @@ * * Author: Benjamin T James */ -#ifndef RUNNER_H -#define RUNNER_H +#ifndef CRUNNER_H +#define CRUNNER_H #include #include -#include "Point.h" -#include "HandleSeq.h" -#include "Predictor.h" +#include "../clutil/Point.h" +#include "../predict/HandleSeq.h" +#include "../predict/Predictor.h" using namespace std; class Runner { @@ -29,16 +29,23 @@ class Runner { int iterations = 15; int delta = 5; bool align = false; - int sample_size = 0; - int pivots = 40; - int min_n_feat = 3; - int max_n_feat = 5; - int mut_type = HandleSeq::SINGLE; - uint64_t feat_type = PRED_FEAT_FAST | PRED_FEAT_DIV; + int total_sample_size = 2000; + int n_templates = 300; + int min_n_feat = 4; + int max_n_feat = 4; + bool 
is_single_file = false; + double bias = 0; + int mut_type = HandleSeq::BOTH; + uint64_t feat_type = PRED_FEAT_FAST; double min_id = 0.35; - std::vector files; + std::vector files, notrain_files, all_files; string output = "output.clstr"; void get_opts(int argc, char** argv); - pair find_k(); + int find_k(); + + bool dump = false; + bool recover = false; + std::string dump_str = "weights.txt"; + Predictor *pred64 = NULL; }; #endif diff --git a/src/cluster/src/Center.h b/src/cluster/Center.h similarity index 97% rename from src/cluster/src/Center.h rename to src/cluster/Center.h index 8c2acc5..52c1cbd 100644 --- a/src/cluster/src/Center.h +++ b/src/cluster/Center.h @@ -7,7 +7,7 @@ #ifndef CENTER_H #define CENTER_H -#include "Point.h" +#include "../clutil/Point.h" template struct Center { diff --git a/src/cluster/src/ClusterFactory.cpp b/src/cluster/ClusterFactory.cpp similarity index 79% rename from src/cluster/src/ClusterFactory.cpp rename to src/cluster/ClusterFactory.cpp index c6ce7ae..c0f25fc 100644 --- a/src/cluster/src/ClusterFactory.cpp +++ b/src/cluster/ClusterFactory.cpp @@ -15,14 +15,16 @@ #include #include #include -#include "Histogram.h" -#include "../../nonltr/KmerHashTable.h" -#include "../../nonltr/ChromListMaker.h" -#include "DivergencePoint.h" +#include "../clutil/Histogram.h" +#include "../nonltr/KmerHashTable.h" +#include "../nonltr/ChromListMaker.h" +#include "../clutil/DivergencePoint.h" +#include "../clutil/Clock.h" #include "Center.h" -#include "Progress.h" +#include "../clutil/Progress.h" //#include +/* The main method is MS() */ template T avg_distance(Point &c, const std::vector*> &vec) { @@ -308,31 +310,25 @@ void mean_shift_update(vector > &part, int j, const Trainer& trn, i } } trn.filter(center, good); + uint64_t cen_len = center->get_length(); + uint64_t min_len = trn.get_id() * cen_len; + uint64_t max_len = cen_len / trn.get_id(); if (!good.empty()) { + for (auto p : good) { + uint64_t p_len = p.first->get_length(); + bool within = 
p_len >= min_len && p_len <= max_len; +// cout << "CEN " << j << " " << cen_len << " " << min_len << " " << max_len << " " << p_len << " " << (within ? "TRUE" : "FALSE") << endl; p.first->set_arg_to_this_d(*temp); *top += *temp; bottom++; } *top /= bottom; Point* next = trn.closest(top, good); - // Point *next = NULL; - // int next_dist = std::numeric_limits::max(); - // for (int i = 0; i < N; i++) { - // int dist = points[i]->distance_d(*top); - // if (dist < next_dist) { - // next_dist = dist; - // next = points[i]; - // } - // } - if (next != NULL) { - center->set(*next); - center->set_data_str(next->get_data_str()); - } else { - cerr << "mean shift: NULL" << endl; - } - } else { - //cout << "GOOD: EMPTY" << endl; + center->set(*next); + } else if (delta == 0) { + Point* first = part[j].getPoints()[0]; + center->set(*first); } delete top; delete temp; @@ -390,35 +386,12 @@ bool merge(vector > ¢ers, const Trainer& trn, int delta, int ba for (int i = 0; i < centers.size(); i++) { long ret = trn.merge(centers, i, i + 1, std::min((int)centers.size()-1, i + delta)); if (ret > i) { - num_merge++; auto &to_add = centers[ret].getPoints(); auto &to_del = centers[i].getPoints(); to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); centers[i].lazy_remove(); } - // vector*,double> > to_merge; - // for (int j = i + 1; j < std::min((int)centers.size(), i + 1 + delta); j++) { - // to_merge.push_back(std::make_pair(centers[j].getCenter(), -1)); - // } - // Point* closest = trn.merge(centers[i].getCenter(), to_merge); - // if (closest != NULL) { - // #ifdef DEBUG - // cout << "Merged center " << centers[i]->get_header() << " and " << closest->get_header() << endl; - // #endif - // num_merge++; - // // auto& to_del = partition[centers[i]]; - // // auto& to_add = partition[closest]; - // // to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); - // // partition.erase(centers[i]); - // // centers[i]->set_to_delete(true); - // auto& to_del = 
partition[centers[i]]; - // auto& to_add = partition[closest]; - // to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); - // partition.erase(centers[i]); - // centers[i]->set_to_delete(true); - - // } } //cout << "Merged " << num_merge << " centers" << endl; centers.erase(std::remove_if(centers.begin(), centers.end(), [](const Center& p) { @@ -430,7 +403,7 @@ bool merge(vector > ¢ers, const Trainer& trn, int delta, int ba template void print_output(const string& output, vector > & partition) { - cout << "Printing output" << endl; +// cout << "Printing output" << endl; std::ofstream ofs; ofs.open(output, std::ofstream::out); int counter = 0; @@ -447,11 +420,6 @@ void print_output(const string& output, vector > & partition) break; } } - if (!cen_found) { - cout << "Center not found" << endl; - cout << "Cluster " << counter << " has center " << cen.getCenter()->get_header() << endl; - // cen.getCenter()->set(*cen.getPoints().at(0)); - } for (auto p : cen.getPoints()) { string s = p->get_header(); ofs << pt << "\t" << p->get_length() << "nt, " << s << "... 
"; @@ -588,7 +556,7 @@ size_t accumulate(Point** last_ptr, bvec &points, vector > ¢ Point* last = *last_ptr; vector*> current = {last}; bool is_min = false; - + //cout << "Accumulation: " << last->get_header() << " length: " << last->get_length() << endl; for (int num_iter=0; !is_min; num_iter++) { #ifdef DEBUG cout << num_iter << " last: " << last->get_header() << endl; @@ -626,82 +594,65 @@ size_t accumulate(Point** last_ptr, bvec &points, vector > ¢ } } else { // keep adding points, find new mean size_t prev_size = current.size(); + //cout << "Center: " << last->get_header() << " length: " << last->get_length() << endl; points.remove_available(bounds.first, bounds.second, current); + last = get_mean(current, *last, bandwidth); + size_t added_size = current.size() - prev_size; - #ifdef DEBUG - cout << "added new points (" << added_size << ")" << endl; - #endif - if (last == NULL) { - cerr << "Last is null" << endl; - throw 100; - } } } // cout << "Pushed back center " << last->get_header() << endl; Center cc(last, current); centers.push_back(cc); -// Center cen(last, current); -// centers.emplace_back(last, current); - // Point* center = last->clone(); - // centers.push_back(center); - // part[center] = current; - #ifdef DEBUG - for (auto p : current) { - cout << total_iter << " Cluster " << last->get_header() << ": " << p->get_header() << endl; - } - #endif - // if (points.empty()) { - // return true; - // } else { - // return false; - // } return current.size(); } +/* + * The main method of this file, which drives the mean shift accumulate and update steps. + * As seen, the first FOR loop calls accumulate() and progresses through the accumulation stage. + * The second FOR loop iterates through the update stage, calling mean_shift_update() and merge() + * to update each center. + * Finally, the output is printed through print_output. 
+ */ template void ClusterFactory::MS(bvec &points, T bandwidth, double sim, const Trainer& trn, string output, int iter, int delta) { vector > part; -// using partition = map*, vector*> >; -// partition part; - Point* last = points.pop(); - //cout << "First length: " << last->get_length() << endl; Progress pa(points.size(), "Accumulation"); for (int num = 0; last != NULL; num++) { size_t n = accumulate(&last, points, part, trn, sim, bandwidth, num); pa += n; } pa.end(); -// points.check(); - size_t total = 0; - for (auto cen : part) { - total += cen.getPoints().size(); - } - cout << "total size: " << total << endl; + cout << "Number of clusters before update: " << part.size() << endl; + Clock::stamp("accumulate"); + vector num_clusters; Progress pu(iter, "Update"); for (int i = 0; i < iter; i++) { - // #ifdef DEBUG - //print_output(output + to_string(i), part); - // #endif - //cout << "Mean shift iteration " << i << endl; + if (i >= 3 && part.size() == num_clusters[i-3]) { + break; + } #pragma omp parallel for for (int j = 0; j < part.size(); j++) { mean_shift_update(part, j, trn, delta); } merge(part, trn, delta, bandwidth); pu++; + num_clusters.push_back(part.size()); } #pragma omp parallel for - for (int j = 0; j < m_centers.size(); j++) { + for (int j = 0; j < part.size(); j++) { mean_shift_update(part, j, trn, 0); } pu.end(); print_output(output, part); + Clock::stamp("update"); + Clock::stamp("done"); } /* @@ -715,7 +666,6 @@ template std::vector*> ClusterFactory::build_points(vector fileList, std::function*(ChromosomeOneDigit *)> get_point) { std::vector*> points; - std::vector*> cpoints; unsigned fsize = fileList.size(); std::vector*> initial_centers; std::stringstream buffer; @@ -724,7 +674,7 @@ std::vector*> ClusterFactory::build_points(vector fileList, for (unsigned i = 0; i < fsize; i++) { p++; ChromListMaker *maker = new ChromListMaker(fileList.at(i)); - const std::vector * chromList = maker->makeChromOneDigitList(); + const std::vector * chromList = 
maker->makeChromOneDigitDnaList(); unsigned csize = chromList->size(); #pragma omp parallel for ordered for (unsigned h = 0; h < csize; h++) { @@ -745,111 +695,6 @@ std::vector*> ClusterFactory::build_points(vector fileList, delete maker; } return points; -// std::random_shuffle(points.begin(), points.end()); -// queue gaps; -// calculate_gaps(points, gaps, func); - // for (int i = 1; i < points.size(); i++) { - // int la = points[i]->get_length(); - // int lb = points[i-1]->get_length(); - // if (lb > la && 100.0 * la / lb < sim) { - // gaps.push(i); - // } - // } - - -// vector*>> p; -// vector*> tmp; -// tmp.push_back(points[0]); -// for (int j = 1; j < points.size(); j++) { - -// int la = points[j]->get_length(); -// int lb = points[j-1]->get_length(); -// assert(lb >= la); -// if (lb > la && 100.0 * la / lb < sim) { -// p.push_back(tmp); -// cout << "Gap " << tmp.size() << endl; -// tmp.clear(); -// } -// tmp.push_back(points[j]); -// } -// if (!tmp.empty()) { -// p.push_back(tmp); -// } - -// // calculate_distances(points); -// int idx = 0; -// for (auto &c : p) { -// sort_nn_func(c, func); -// for (auto v : c) { -// v->set_id(idx++); -// cpoints.push_back(v); -// } -// } - - // sort_nn_func(points, - // [&](const Point&a, const Point&b) { - // int la = a.get_length(); - // int lb = b.get_length(); - // return lb > la && 100.0 * la / lb < sim; - // }, - // [](const Point& a, const Point& b) { - // return a.distance_k1(b); - // }); - - - // // for(auto p : points){ - // // cout << p->get_header() << endl; - // // } - - - - // sort_nn_func(points, - // [&](const Point& a, const Point& b) { - // int la = a.get_length(); - // int lb = b.get_length(); - // if (lb > la && 100.0 * la / lb < sim) { - // double mono = a.distance_k1(b) * 100; - // bool q = mono < sim; - // /* - // if (q) { - // cout << "TRUE" << endl; - // } else { - // cout << "FALSE"<< endl; - // } - // */ - // return q; - // } else { - // return false; - // } - // }, - // [](const Point& a, const 
Point& b) { - // return a.distance(b); - // }); - // uint64_t idx = 0; - // for (auto v : points) { - // v->set_id(idx++); - - // cpoints.push_back(v); - // } - // cout << "Points: " << cpoints.size() << endl; - - - // for (int i = 0; i < points.size(); i++) { - // cout << points[i]->get_header(); - // if (i > 0) { - // cout << " " << points[i]->distance(*points[i-1]); - // } - // cout << endl; - // } - - - - // for (int i = 0; i < points.size(); i++) { - // points[i]->set_id(i); - // cpoints.push_back(points[i]); - // assert(cpoints[i]->get_id() == i); - // } - return points; } @@ -998,21 +843,7 @@ T ClusterFactory::find_h(const std::vector*> ¢ers) const return divs[divs.size()/2]; } } -/* -template -std::vector *> ClusterFactory::get_centers(const std::vector *> &points) -{ - std::vector*> centers; - for (typename std::vector*>::const_iterator it = points.begin(); it != points.end(); ++it) { - Point *p = *it; - if (choose_center(*p)) { - centers.push_back(p->clone()); - } - } - return centers; -} -*/ #ifndef HEADER_HACK template class ClusterFactory; template class ClusterFactory; diff --git a/src/cluster/src/ClusterFactory.h b/src/cluster/ClusterFactory.h similarity index 94% rename from src/cluster/src/ClusterFactory.h rename to src/cluster/ClusterFactory.h index 12180c9..75a2309 100644 --- a/src/cluster/src/ClusterFactory.h +++ b/src/cluster/ClusterFactory.h @@ -13,9 +13,9 @@ #include #include #include -#include "../../nonltr/ChromosomeOneDigit.h" -#include "../../nonltr/KmerHashTable.h" -#include "Point.h" +#include "../nonltr/ChromosomeOneDigit.h" +#include "../nonltr/KmerHashTable.h" +#include "../clutil/Point.h" #include "Trainer.h" #include "bvec.h" @@ -31,7 +31,6 @@ class ClusterFactory { void MS(bvec &points, T bandwidth, double sim, const Trainer& trn, string output, int iter, int delta); private: vector lookup_table; - vector*> m_centers; const int num_per_partition; int k; //void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, 
std::vector& values); diff --git a/src/cluster/Makefile b/src/cluster/Makefile deleted file mode 100644 index 9186210..0000000 --- a/src/cluster/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -TARGET ?= meshclust2 -VERSION ?= 2.1.0 -CXX ?= g++ -ifeq ($(debug),yes) - CXXFLAGS += -ggdb -DDEBUG -fno-omit-frame-pointer -fopenmp -else - CXXFLAGS += -fopenmp -O3 -march=native -g -endif -CXXFLAGS += -std=c++11 -DVERSION=\"$(VERSION)\" -LDFLAGS += -lm - -SOURCES := $(shell find ./src -name '*.cpp') -OBJECTS = $(SOURCES:%.cpp=bin/%.o) -BIN_OBJECTS := $(shell find ../../bin/ -mindepth 2 -name '*.o') - -all: clean $(TARGET) - -$(TARGET): $(OBJECTS) $(BIN_OBJECTS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) - -bin/%.o: %.cpp - mkdir -p $(@D) - $(CXX) $(CXXFLAGS) -c $< -o $@ - -clean: - $(RM) $(OBJECTS) $(TARGET) - -install: $(TARGET) - cp $(TARGET) ~/bin - -.PHONY: all clean install diff --git a/src/cluster/Trainer.cpp b/src/cluster/Trainer.cpp new file mode 100644 index 0000000..923f410 --- /dev/null +++ b/src/cluster/Trainer.cpp @@ -0,0 +1,198 @@ +/* -*- C++ -*- */ +/* + * Trainer.cpp + * + * Author: Benjamin T James + */ +#include "Trainer.h" +#include "../predict/HandleSeq.h" +#include "../clutil/Datatype.h" +#include "../clutil/Loader.h" +#include "ClusterFactory.h" +#include +#include +#include +#include +#include "../predict/Predictor.h" +#include "../predict/GLM.h" +#include "../predict/Feature.h" +#include "../clutil/Progress.h" +#include "../clutil/Random.h" + +template +std::tuple*,double,size_t,size_t> Trainer::get_close(Point *p, bvec_iterator istart, bvec_iterator iend, bool &is_min_r) const +{ + int ncols = weights.getNumRow(); +#pragma omp declare reduction(pmax:std::tuple*,double,size_t,size_t>: \ + omp_out = get<1>(omp_in) > get<1>(omp_out) ? 
omp_in : omp_out ) \ + initializer (omp_priv=std::make_tuple((Point*)NULL,-1,0,0)) + + std::tuple*, + double, + size_t, + size_t> result = std::tuple*, double, size_t, size_t>(NULL, + -1, + 0, + 0); + bool has_found = false; + bool is_min = true; + uint64_t min_len = p->get_length() * cutoff; + uint64_t max_len = p->get_length() / cutoff; +#pragma omp parallel for reduction(pmax:result), reduction(&&:is_min) + for (bvec_iterator i = istart; i < iend; ++i) { + Point* pt = (*i).first; + + uint64_t len = pt->get_length(); + if (len < min_len || len > max_len) { + continue; + } + auto cache = feat->compute(*pt, *p); + double dist = (*feat)(0, cache); + double sum = classify(pt, p); + double res = round(sum) > 0; + // #pragma omp critical + // cout << "Result: " << sum << " raw_sigmoid: " << matrix::GLM::logistic(sum) << " classify_sum: " << Predictor::classify_sum(sum) << " final: " << res << endl; +// set second to true if result is not 1.0 +// which means it will be removed + result = (dist > std::get<1>(result)) ? std::make_tuple(pt, dist, i.r, i.c) : result; + is_min = is_min && (res != 1.0); +// has_found = has_found || (res != 1.0); + if (res == 1.0) { + *i = std::make_pair(pt, true); +// (*i).second = true; + } + } + +// is_min = !has_found; + is_min_r = is_min; +// return get<0>(result); + return result; + +} + +template +long Trainer::merge(vector > ¢ers, long current, long begin, long last) const +{ +#pragma omp declare reduction(ldpmax:std::pair: \ + omp_out = omp_in.second > omp_out.second ? 
omp_in : omp_out ) \ + initializer (omp_priv=std::make_pair(0, std::numeric_limits::min())) + std::pair best = std::make_pair(0, std::numeric_limits::min()); + Point* p = centers[current].getCenter(); + uint64_t cen_length = p->get_length(); + uint64_t min_length = cen_length * get_id(); + uint64_t max_length = cen_length / get_id(); +#pragma omp parallel for reduction(ldpmax:best) + for (long i = begin; i <= last; i++) { + double sum = weights.get(0, 0); + double dist = 0; + + Point* cen = centers[i].getCenter(); + uint64_t cen_len = cen->get_length(); + bool length_pass = cen_len >= min_length && cen_len <= max_length; + if (length_pass) { + auto cache = feat->compute(*cen, *p); + for (int col = 1; col < weights.getNumRow(); col++) { + double d = (*feat)(col-1, cache); + if (col == 1) { + dist = d; + } + sum += weights.get(col, 0) * d; + } + double res = round(Predictor::classify_sum(sum)); + + if (res == 1) { + best = best.second > dist ? best : std::make_pair(i, dist); + } + } + } + return best.first; +} + +template +double Trainer::classify(Point*a, Point*b) const +{ + double sum = weights.get(0, 0); + auto cache = feat->compute(*a, *b); + for (int col = 1; col < weights.getNumRow(); col++) { + sum += weights.get(col, 0) * (*feat)(col-1, cache); + } + return Predictor::classify_sum(sum); +} + +template +void Trainer::filter(Point *p, vector *, bool> > &vec) const +{ + uint64_t cen_length = p->get_length(); + uint64_t min_length = cen_length * get_id(); + uint64_t max_length = cen_length / get_id(); + for (auto& pt : vec) { + uint64_t pt_len = pt.first->get_length(); + bool length_pass = pt_len >= min_length && pt_len <= max_length; + pt.second = true; + if (length_pass) { + double sum = classify(p, pt.first); + double res = round(sum); + pt.second = (res == 0); + } + } + vec.erase(std::remove_if(vec.begin(), vec.end(), [](pair*, bool> p) { + return p.second; + }), vec.end()); +} + +template +Point* Trainer::closest(Point *p, vector *, bool> > &vec) const +{ + 
Point* best_pt = NULL; + double best_dist = 0; + for (auto& pt : vec) { + double sum = weights.get(0, 0); + double dist = pt.first->distance_d(*p); + if (best_pt == NULL || dist < best_dist) { + best_dist = dist; + best_pt = pt.first; + } + } + return best_pt; +} + +template +void Trainer::train(std::string dump_str) +{ + Predictor pred(dump_str); + delete feat; + auto pr = pred.get_class(); + feat = pr.first; + feat->set_save(false); + matrix::GLM glm = pr.second; + weights = glm.get_weights(); +} + +template +void Trainer::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, std::string dump_str, double acc_cutoff) +{ + std::cout << "Splitting data" << endl; + uintmax_t _id = points.size(); + Predictor pred(k, cutoff, PRED_MODE_CLASS, feat_type, + mut_type, min_n_feat, max_n_feat, min_id); + pred.train(points, _id, n_samples, n_templates); + delete feat; + auto pr = pred.get_class(); + feat = pr.first; + matrix::GLM glm = pr.second; + weights = glm.get_weights(); + + if (dump_str != "") { + pred.save(dump_str, Datatype::get()); + exit(0); + } else { + pred.save("weights.txt", Datatype::get()); + } +} + +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; diff --git a/src/cluster/Trainer.h b/src/cluster/Trainer.h new file mode 100644 index 0000000..b41031a --- /dev/null +++ b/src/cluster/Trainer.h @@ -0,0 +1,46 @@ +/* -*- C++ -*- */ +/* + * Trainer.h + * + * Author: Benjamin T James + */ + +#ifndef TRAINER_H +#define TRAINER_H + +#include "../clutil/Point.h" +#include "../predict/GLM.h" +#include "../predict/Feature.h" +#include "../predict/Predictor.h" +#include "bvec.h" +#include "Center.h" +#include + +template +class Trainer { +public: + Trainer(std::vector*> v, size_t num_points, size_t largest_count, double cutoff_, size_t max_pts_from_one_, int ksize) : points(v), n_samples(num_points), cutoff(cutoff_), 
n_templates(max_pts_from_one_), k(ksize) { + uintmax_t size = 1000 * 1000 * 10; + feat = new Feature(k); + }; + ~Trainer() { delete feat; } + void train(std::string); + void train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, std::string dump_str, double acc_cutoff=97.5); + + std::tuple*,double,size_t,size_t> get_close(Point*, bvec_iterator istart, bvec_iterator iend, bool& is_min) const; + + void filter(Point*, vector*,bool> >&) const; + Point* closest(Point*, vector*,bool> >&) const; + long merge(vector > ¢ers, long current, long begin, long end) const; + + double get_id() const { return cutoff > 1 ? cutoff / 100.0 : cutoff; } +private: + double classify(Point*, Point*) const; + matrix::Matrix weights; + Feature *feat; + std::vector*> points; + size_t n_samples, n_templates; + double cutoff; + int k; +}; +#endif diff --git a/src/cluster/src/bvec.cpp b/src/cluster/bvec.cpp similarity index 70% rename from src/cluster/src/bvec.cpp rename to src/cluster/bvec.cpp index 2efed1e..8bd35d6 100644 --- a/src/cluster/src/bvec.cpp +++ b/src/cluster/bvec.cpp @@ -123,15 +123,13 @@ template bool bvec::index_of(uint64_t point, size_t* pfront, size_t* pback) const { size_t low = begin_bounds.size()-1, high = 0; + size_t prev = 0; + size_t prev_index = 0; - for (size_t i = 0; i < begin_bounds.size(); i++) { - size_t prev = 0; - size_t prev_index = 0; - if (i > 0) { - prev_index = i - 1; - prev = begin_bounds[i-1]; - } - if (point >= prev && point <= begin_bounds[i]) { + for (size_t i = 1; i < begin_bounds.size(); i++) { + prev_index = i - 1; + prev = begin_bounds[i-1]; + if (point >= prev && point < begin_bounds[i]) { low = std::min(low, prev_index); high = std::max(high, prev_index); } @@ -174,6 +172,15 @@ void bvec::insert(Point *p) } auto mid_min = min_sizes[min_sizes.size() / 2]; data.at(mid_min).push_back(std::make_pair(p, false)); + if (begin_bounds.at(mid_min) > len) { + cerr << "Begin Insertion of " << len << " should not be in bin " << 
begin_bounds.at(mid_min) << endl; + throw std::exception(); + } + if (mid_min < begin_bounds.size() - 1 && begin_bounds.at(mid_min+1) < len) { + cerr << "End Insertion of " << len << " should not be in bin " << begin_bounds.at(mid_min+1) << endl; + throw std::exception(); + } + } template @@ -214,6 +221,14 @@ void bvec::insert_finalize() for (size_t i = 0; i < data.size(); i++) { std::sort(std::begin(data[i]), std::end(data[i]), sorter); data[i].shrink_to_fit(); +/* if (data[i][0].first->get_length() < begin_bounds[i]) { + cerr << "Length " << data[i][0].first->get_length() << " should not be in bin " << begin_bounds[i] << endl; + throw std::exception(); + } + if (i < data.size()-1 && data[i][data[i].size()-1].first->get_length() > begin_bounds[i+1]) { + cerr << "Length " << data[i][0].first->get_length() << " should not be in bin " << begin_bounds[i] << " to " << begin_bounds[i+1] << endl; + throw std::exception(); + }*/ } } @@ -251,13 +266,24 @@ bvec::get_range(uint64_t begin_len, uint64_t end_len) const front.first = 0; front.second = 0; back.first = data.size()-1; - back.second = data[back.first].size() - 1; + back.second = data[back.first].size() - 1; + + // Determination of the outer indices if (!index_of(begin_len, &front.first, NULL)) { throw 100; } if (!index_of(end_len, NULL, &back.first)) { throw 100; } + // if (begin_len < begin_bounds.at(front.first)) { + // cerr << "Low index is not accurate" << endl; + // throw std::exception(); + // } + // if (front.first > back.first) { + // cerr << "Front index is greater than back index" << endl; + // throw std::exception(); + // } + // Determination of the inner indices if (!inner_index_of(begin_len, front.first, &front.second, NULL)) { throw 100; } @@ -274,6 +300,32 @@ bvec::get_range(uint64_t begin_len, uint64_t end_len) const // } else { // throw 101; // } + + if (back.first == (uint64_t)-1 || back.second == (uint64_t)-1) { + back.is_empty = true; + } +// for (uint64_t i = front.first; i <= back.first; i++) { 
+// uint64_t j = 0; +// uint64_t end = data.at(i).size(); +// if (i == front.first) { +// j = front.second; +// } +// if (i == back.first) { +// end = min(back.second, end); +// } +// for (; j < end; j++) { +// uint64_t len = data.at(i).at(j).first->get_length(); +// if (len < begin_len || len > end_len) { +// // cerr << "Warning: Length in BVec " << len << " is not in [" << begin_len << ", " << end_len << "]. The classifier will not select these points." << endl; +// // if (i == front.first) { +// // cerr << "Front Bounds of selected bin: " << j << " -> " << data.at(i).at(j).first->get_length(); +// // } +// // if (i == back.first) { +// // cerr << "End Bounds of selected bin: " << end-1 << " -> " << data.at(i).at(end-1).first->get_length(); +// // } +// } +// } +// } return std::make_pair(front, back); } @@ -291,6 +343,9 @@ void bvec::remove_available(bvec_idx_t begin, bvec_idx_t end, std::vector d) { return d.second; }; auto inserter = [&](const std::pair*,bool> p) { @@ -299,6 +354,14 @@ void bvec::remove_available(bvec_idx_t begin, bvec_idx_t end, std::vectorget_length(); + // uint64_t end_len; + // if (end.second == data.at(b).size()) { + // end_len = data.at(b).at(end.second-1).first->get_length(); + // } else { + // end_len = data.at(b).at(end.second).first->get_length(); + // } + // cout << "Boundary: " << begin_len << " -> " << end_len; #pragma omp parallel for for (size_t i = a; i <= b; i++) { /* move marked points to end of vector, then copy, then erase */ @@ -308,6 +371,10 @@ void bvec::remove_available(bvec_idx_t begin, bvec_idx_t end, std::vectorget_header() << " length: " << kv.first->get_length() << endl; + // if (kv.first->get_length() > end_len || kv.first->get_length() < begin_len) { + // cerr << "Error in bvec" << endl; + // } available.push_back(kv.first); } } diff --git a/src/cluster/src/bvec.h b/src/cluster/bvec.h similarity index 96% rename from src/cluster/src/bvec.h rename to src/cluster/bvec.h index 43384e9..0c1b98a 100644 --- 
a/src/cluster/src/bvec.h +++ b/src/cluster/bvec.h @@ -7,11 +7,12 @@ #ifndef BVEC_H #define BVEC_H -#include "Point.h" +#include "../clutil/Point.h" #include "bvec_iterator.h" typedef struct bvec_idx { size_t first, second; + bool is_empty = false; } bvec_idx_t; /* diff --git a/src/cluster/src/bvec_iterator.cpp b/src/cluster/bvec_iterator.cpp similarity index 88% rename from src/cluster/src/bvec_iterator.cpp rename to src/cluster/bvec_iterator.cpp index f8d1c76..e97e1a1 100644 --- a/src/cluster/src/bvec_iterator.cpp +++ b/src/cluster/bvec_iterator.cpp @@ -1,3 +1,9 @@ +/* -*- C++ -*- */ +/* + * bvec_iterator.cpp + * + * Author: Benjamin T James + */ #include "bvec_iterator.h" template diff --git a/src/cluster/src/bvec_iterator.h b/src/cluster/bvec_iterator.h similarity index 100% rename from src/cluster/src/bvec_iterator.h rename to src/cluster/bvec_iterator.h diff --git a/src/cluster/src/main.cpp b/src/cluster/meshclust2.cpp similarity index 87% rename from src/cluster/src/main.cpp rename to src/cluster/meshclust2.cpp index 562fd96..81340b7 100644 --- a/src/cluster/src/main.cpp +++ b/src/cluster/meshclust2.cpp @@ -4,7 +4,7 @@ * * Author: Benjamin T James */ -#include "Runner.h" +#include "CRunner.h" int main(int argc, char **argv) { Runner runner(argc, argv); diff --git a/src/cluster/src/Loader.cpp b/src/cluster/src/Loader.cpp deleted file mode 100644 index 73691b6..0000000 --- a/src/cluster/src/Loader.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* -*- C++ -*- - * - * Loader.cpp - * - * Author: Benjamin T James - * - * Class which can 'preload' chunks of sequences from a file list, - * and then count the k-mers separately, which can be done in - * multiple threads - */ -#include "Loader.h" -#include "ClusterFactory.h" -#include "DivergencePoint.h" -#include - -template -bool Loader::done() const -{ - return file_idx == files.size(); -} - -template -void Loader::preload(int tid) -{ - if (file_idx == files.size()) { - return; - } - for (uint64_t j = 0; j < chunk_size; j++) { 
- auto chrom = next(); - if (chrom.first == "") { - return; - } - cache_list.at(tid).emplace_back(chrom.first, chrom.second); - } -} - - -template -Point* Loader::get_point(std::string header, const std::string &base, uintmax_t& id, int k) -{ - KmerHashTable table(k, 1); - KmerHashTable table_k1(1, 0); - std::vector values; - vector values_k1; - values.clear(); - ChromosomeOneDigit chrom; - chrom.setHeader(header); - chrom.appendToSequence(base); - chrom.finalize(); - fill_table(table, &chrom, values); - fill_table(table_k1, &chrom, values_k1); -// int tmplate = get_template(chrom->getHeader(), templates); - Point *p = new DivergencePoint(values, chrom.size()); -// cout << "mag: " << ((DivergencePoint*)p)->getPseudoMagnitude() << std::endl; - p->set_1mers(values_k1); - p->set_header(header); - p->set_length(chrom.getBase()->length()); - p->set_data_str(*chrom.getBase()); - DivergencePoint* q = dynamic_cast*>(p); - const auto N = q->points.size(); - double aq = (double) q->getPseudoMagnitude() / N; - double sq = 0; - for (auto i = 0; i < N; i++) { - double qdiff = q->points[i] - aq; - sq += qdiff * qdiff; - } - sq = sqrt(sq / N); - q->set_stddev(sq); - p->set_id(id); - #pragma omp atomic - id++; - return p; -} - -template -std::vector*> Loader::load_next(int tid) -{ - std::vector*> points; - for (size_t i = 0; i < cache_list.at(tid).size(); i++) { - auto pr = cache_list.at(tid).at(i); - Point* p = get_point(pr.first, *pr.second, id_list.at(tid), k); - points.push_back(p); - delete pr.second; - } - cache_list.at(tid).clear(); - return points; -} - -template -std::pair Loader::next() -{ - auto n = maker->next(); - if (n.first != "") { - return n; - } - delete maker; - maker = NULL; - file_idx++; - if (file_idx >= files.size()) { - return n; - } - maker = new SingleFileLoader(files.at(file_idx)); - return maker->next(); -} - -template class Loader; -template class Loader; -template class Loader; -template class Loader; -template class Loader; -template class Loader; 
diff --git a/src/cluster/src/LogTable.cpp b/src/cluster/src/LogTable.cpp deleted file mode 100644 index 0a05a9d..0000000 --- a/src/cluster/src/LogTable.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include "LogTable.h" - -#include -#include - -LogTable::LogTable() : coeff(1000000 / 2) -{ - uintmax_t size = 1000000; - double imax = 2; -// map = new double[size]; - double lsize = log(size); - for (uintmax_t i = 0; i < size; i++) { - map[i] = log(imax * (i + 1)) - lsize; - } - std::cout << "dmax: " << coeff << std::endl; -} -LogTable::LogTable(uintmax_t size, double imax) : coeff(size / imax) -{ - //map = new double[size]; - double lsize = log(size); - for (uintmax_t i = 0; i < size; i++) { - map[i] = log(imax * (i + 1)) - lsize; - } - std::cout << "dmax: " << coeff << std::endl; -} - -LogTable::~LogTable() -{ - //delete[] map; -} - -double LogTable::at(double d) const -{ - size_t idx = d * coeff; - return map[idx]; -} -double LogTable::operator[](double d) const -{ - size_t index = d * coeff; - return map[index]; -} diff --git a/src/cluster/src/LogTable.h b/src/cluster/src/LogTable.h deleted file mode 100644 index 6fab42e..0000000 --- a/src/cluster/src/LogTable.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef LOGTABLE_H -#define LOGTABLE_H - -#include -#include - -#define TBLSIZE 1000000 -class LogTable { -public: - LogTable(); - LogTable(uintmax_t _size, double imax=2); - ~LogTable(); - double at(double d) const; - double operator[](double d) const; -private: - double map[TBLSIZE]; - - const double coeff; -}; -#endif diff --git a/src/cluster/src/Mat.h b/src/cluster/src/Mat.h deleted file mode 100644 index eb711ed..0000000 --- a/src/cluster/src/Mat.h +++ /dev/null @@ -1,73 +0,0 @@ -/* -*- C++ -*- - * - * Mat.h - * - * Author: Benjamin T James - */ -#ifndef MAT_H -#define MAT_H -#include -#include -using namespace std; -template -class Mat { -public: - Mat(function func, const long size) : n(size), table_size(size*(size+1)/2), compute(func) { - if (size <= 0) { - throw "Invalid 
size"; - } - table = new T[table_size]; - set = new bool[table_size](); - }; - ~Mat() { - delete[] table; - delete[] set; - }; - void fill() { - unsigned long long count = 0; - #ifdef OPENMP - #pragma omp parallel for collapse(2) shared(set) - #endif - for (long i = 0; i < n; i++) { - for (long j = 0; j < n; j++) { - const auto idx = addr(i, j); - if (!set[idx]) { - auto res = compute(i, j); - table[idx] = res; - set[idx] = true; - count++; - } - if (count % 10000 == 0) { - cout << count << " / " << table_size << endl; - } - } - } - - }; - T& operator[](pair index) { - const unsigned long idx = addr(index.first, index.second); - if (!set[idx]) { - table[idx] = compute(index.first, index.second); - set[idx] = true; - } - return table[idx]; - }; - bool exists(int i, int j) const { - return set[addr(i, j)]; - } -private: - T* table; - bool* set; - const unsigned long table_size; - const unsigned long n; - function compute; - - unsigned long addr(unsigned long i, unsigned long j) const { - if (i <= j) { - return i * n - (i - 1) * i / 2 + j - i; - } else { - return j * n - (j - 1) * j / 2 + i - j; - } - }; -}; -#endif diff --git a/src/cluster/src/NearestNeighbor.h b/src/cluster/src/NearestNeighbor.h deleted file mode 100644 index a59b87b..0000000 --- a/src/cluster/src/NearestNeighbor.h +++ /dev/null @@ -1,52 +0,0 @@ -/* -*- C++ -*- - * - * NearestNeighbor.h - * - * Author: Benjamin T James - */ -#ifndef NEARESTNEIGHBOR_H -#define NEARESTNEIGHBOR_H -// #include -// #include "Point.h" -// template -// class NearestNeighbor { -// public: -// NearestNeighbor(const vector*> &pts) : points(pts) { -// const int dim = pts[0]->get_data().size(); -// const int maxPts = pts.size(); -// dataPts = annAllocPts(maxPts, dim); -// queryPt = annAllocPt(dim); -// for (int nPts = 0; nPts < maxPts; nPts++) { -// auto vec = pts[nPts]->get_data(); -// for (int i = 0; i < vec.size(); i++) { -// dataPts[nPts][i] = vec[i]; -// } -// } -// kd_tree = new ANNkd_tree(dataPts, maxPts, dim); -// nnIdx 
= new ANNidx[1]; -// dists = new ANNdist[1]; -// }; -// ~NearestNeighbor() { -// delete[] nnIdx; -// delete[] dists; -// delete kd_tree; -// annClose(); -// }; -// void find_nearest_neighbor(Point ¢er) const { -// auto vec = center.get_data(); -// for (int i = 0; i < vec.size(); i++) { -// queryPt[i] = vec[i]; -// } -// kd_tree->annkSearch(queryPt, 1, nnIdx, dists); -// ANNidx idx = nnIdx[0]; -// center.set(*points[idx]); -// }; -// private: -// ANNkd_tree *kd_tree = NULL; -// ANNpointArray dataPts; -// ANNpoint queryPt; -// ANNidxArray nnIdx; -// ANNdistArray dists; -// const vector*> &points; -// }; -#endif diff --git a/src/cluster/src/Predictor.cpp b/src/cluster/src/Predictor.cpp deleted file mode 100644 index 3f19ba9..0000000 --- a/src/cluster/src/Predictor.cpp +++ /dev/null @@ -1,841 +0,0 @@ -/* -*- C++ -*- - * - * Predictor.cpp - * - * Author: Benjamin T James - * - * Predictor implementation class - * train(vector<>...) is entry point, generates "semi-synthetic" sequences - * train() actually trains applicable GLM's. 
- * close() and similarity() are callable once trained - */ -#include "Predictor.h" -#include "Loader.h" -#include "Matrix.h" -#include "ClusterFactory.h" -#include "HandleSeq.h" -#include "Progress.h" -#include "Random.h" -#include - -template -void Predictor::save(std::string file) -{ - std::ofstream out(file); - out << "k: " << k << endl; - out << "mode: " << (unsigned int)mode << endl; - out << "max_features: " << max_num_feat << endl; - out << "ID: " << id << endl; - if (mode & PRED_MODE_CLASS) { - write_to(out, feat_c, c_glm); - } - if (mode & PRED_MODE_REGR) { - write_to(out, feat_r, r_glm); - } -} - -template -Predictor::Predictor(const std::string filename) -{ - std::ifstream in(filename); - std::string buf; - unsigned mode_ = 0; - in >> buf >> k; - cout << buf << k << endl; - in >> buf >> mode_; - mode = mode_; - cout << buf << mode << endl; - in >> buf >> max_num_feat; - cout << buf << max_num_feat << endl; - in >> buf >> id; - cout << buf << id << endl; - is_trained = true; - is_training = false; - if (mode & PRED_MODE_CLASS) { - auto pr = read_from(in, k); - c_glm = pr.first; - feat_c = pr.second; - } - if (mode & PRED_MODE_REGR) { - auto pr = read_from(in, k); - r_glm = pr.first; - feat_r = pr.second; - } -} - -template -void Predictor::write_to(std::ofstream &out, Feature* feat, matrix::GLM glm) -{ - auto combos = feat->get_combos(); - auto lookup = feat->get_lookup(); - auto mins = feat->get_mins(); - auto maxs = feat->get_maxs(); - out << std::endl << "n_combos: " << combos.size() << std::endl; - out << glm.get_weights().get(0, 0) << endl; - for (int j = 0; j < combos.size(); j++) { - auto cmb = combos[j]; - unsigned int val = 0; - uint64_t flags = 0; - for (auto i : cmb.second) { - flags |= lookup[i]; - } - switch (cmb.first) { - case Combo::xy: - val = 0; - break; - case Combo::xy2: - val = 1; - break; - case Combo::x2y: - val = 2; - break; - case Combo::x2y2: - val = 3; - break; - } - out << val << " "; - out << flags << " "; - out << 
glm.get_weights().get(j+1, 0) << std::endl; - } - out << std::endl << "n_singles: " << lookup.size() << std::endl; - for (int j = 0; j < lookup.size(); j++) { - out << lookup[j] << " "; - out << mins[j] << " "; - out << maxs[j] << std::endl; - } -} - - -template -pair*> Predictor::read_from(std::ifstream& in, int k_) -{ - matrix::GLM glm; - int c_num_raw_feat, c_num_combos; - Feature *feat = new Feature(k_); - std::string buf; - in >> buf >> c_num_combos; - cout << buf << "\"" << c_num_combos << "\"" << endl; - matrix::Matrix weights(c_num_combos+1, 1); - double d_; - in >> d_; - weights.set(0, 0, d_); - for (int i = 0; i < c_num_combos; i++) { - int cmb; - in >> cmb; - cout << (int)cmb << endl; - uint64_t flags; - in >> flags; - cout << flags << endl; - double d; - in >> d; - cout << "[" << 0 << "," << i << "] " << d << endl; - weights.set(i+1, 0, d);//push_back(d); - Combo cmb_ = Combo::xy; - switch (cmb) { - case 0: - cmb_ = Combo::xy; - break; - case 1: - cmb_ = Combo::xy2; - break; - case 2: - cmb_ = Combo::x2y; - break; - case 3: - cmb_ = Combo::x2y2; - break; - default: - cerr << "error reading weights file" << endl; - break; - } - feat->add_feature(flags, cmb_); - } - - in >> buf >> c_num_raw_feat; - cout << buf << "\"" << c_num_raw_feat << "\"" << endl; - for (int i = 0; i < c_num_raw_feat; i++) { - uint64_t single_flag; - double min_, max_; - in >> single_flag; - cout << single_flag << endl; - in >> min_; - cout << min_ << endl; - in >> max_; - cout << max_ << endl; - feat->set_normal(single_flag, min_, max_); - } - feat->finalize(); - glm.load(weights); - return {glm, feat}; -} - -template -void Predictor::add_feats(std::vector >& vec, uint64_t feat_flags) -{ - for (uint64_t i = 1; i <= feat_flags; i *= 2) { - if ((i & feat_flags) == 0) { - continue; - } - for (uint64_t j = 1; j <= i; j *= 2) { - if ((j & feat_flags) == 0) { - continue; - } - vec.emplace_back(i | j, Combo::xy); - vec.emplace_back(i | j, Combo::x2y2); - if (i != j) { - vec.emplace_back(i 
| j, Combo::x2y); - vec.emplace_back(i | j, Combo::xy2); - } - } - } -} -template -void Predictor::check() -{ - // if (!is_trained && training.size() >= threshold && !is_training) { - // omp_set_lock(&lock); - // is_training = true; - // train(); - // is_training = false; - // omp_unset_lock(&lock); - // } -} -template -double Predictor::similarity(Point* a, Point* b) -{ - if (!is_trained) { -// double d = Selector::align(a, b); - cerr << "alignment: we don't do that here" << endl; - throw "Bad"; - // return d; - // if (!is_training) { - // omp_set_lock(&lock); - // if (training.size() < testing.size() && training.size() < threshold) { - // training.push_back(pra(a, b, d)); - // } else if (training.size() >= testing.size() && testing.size() < threshold) { - // testing.push_back(pra(a, b, d)); - // } - // omp_unset_lock(&lock); - // } - return 0; - - } else { - return predict(a, b); - } -} - -template -bool Predictor::close(Point *a, Point *b) -{ - if (!is_trained) { -// double d = Selector::align(a, b); - cerr << "alignment shouldn't be used here" << endl; - throw "bad"; - // if (!is_training) { - // omp_set_lock(&lock); - // if (training.size() < testing.size() && training.size() < threshold) { - // training.push_back(pra(a, b, d)); - // } else if (training.size() >= testing.size() && testing.size() < threshold) { - // testing.push_back(pra(a, b, d)); - // } - // omp_unset_lock(&lock); - // } -// return d > id; - return false; - } - bool val = p_close(a, b); - if ((mode & PRED_MODE_REGR) && val) { - // val = p_predict(a, b) > id; - // if (!val) { - // cout << "FIXED" << endl; - // } - } - return val; -} - -template -double Predictor::p_predict(Point* a, Point* b) -{ - auto cache = feat_r->compute(*a, *b); - auto weights = r_glm.get_weights(); - double sum = weights.get(0, 0); - for (int col = 0; col < feat_r->size(); col++) { - double val = (*feat_r)(col, cache); - sum += weights.get(col+1, 0) * val; - } - if (sum < 0) { - sum = 0; - } else if (sum > 1) { - sum = 
1; - } - return sum; -} -template -double Predictor::predict(Point* a, Point* b) -{ - if ((mode & PRED_MODE_CLASS) && !p_close(a, b)) { - return 0; - } - return p_predict(a, b); -} - -template -bool Predictor::p_close(Point* a, Point* b) -{ - auto weights = c_glm.get_weights(); - double sum = weights.get(0, 0); - auto cache = feat_c->compute(*a, *b); - for (int col = 1; col < weights.getNumRow(); col++) { - double d = (*feat_c)(col-1, cache); - sum += weights.get(col, 0) * d; - } - return sum > 0.0; -} - - -template -std::pair generate_feat_mat(const vector > &data, Feature& feat, double cutoff) -{ - bool classify = (cutoff >= 0); - int nrows = data.size(); - int ncols = feat.size()+1; - matrix::Matrix feat_mat(nrows, ncols); - matrix::Matrix labels(nrows, 1); - #pragma omp parallel for - for (int row = 0; row < data.size(); row++) { - auto kv = data.at(row); - vector cache; - // #pragma omp critical - // { - cache = feat.compute(*kv.first, *kv.second); - // } - feat_mat.set(row, 0, 1); - if (classify) { - labels.set(row, 0, kv.val >= cutoff ? 
1 : -1); - } else { - labels.set(row, 0, kv.val); - } - for (int col = 1; col < ncols; col++) { - double val = feat(col-1, cache); - feat_mat.set(row, col, val); - } - } - return std::make_pair(feat_mat, labels); -} - -template -void Predictor::train(const vector *> &points, const vector* > &queries, uintmax_t &_id, size_t num_sample) -{ - if (is_trained) { return; } - - num_sample = min(num_sample, points.size()); - - vector*> f_points_tr, f_points_test; - size_t total_size = points.size();// + queries.size(); - for (int i = 0; i < num_sample; i++) { - int i1 = floor((double)i * total_size / (2 * num_sample)); - int i2 = floor((i + 1) * (double)total_size / (2 * num_sample)); - f_points_tr.push_back(points.at(i1)); - f_points_test.push_back(points.at(i2)); - } - // size_t q_sample = min(num_sample / 10, queries.size()); - // while (10 * f_points_tr.size() <= 11 * num_sample) { - // for (int i = 0; i < q_sample; i++) { - // int i1 = floor((double)i * queries.size() / (2 * q_sample)); - // int i2 = floor((i + 1) * (double)queries.size() / (2 * q_sample)); - // f_points_tr.push_back(queries.at(i1)); - // f_points_test.push_back(queries.at(i2)); - // } - // } - training.clear(); - testing.clear(); - if (mode & PRED_MODE_CLASS) { - - std::vector > pos_buf, neg_buf; - cout << "mutating sequences" << endl; - size_t counter = 0; - // struct timespec start, stop; - // clock_gettime(CLOCK_MONOTONIC, &start); - Progress prog1(f_points_tr.size(), "Generating training"); - #pragma omp parallel for - for (size_t i = 0; i < f_points_tr.size(); i++) { - auto p = f_points_tr[i]; - mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); - mutate_seqs(p, 5, pos_buf, neg_buf, min_id, 100 * id, _id); - #pragma omp critical - prog1++; - } - prog1.end(); - // clock_gettime(CLOCK_MONOTONIC, &stop); - // printf("took %lu\n", stop.tv_sec - start.tv_sec); - - counter = 0; - size_t buf_size = std::min(pos_buf.size(), neg_buf.size()); - cout << "training +: " << pos_buf.size() << endl; - 
cout << "training -: " << neg_buf.size() << endl; - std::random_shuffle(pos_buf.begin(), pos_buf.end()); - std::random_shuffle(neg_buf.begin(), neg_buf.end()); - for (size_t i = 0; i < buf_size; i++) { - training.push_back(pos_buf[i].deep_clone()); - training.push_back(neg_buf[i].deep_clone()); - } - for (auto p : pos_buf) { - delete p.first; - delete p.second; - } - for (auto p : neg_buf) { - delete p.first; - delete p.second; - } - pos_buf.clear(); - neg_buf.clear(); - Progress prog2(f_points_test.size(), "Generating testing"); - #pragma omp parallel for - for (size_t i = 0; i < f_points_test.size(); i++) { - auto p = f_points_test[i]; - mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); - mutate_seqs(p, 5, pos_buf, neg_buf, min_id, 100 * id, _id); -#pragma omp critical - prog2++; - } - prog2.end(); - buf_size = std::min(pos_buf.size(), neg_buf.size()); - cout << "testing +: " << pos_buf.size() << endl; - cout << "testing -: " << neg_buf.size() << endl; - std::random_shuffle(pos_buf.begin(), pos_buf.end()); - std::random_shuffle(neg_buf.begin(), neg_buf.end()); - for (size_t i = 0; i < buf_size; i++) { - testing.push_back(pos_buf[i].deep_clone()); - testing.push_back(neg_buf[i].deep_clone()); - } - for (auto p : pos_buf) { - delete p.first; - delete p.second; - } - for (auto p : neg_buf) { - delete p.first; - delete p.second; - } - } else { - for (auto p : f_points_tr) { - mutate_seqs(p, 10, training, training, min_id, 100, _id); - } - for (auto p : f_points_test) { - mutate_seqs(p, 10, testing, testing, min_id, 100, _id); - } - } - - - train(); -} -template -std::pair regression_train(const vector > &data, Feature& feat) -{ - auto pr = generate_feat_mat(data, feat, -1); - matrix::GLM glm; - glm.train(pr.first, pr.second); - auto result1 = pr.first * glm.get_weights(); - auto diff1 = result1 - pr.second; - double sum = 0; - for (int i = 0; i < diff1.getNumRow(); i++) { - sum += fabs(diff1.get(i, 0)); - } - sum /= diff1.getNumRow(); - return {sum, glm}; -} - 
-template -std::pair class_train(vector > &data, Feature& feat, double cutoff) -{ - // vector > above, below; - - // for (auto d : data) { - // if (d.val > cutoff) { - // above.push_back(d); - // } else { - // below.push_back(d); - // } - // } - // size_t sz = std::min(above.size(), below.size()); - // data.clear(); - // for (size_t i = 0; i < sz; i++) { - // data.push_back(above[i]); - // data.push_back(below[i]); - // } - auto pr = generate_feat_mat(data, feat, cutoff); - matrix::GLM glm; - glm.train(pr.first, pr.second); - matrix::Matrix p = glm.predict(pr.first); - for (int row = 0; row < p.getNumRow(); row++) { - if (p.get(row, 0) == 0) { - p.set(row, 0, -1); - } - } - double acc = get<0>(glm.accuracy(pr.second, p)); - return {acc, glm}; -} - -template -double regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm, std::string prefix="") -{ - auto pr = generate_feat_mat(data, feat, -1); - auto result1 = pr.first * glm.get_weights(); - auto diff1 = result1 - pr.second; - double sum = 0; - for (int i = 0; i < diff1.getNumRow(); i++) { - sum += fabs(diff1.get(i, 0)); - } - if (prefix != "") { - for (int row = 0; row < result1.getNumRow(); row++) { - cout << prefix << ";" << data[row].first->get_header() << ";" << data[row].second->get_header() << ";" << result1.get(row, 0) << ";" << pr.second.get(row, 0) << ";" << diff1.get(row, 0) << endl; - } - } - sum /= diff1.getNumRow(); - return sum; -} - -template -void print_wrong(matrix::Matrix oLabels, matrix::Matrix pLabels) -{ - for(int i = 0; i < oLabels.getNumRow(); i++){ - if(oLabels.get(i,0) == pLabels.get(i, 0)){ - cout << ""; - } - } -} - -template -tuple class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff, std::string prefix="") -{ - auto pr = generate_feat_mat(data, feat, cutoff); - matrix::Matrix p = glm.predict(pr.first); - for (int row = 0; row < p.getNumRow(); row++) { - if (p.get(row, 0) == 0) { - p.set(row, 0, -1); - } - if (prefix != "") { - cout << 
prefix << ";" << data[row].first->get_header() << ";" << data[row].second->get_header() << ";" << data[row].val << ";" << p.get(row, 0) << ";" << pr.second.get(row, 0) << endl; - } - } -// print_wrong(pr.second, p); - return glm.accuracy(pr.second, p); -} - -template -void Predictor::filter(std::vector > &vec, std::string prefix) -{ - std::vector > > bins; - std::vector limits; - size_t num_bins = 10; - size_t smallest_bin_size = vec.size(); - for (size_t i = 0; i < num_bins; i++) { - limits.push_back(id + i * (1 - id) / num_bins); - bins.push_back(std::vector >()); - } - limits.push_back(1); - for (auto p : vec) { - for (size_t i = 1; i < limits.size(); i++) { - if (p.val <= limits[i] && p.val > limits[i-1]) { - bins[i-1].push_back(p); - if (prefix != "") { - cout << prefix << " bin " << i - 1 << " " << p.val << endl; - } - break; - } - } - } - size_t bin_size = 0; - for (auto &v : bins) { - bin_size += v.size(); - // smallest_bin_size = std::min(smallest_bin_size, v.size()); - std::random_shuffle(v.begin(), v.end()); - } - smallest_bin_size = bin_size / bins.size(); - vec.clear(); - - for (auto &v : bins) { - for (size_t i = 0; i < std::min(v.size(), smallest_bin_size); i++) { - vec.push_back(v[i]); - } - } - cout << "new vector size: " << vec.size() << " divided into " << bins.size() << " equal parts" << endl; -} - -double rand_between(double mute, double rng, double low, double high) -{ - Random r; - double r_d = r.random(); - - double mn = std::max(mute - rng, low); - double mx = std::min(mute + rng, high); - return r_d * (mx - mn) + mn; -} - -template -void Predictor::mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id) -{ - HandleSeq h(mut_type); - ClusterFactory factory(k); - double inc = (id_end - id_begin) / num_seq; - std::string bin_seq = p->get_data_str(); - std::string seq; - for (auto c : bin_seq) { - switch (c) { - case 0: - seq += 'A'; - break; - case 1: - seq += 'C'; - break; - 
case 2: - seq += 'G'; - break; - case 3: - seq += 'T'; - break; - case 'N': - seq += 'C'; - break; - default: - cout << "Invalid character " << c << endl; - cout << "from sequence " << bin_seq << endl; - throw 3; - } - } - for (size_t i = 0; i < num_seq; i++) { - double iter_id = id_begin + inc * (i + 0.5); - double actual_id = rand_between(iter_id, inc, id_begin, id_end); - int mut = round(100 - actual_id); - auto newseq = h.mutate(seq, mut); - std::string chrom; - std::string header = p->get_header(); - Point* new_pt = Loader::get_point(header, newseq.second, _id, k); - pra pr; - pr.first = p->clone(); - pr.second = new_pt; - pr.val = newseq.first; -#pragma omp critical - { - if (pr.val > id) { - pos_buf.push_back(pr); - } else { - neg_buf.push_back(pr); - } - } - } -} -template -void Predictor::train() -{ - Feature feat(k); - feat.set_save(true); - - uint64_t max_feat = 0; - for (uint64_t i = 0; i < possible_feats.size(); i++) { - if (possible_feats.at(i).first > max_feat) { - max_feat |= possible_feats.at(i).first; - } - } - for (uint64_t i = 1; i <= max_feat; i *= 2) { - if (i & max_feat) { - feat.add_feature(i, Combo::xy); - } - } - feat.normalize(training); - feat.normalize(testing); - feat.finalize(); - - - - // cout << "Class Training:" << endl; - // for (auto p : training) { - // cout << p.val << " "; - // } - // cout << "Class Testing:" << endl; - // for (auto p : testing) { - // cout << p.val << " "; - // } - if (mode & PRED_MODE_CLASS) { - train_class(&feat); - if (mode & PRED_MODE_REGR) { - // vector*> f_points_tr, f_points_test; - // for (int i = 0; i < 10; i++) { - // f_points_tr.push_back(training[rand()%training.size()].first); - // f_points_test.push_back(training[rand()%training.size()].first); - // } - // training.clear(); - // testing.clear(); - // for (auto p : f_points_tr) { - // mutate_seqs(p, 50, training, 100 * id, 100); - // mutate_seqs(p, 50, training, 60, 100 * id); - // } - // for (auto p : f_points_test) { - // mutate_seqs(p, 50, 
testing, 100 * id, 100); - // mutate_seqs(p, 50, testing, 60, 100 * id); - // } - // filter(); - auto func = [&](pra pr) { - return pr.val <= id; - }; - training.erase(std::remove_if(training.begin(), training.end(), func), training.end()); - testing.erase(std::remove_if(testing.begin(), testing.end(), func), testing.end()); - filter(training);//, "training"); - filter(testing);//, "testing"); - - } - } - if (mode & PRED_MODE_REGR) { - train_regr(&feat); - } - cout << "Training size: " << training.size() << endl; - cout << "Testing size: " << testing.size() << endl; - // for (auto p : training) { - // cout << p.val << " "; - // } - cout << endl; - feat.set_save(false); - training.clear(); - testing.clear(); - possible_feats.clear(); - is_trained = true; -} - -template -void Predictor::train_class(Feature* feat) -{ - auto c_size = feat->get_combos().size(); - for (int i = 0; i < c_size; i++) { - feat->remove_feature(); - } - vector used_list; - double abs_best_acc = 0; -// cout << "possible feats at one step: " << possible_feats.size() << endl; - Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); - - std::ostringstream oss; - for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { - double best_class_acc = abs_best_acc; - uintmax_t best_idx = -1, cur_idx = 1; - auto best_class_feat = possible_feats.front(); - for (uint64_t i = 0; i < possible_feats.size(); i++) { - if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { - continue; - } - auto rfeat = possible_feats[i]; - feat->add_feature(rfeat.first, rfeat.second); - feat->normalize(training); - feat->finalize(); - auto name = feat->feat_names().back(); - auto pr = class_train(training, *feat, id); - auto class_ac = class_test(testing, *feat, pr.second, id); - feat->remove_feature(); - prog++; -// cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " acc: " << 
get<0>(class_ac) << " sens: " << get<1>(class_ac) << " spec: " << get<2>(class_ac) << endl; - if (get<0>(class_ac) > best_class_acc) { - best_class_acc = get<0>(class_ac); - best_class_feat = rfeat; - best_idx = i; - } - } - if (best_class_acc > abs_best_acc || num_feat <= min_num_feat) { - feat->add_feature(best_class_feat.first, best_class_feat.second); - feat->normalize(training); - feat->finalize(); - abs_best_acc = best_class_acc; - used_list.push_back(best_idx); - oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl; - oss << "Accuracy: " << best_class_acc << endl; - possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end()); - } - } - prog.end(); - cout << oss.str(); - feat_c = new Feature(*feat); - feat_c->set_save(false); - auto pr = class_train(training, *feat_c, id); - cout << "Training ACC: " << pr.first << endl; - c_glm = pr.second; - auto train_results = class_test(training, *feat_c, c_glm, id);//, "train"); - cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl; - auto test_results = class_test(testing, *feat_c, c_glm, id);//, "test"); - double class_acc = get<0>(test_results); - cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl; - - cout << "Features: "<< endl; - for (auto line : feat_c->feat_names()) { - cout << "\t" << line << endl; - } -} -template -void Predictor::train_regr(Feature* feat) -{ - auto c_size = feat->get_combos().size(); - for (int i = 0; i < c_size; i++) { - feat->remove_feature(); - } - vector used_list; - double abs_best_regr = 1000000; - for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { - double best_regr_err = abs_best_regr; - uintmax_t best_idx = -1, cur_idx = 1; - auto best_regr_feat = possible_feats.front(); - for (uint64_t i = 0; i < possible_feats.size(); i++) { - if 
(std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { - continue; - } - auto rfeat = possible_feats[i]; - feat->add_feature(rfeat.first, rfeat.second); - feat->normalize(training); - feat->finalize(); - auto pr = regression_train(training, *feat); - auto name = feat->feat_names().back(); - double regr_mse = regression_test(testing, *feat, pr.second); - feat->remove_feature(); - - cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " err: " << regr_mse << endl; - if (regr_mse < best_regr_err) { - best_regr_err = regr_mse; - best_regr_feat = rfeat; - best_idx = i; - } - } - if (best_regr_err < abs_best_regr) { - feat->add_feature(best_regr_feat.first, best_regr_feat.second); - feat->normalize(training); - feat->finalize(); - abs_best_regr = best_regr_err; - used_list.push_back(best_idx); - //possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_regr_feat), possible_feats.end()); - } - } - feat_r = new Feature(*feat); - feat_r->set_save(false); - auto pr = regression_train(training, *feat_r); - r_glm = pr.second; - double tr_regr_mse = regression_test(testing, *feat_r, r_glm); // "training" - cout << "Training Mean Error: " << pr.first << endl; - double regr_mse = regression_test(testing, *feat_r, r_glm);//, "testing"); - cout << "Testing Mean Error: " << regr_mse << endl; - cout << "Features: "<< endl; - for (auto line : feat_r->feat_names()) { - cout << "\t" << line << endl; - } - // auto w = r_glm.get_weights(); - // for (int r = 0; r < w.getNumRow(); r++) { - // for (int c = 0; c < w.getNumCol(); c++) { - // cout << w.get(r, c) << " "; - // } - // cout << endl; - // } - // for (auto combo : feat.get_combos()) { - // cout << combo.first << " " << - // } - -} - -template class Predictor; -template class Predictor; -template class Predictor; -template class Predictor; -template class Predictor; -template class Predictor; diff 
--git a/src/cluster/src/Progress.cpp b/src/cluster/src/Progress.cpp deleted file mode 100644 index e16ef06..0000000 --- a/src/cluster/src/Progress.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "Progress.h" -#include -#include - -Progress::Progress(long num, std::string prefix_) -{ - pmax = num; - ended = 0; - pcur = 0; - prefix = prefix_; - last = ""; - barWidth = 70 - (prefix.size()+1); - print(); -} - -void Progress::print() -{ - std::ostringstream oss; - double prog = (double)pcur / pmax; - oss << prefix << " ["; - int pos = barWidth * prog; - for (int i = 0; i < barWidth; i++) { - if (i < pos) { - oss << "="; - } else if (i == pos) { - oss << ">"; - } else { - oss << " "; - } - } - oss << "] " << int(prog * 100.0) << " %\r"; - if (oss.str() != last) { - last = oss.str(); - std::cout << last; - std::cout.flush(); - } -} - -void Progress::end() -{ - if (!ended) { - pcur = pmax; - print(); - std::cout << std::endl; - } - ended = true; -} - -void Progress::operator++() -{ - pcur++; - print(); -} -void Progress::operator++(int) -{ - print(); - pcur++; -} - - -void Progress::operator+=(size_t num) -{ - pcur += num; - print(); -} diff --git a/src/cluster/src/Random.h b/src/cluster/src/Random.h deleted file mode 100644 index 3131b34..0000000 --- a/src/cluster/src/Random.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef RANDOM_H // -*- C++ -*- -#define RANDOM_H -#include - -class Random { - std::mt19937 rng; -public: - Random() : rng(std::random_device()()) {} - - template - T randMod(T max) { - std::uniform_int_distribution distribution(0, max-1); - return distribution(rng); - } - - double random() { - std::uniform_real_distribution distribution(0.0, 1.0); - return distribution(rng); - } -}; - -#endif diff --git a/src/cluster/src/SingMute.cpp b/src/cluster/src/SingMute.cpp deleted file mode 100644 index 45f1610..0000000 --- a/src/cluster/src/SingMute.cpp +++ /dev/null @@ -1,116 +0,0 @@ -#include "SingMute.h" -#include -#include "Random.h" - - - - -void 
generate_unique_set(size_t cmd_size, std::set& ret, int num_elts, const std::set& bad_set_1, const std::set& bad_set_2, const std::vector &valid, Random& rng) -{ - while (ret.size() <= num_elts) { - long idx = rng.randMod(cmd_size); - if (valid[idx] && - ret.find(idx) == ret.end() && - bad_set_1.find(idx) == bad_set_1.end() && - bad_set_2.find(idx) == bad_set_2.end()) { - - ret.insert(idx); - } - } -} -char SingMute::randNucl() -{ - char character; - int value = rng.randMod(percAs + percCs + percGs + percTs); - if (value < percAs) { - character = 'A'; - } else if (value < percAs + percCs) { - character = 'C'; - } else if (value < percAs + percCs + percGs) { - character = 'G'; - } else { - character = 'T'; - } - return character; -} -void SingMute::init(const std::vector &valid) -{ - maxInsert = 0; - maxDel = 0; - maxSwitch = 0; - if (num_mut == 1) { - maxInsert = 1; - maxDel = 0; - maxSwitch = 0; - } else if (num_mut == 0) { - out_seq = *seq; - return; - } else { - maxSwitch = rng.randMod(num_mut); - num_mut -= maxSwitch; - - if (maxSwitch % 2 == 1 && num_mut >= 1) { - maxSwitch++; - num_mut--; - } else if (num_mut == 0) { - maxSwitch--; - num_mut++; - } - if (num_mut > 1) { - maxInsert = rng.randMod(num_mut); - num_mut -= maxInsert; - } else { - maxInsert = num_mut; - num_mut -= maxInsert; - } - maxDel = num_mut; - } - size_t seq_len = seq->length(); - - maxDel *= seq_len / 100.0; - maxInsert *= seq_len / 100.0; - maxSwitch *= seq_len / 100.0; - alignmentLength = maxInsert; - IBP = maxDel + maxSwitch; - - - std::vector command_str(seq_len, 'S'); - - std::set s_ins, s_del, s_switch; - generate_unique_set(command_str.size(), s_ins, maxInsert, s_del, s_switch, valid, rng); - generate_unique_set(command_str.size(), s_del, maxDel, s_ins, s_switch, valid, rng); - generate_unique_set(command_str.size(), s_switch, maxSwitch, s_ins, s_del, valid, rng); - for (auto idx : s_ins) { - command_str[idx] = 'I'; - } - for (auto idx : s_del) { - command_str[idx] = 'D'; - } - for 
(auto idx : s_switch) { - command_str[idx] = 'W'; - } - out_seq = ""; - out_seq.reserve(maxInsert + seq_len - maxDel + 1); - - for (long i = 0; i < seq_len; i++) { - auto cmd = command_str.at(i); - switch (cmd) { - case 'I': { - out_seq += randNucl(); - out_seq += seq->at(i); - break; - } - case 'S': { - out_seq += seq->at(i); - break; - } - case 'D': { - break; - } - case 'W': { - out_seq += randNucl(); - break; - } - } - } -} diff --git a/src/cluster/src/SingleFeature.cpp b/src/cluster/src/SingleFeature.cpp deleted file mode 100644 index bdc441c..0000000 --- a/src/cluster/src/SingleFeature.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include "SingleFeature.h" - -template -void SingleFeature::normalize(const vector*,Point*> > &pairs) -{ - for (auto p : pairs) { - double d; - if (rc.empty()) { - d = raw(p.first, p.second); - } else { - d = rraw(p.first, p.second, rc, rv); - } - if (!min_set || d < min) { - min = d; - min_set = true; - } - if (!max_set || d > max) { - max = d; - max_set = true; - } - } -} - -template -double SingleFeature::operator()(Point *a, Point *b) const -{ - double d; - if (rc.empty()) { - d = raw(a, b); - } else { - d = rraw(a, b, rc, rv); - } -// std::cout << "Raw: " << d << std::endl; - double f = (d - min) / (max - min); -// std::cout << "Normalized: " << f << std::endl; - f = std::min(1.0, std::max(0.0, f)); - if (is_sim) { - return f; - } else { - return 1.0 - f; - } -} - - -template class SingleFeature; -template class SingleFeature; -template class SingleFeature; -template class SingleFeature; -template class SingleFeature; -template class SingleFeature; diff --git a/src/cluster/src/SingleFeature.h b/src/cluster/src/SingleFeature.h deleted file mode 100644 index efa882c..0000000 --- a/src/cluster/src/SingleFeature.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef SINGLEFEATURE_H -#define SINGLEFEATURE_H - -#include "Point.h" -#include - -template -class SingleFeature { -public: - SingleFeature(std::function*, Point*)> f, bool is_sim_=true) - : raw(f), 
is_sim(is_sim_), min_set(false), max_set(false) {} - SingleFeature(std::function*, Point*, const vector&, const vector&)> f, vector rrv, vector rrc, bool is_sim_=true) - : rraw(f), is_sim(is_sim_), min_set(false), max_set(false), rv(rrv), rc(rrc) {} - void normalize(const vector*,Point*> > &pairs); - double operator()(Point*, Point*) const; - double min, max; -private: - std::function*, Point*)> raw; - std::function*, Point*, const vector&, const vector&)> rraw; - vector rv, rc; - const bool is_sim; - bool max_set, min_set; - -}; - -#endif diff --git a/src/cluster/src/SingleMute.cpp b/src/cluster/src/SingleMute.cpp deleted file mode 100644 index 1f435f7..0000000 --- a/src/cluster/src/SingleMute.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/** - * Author: Alex Baumgartner - * The Bioinformatics Toolsmith Laboratory, the University of Tulsa - * 5/15/2018 - * - * Purpose: - * The pupose of this module is to perform single mutations on sequences - */ - -#include "SingleMute.h" -#include - -int intRandMod_(int max) { - static thread_local std::mt19937 generator; - std::uniform_int_distribution distribution(0, max-1); - return distribution(generator); -} - -SingleMute::SingleMute(int a, int c, int g, int t, int alloc) { - percAs = a; - percCs = c; - percGs = g; - percTs = t; - //If allocation is 0, all sub allocations are 0 - if (alloc == 0) { - maxDel = 0; - maxInsert = 0; - maxSwitch = 0; - } - //Arbitrary, if only 1 percent is allocated, then only insert gets an allocation - else if (alloc == 1) { - maxSwitch = 0; - maxDel = 0; - maxInsert = 1; - } - //Otherwise, allocations are assigned randomly - else { - //Max switch gets a random allocation, - //but allocation has to be even - //(don't want to switch something with itself) - maxSwitch = intRandMod_(alloc);//rand() % alloc; - alloc -= maxSwitch; - //If alloc is odd, - //and there is still percent that can be allocated - if (maxSwitch % 2 == 1 && alloc >= 1) { - //Make allocation 1 less, - //and switch allocation one more 
(now even) - maxSwitch++; - alloc--; - } - //Otherwise, make allocation one larger, - //switch allocation one less (even) - else if (alloc == 0) { - maxSwitch--; - alloc++; - } - //If alloc is greater than 1 (must be for % purposes), - //calculate random value for inerst allocation - if (alloc > 1) { - maxInsert = intRandMod_(alloc);//rand() % alloc; - alloc -= maxInsert; - } else { - maxInsert = alloc; - alloc -= maxInsert; - } - //Max delete is assigned whatever is left - maxDel = alloc; - } -} - -int SingleMute::getAlignmentLength(){ - return alignmentLength; -} - -int SingleMute::getIBP(){ - return IBP; -} - -void SingleMute::genSing(string * sequence, vector mutes) { - seq = sequence; - //Assign vector of mutes to inputted vector - validIndexes = new vector(); - validIndexes->reserve(mutes.size()); -// n_valid_indices = mutes.size(); - //Adds all valid indexes to the validIndexes vector - for(int i = 0; i < mutes.size(); i++){ - if(mutes.at(i)){ - validIndexes->push_back(i); - } - } - n_valid_indices = validIndexes->size(); - float tempFloat; - //Calculate number of characters each mutation can mutate - tempFloat = maxDel / 100.0; - maxDel = (int) (tempFloat * seq->length()); - tempFloat = maxInsert / 100.0; - maxInsert = (tempFloat * seq->length()); - tempFloat = maxSwitch / 100.0; - maxSwitch = (tempFloat * seq->length()); - //Calculates Alignment length and identical base pairs - alignmentLength = maxInsert; - IBP = maxDel + maxSwitch; - //Vectors to keep track of where insertions and deletions need to be made - insertions = new vector(); - insertions->reserve(maxInsert); - deletions = new vector(); - deletions->reserve(maxDel); - //Since switch makes 2 invalid, - //switchNucl is run maxSwitch/2 times - for (int i = 0; i < maxSwitch; i++) { - switchNucl(); - } - //Insert maxInsert times - for (int i = 0; i < maxInsert; i++) { - insert(); - } - //Delete maxDel nucleotides - for (int i = 0; i < maxDel; i++) { - deleteNucl(); - } - //perfroms deletions and 
insertions - performInsertAndDelete(); -} - -void SingleMute::insert() { - //Calculate the index to insert at - int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); - insertions->push_back(validIndexes->at(index)); - std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); - n_valid_indices--; - //Remove that as a valid index -// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); -} - -void SingleMute::deleteNucl() { - //Choose a valid index to delete - int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); - deletions->push_back(validIndexes->at(index)); - std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); - n_valid_indices--; - //Remove from the -// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); -} - -void SingleMute::switchNucl() { - //Pick a random valid index - int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); - char character = seq->at(validIndexes->at(index)); - int value; - //Keep generating characters until one different than the one we are trying to switch is found - while(character == seq->at(validIndexes->at(index))){ - value = intRandMod_(percAs + percCs + percGs + percTs); - if (value < percAs) { - character = 'A'; - } else if (value < percAs + percCs) { - character = 'C'; - } else if (value < percAs + percCs + percGs) { - character = 'G'; - } else { - character = 'T'; - } - } - //Switch that character - seq->at(validIndexes->at(index)) = character; - std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); - n_valid_indices--; - //Remove the chosen index as a valid index -// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); -} - -void SingleMute::performInsertAndDelete(){ - //sorts the vectors based - std::sort(insertions->begin(), insertions->end()); - std::sort(deletions->begin(), deletions->end()); - //Goes through both 
vectors untill all have been processed - for(int i = insertions->size() - 1, j = deletions->size() - 1; i >= 0 && j >= 0;){ - //If i is -1, all insertions have been processed - if(i == -1){ - removeNucl(deletions->at(j)); - j--; - } - //If i is -1, all deletions have been processed - else if(j == -1){ - insertNucl(insertions->at(i)); - i--; - } - else{ - //If the index of the current next insertion is higher than the next deletion, insert, else delete - if(insertions->at(i) > deletions->at(j)){ - insertNucl(insertions->at(i)); - i--; - } - else{ - removeNucl(deletions->at(j)); - j--; - } - } - } -} - -void SingleMute::removeNucl(int index){ - seq->erase(index, 1); -} - -void SingleMute::insertNucl(int index){ - string character; - //Use a weighted die to - //calculate which character to insert - int value = intRandMod_(percAs + percCs + percGs + percTs); - if (value < percAs) { - character = "A"; - } else if (value < percAs + percCs) { - character = "C"; - } else if (value < percAs + percCs + percGs) { - character = "G"; - } else { - character = "T"; - } - //insert at that index - seq->insert(index, character); -} diff --git a/src/cluster/src/SingleMute.h b/src/cluster/src/SingleMute.h deleted file mode 100644 index b0bf93d..0000000 --- a/src/cluster/src/SingleMute.h +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Author: Alex Baumgartner - * The Bioinformatics Toolsmith Laboratory, the University of Tulsa - * 5/15/2018 - * - * Purpose: - * The pupose of this module is to perform single mutations on sequences - */ - -#ifndef SINGLEMUTE_H -#define SINGLEMUTE_H - -#include -#include -#include -#include - -using namespace std; - -class SingleMute { -public: - /* - Constructor, creates values - and assignes allocations based on inputted data - - @param: - int: percentage of A's - int: percentage of C's - int: percentage of G's - int: percentage of T's - int: The total allocation for non-single mutations - */ - SingleMute(int, int, int, int, int); - /* - Takes a string and 
mutates it based - on the allocation given in the constructor - - @param: - std::string *: pointer to the sequence to be mutated - std::vector : boolean vector of valid and invalid indexes - */ - void genSing(std::string *, std::vector); - - int getAlignmentLength(); - - int getIBP(); - - ~SingleMute(){delete validIndexes; delete insertions; delete deletions;}; - - private: - int percAs; - int percCs; - int percGs; - int percTs; - int maxDel; - int maxInsert; - int maxSwitch; - int alignmentLength; - int IBP; - std::vector * validIndexes; - size_t n_valid_indices = 0; - std::vector * deletions; - std::vector * insertions; - std::string * seq; - /* - Inserts a sequence randomly in the list - at a valid index - */ - void insert(); - /* - Deletes a random nucleotide - that has not been previously mutated - */ - void deleteNucl(); - /* - Switches two random nucleotides - that have not been mutated previously - */ - void switchNucl(); - /* - Performs necessary insertions and deletions in the string based on the insertion and deletion vectors - */ - void performInsertAndDelete(); - - void removeNucl(int); - - void insertNucl(int); -}; -#endif diff --git a/src/cluster/src/Trainer.cpp b/src/cluster/src/Trainer.cpp deleted file mode 100644 index 432d624..0000000 --- a/src/cluster/src/Trainer.cpp +++ /dev/null @@ -1,930 +0,0 @@ -#include "Trainer.h" -#include "HandleSeq.h" -#include "Loader.h" -#include "ClusterFactory.h" -#include -#include -#include -#include -#include "../../utility/GlobAlignE.h" -#include "../../utility/AffineId.h" -#include "needleman_wunsch.h" -#include "Predictor.h" -#include "GLM.h" -#include "Feature.h" -#include "Progress.h" -#include - -template -double Trainer::align(Point *a, Point* b) const -{ - auto sa = a->get_data_str(); - auto sb = b->get_data_str(); - int la = sa.length(); - int lb = sb.length(); - - // needleman_wunsch nw(sa, sb, 2, -3, 5, 2); - // return nw.identity(nw.align()); - GlobAlignE galign(sa.c_str(), 0, la-1, - sb.c_str(), 0, 
lb-1, - 1, -1, 2, 1); - - return galign.getIdentity(); - -} - - -template -std::tuple*,double,size_t,size_t> Trainer::get_close(Point *p, bvec_iterator istart, bvec_iterator iend, bool &is_min_r) const -{ - int ncols = weights.getNumRow(); -#pragma omp declare reduction(pmax:std::tuple*,double,size_t,size_t>: \ - omp_out = get<1>(omp_in) > get<1>(omp_out) ? omp_in : omp_out ) \ - initializer (omp_priv=std::make_tuple((Point*)NULL,-1,0,0)) - - std::tuple*, - double, - size_t, - size_t> result = std::tuple*, double, size_t, size_t>(NULL, - -1, - 0, - 0); - bool has_found = false; - - #ifdef DEBUG - cout << "begin " << istart.r << " " << istart.c << " end " << iend.r << " " << iend.c << endl; - for (auto data : *istart.col) { - cout << "\t" << data.size() << endl; - } - #endif -// #pragma omp parallel for reduction(pmin:result), reduction(||:has_found) -// for (bvec_iterator i = istart; i <= iend; i++) { -// if (i <= iend) { -// Point* pt = (*i).first; -// double sum = weights.get(0, 0); -// double dist = 0; -// for (int col = 1; col < ncols; col++) { -// if (col == 1) { -// dist = ff.at(col-1)(pt, p); -// sum += weights.get(col, 0) * dist; -// } else { -// sum += weights.get(col, 0) * ff.at(col-1)(pt, p); -// } -// } -// double res = round(1.0 / (1 + exp(-sum))); - -// // set second to true if result is not 1.0 -// // which means it will be removed -// result = std::make_pair(pt, dist); -// has_found = (res != 1.0); -// (*i).second = (res != 1.0); -// } -// } - bool is_min = true; -#pragma omp parallel for reduction(pmax:result), reduction(&&:is_min) - for (bvec_iterator i = istart; i <= iend; ++i) { - Point* pt = (*i).first; - double sum = weights.get(0, 0); - double dist = 0; - auto cache = feat->compute(*pt, *p); - for (int col = 1; col < ncols; col++) { - if (col == 1) { - dist = (*feat)(col-1, cache); - sum += weights.get(col, 0) * dist; - } else { - sum += weights.get(col, 0) * (*feat)(col-1, cache); - } - } - double res = round(1.0 / (1 + exp(-sum))); - //cout 
<< "res: " << res << " " << dist << endl; -// set second to true if result is not 1.0 - // which means it will be removed - result = (dist > std::get<1>(result)) ? std::make_tuple(pt, dist, i.r, i.c) : result; - is_min = is_min && (res != 1.0); -// has_found = has_found || (res != 1.0); - if (res == 1.0) { - *i = std::make_pair(pt, true); -// (*i).second = true; - } - } - -// is_min = !has_found; - is_min_r = is_min; -// return get<0>(result); - return result; - -} - -template -long Trainer::merge(vector > ¢ers, long current, long begin, long last) const -{ -#pragma omp declare reduction(ldpmax:std::pair: \ - omp_out = omp_in.second > omp_out.second ? omp_in : omp_out ) \ - initializer (omp_priv=std::make_pair(0, std::numeric_limits::min())) - std::pair best = std::make_pair(0, std::numeric_limits::min()); - Point* p = centers[current].getCenter(); -#pragma omp parallel for reduction(ldpmax:best) - for (long i = begin; i <= last; i++) { - double sum = weights.get(0, 0); - double dist = 0; - Point* cen = centers[i].getCenter(); - auto cache = feat->compute(*cen, *p); - for (int col = 1; col < weights.getNumRow(); col++) { - double d = (*feat)(col-1, cache); - if (col == 1) { - dist = d; - } - sum += weights.get(col, 0) * d; - } - double res = round(1.0 / (1 + exp(-sum))); - - if (res == 1) { - best = best.second > dist ? 
best : std::make_pair(i, dist); - } - } - return best.first; -} - -template -vector*,Point*> > resize_vec(vector*,Point*>, double> > &vec, size_t new_size) -{ - cout << "Vector size: " << vec.size() << " min size: " << new_size << endl; - vector*, Point*> > data; - if (vec.size() <= new_size) { - for (int i = 0; i < vec.size(); i++) { - data.push_back(vec[i].first); - } - return data; - } - using k = pair*,Point*>, double>; - std::sort(vec.begin(), vec.end(), [](const k& a, const k& b) { - return a.second < b.second; - }); - double interval = (double)vec.size() / (vec.size() - new_size); - std::set indices; - int i = 0; - for (double index = 0; round(index) < vec.size() && i < (vec.size() - new_size); - i++, index += interval) { - int j = round(index); - indices.insert(j); - } - - std::cout << "index size: " << indices.size() << std::endl; - - // for (double index = 0; round(index) < vec.size() && indices.size() < new_size; - // index += interval) { - // int j = round(index); - // indices.insert(vec[j]); - // } - // vec.erase(vec.begin(), std::remove_if(vec.begin(), vec.end(), [&](const k& a) { - // return indices.find(a) == indices.end(); - // })); - for (auto iter = indices.rbegin(); iter != indices.rend(); iter++) { - int idx = *iter; - vec.erase(vec.begin() + idx); - } - if (vec.size() != new_size) { - cerr << "sizes are not the same: " << vec.size() << " " << new_size << endl; - throw "Resize did not work"; - } - for (auto a : vec) { - data.push_back(a.first); - } - return data; -} - -struct rng { - rng() { - srand(0); - } - int operator()(int n) const { - return rand() % n; - } -}; -template - pair*, - Point* - > >, - vector*, - Point*> > > Trainer::get_labels(vector*,Point*> > &vec, double cutoff) const -{ - - auto cmp = [](const pair*,Point*> a, const pair*,Point*> b) { - return a.first->get_header().compare(b.first->get_header()) < 0 - || - (a.first->get_header() == b.first->get_header() && a.second->get_header().compare(b.second->get_header()) < 0); - }; 
- auto scmp = [](const pair*,Point*>,double> a, const pair*,Point*>, double> b) { - return a.first.first->get_header().compare(b.first.first->get_header()) < 0 - || - (a.first.first->get_header() == b.first.first->get_header() && a.first.second->get_header().compare(b.first.second->get_header()) < 0); - }; - - // todo: convert to std::map - std::set*,Point*>, double>, decltype(scmp)> buf_pos(scmp), buf_neg(scmp); - std::vector*,Point*>, double> > buf_vpos, buf_vneg; -// std::sort(vec.begin(), vec.end(), cmp); - // cout << "Before Pair: " << vec[0].first->get_header() << ", " << vec[0].second->get_header() << endl; - // cout << "Before Pair: " << vec[vec.size()-1].first->get_header() << ", " << vec[vec.size()-1].second->get_header() << endl; - - rng gen; - random_shuffle(vec.begin(), vec.end(), gen); - // cout << "Pair: " << vec[0].first->get_header() << ", " << vec[0].second->get_header() << endl; - // cout << "Pair: " << vec[vec.size()-1].first->get_header() << ", " << vec[vec.size()-1].second->get_header() << endl; - vector scores(vec.size()); - Progress p(vec.size(), "Alignment"); -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < vec.size(); i++) { - double algn = align(vec[i].first, vec[i].second); - bool is_pos = algn >= cutoff; -#pragma omp critical - { - scores[i] = algn; - p++; - if (is_pos) { - buf_pos.insert(make_pair(vec[i], algn)); - //cout << vec[i].first->get_header() << " " << vec[i].second->get_header() << " " << algn << endl; - } else { - buf_neg.insert(make_pair(vec[i], algn)); - } - -#ifdef DEBUG - cout << vec[i].first->get_header() << " WITH " << vec[i].second->get_header() << " " << algn << endl; - #endif - - } - } - p.end(); - std::sort(scores.begin(), scores.end()); - std::cout << "positive=" << buf_pos.size() << " negative=" << buf_neg.size() << endl; - if (buf_pos.empty() || buf_neg.empty()) { - std::cout << "Identity value does not match sampled data: "; - if (buf_pos.empty()) { - std::cout << "Too many sequences below 
identity"; - } else { - std::cout << "Too many sequences above identity"; - } - std::cout << std::endl; - exit(0); - } - size_t m_size = std::min(buf_pos.size(), buf_neg.size()); - - std::cout << "resizing positive" << std::endl; - for (auto p : buf_pos) { - buf_vpos.push_back(p); - } - for (auto p : buf_neg) { - buf_vneg.push_back(p); - } - auto bp = resize_vec(buf_vpos, m_size); - std::cout << "resizing negative" << std::endl; - auto bn = resize_vec(buf_vneg, m_size); - auto ret = make_pair(bp, bn); - std::cout << "positive=" << ret.first.size() << " negative=" << ret.second.size() << endl; - return ret; - -} -template -void Trainer::filter(Point *p, vector *, bool> > &vec) const -{ - for (auto& pt : vec) { - double sum = weights.get(0, 0); - auto cache = feat->compute(*pt.first, *p); - for (int col = 1; col < weights.getNumRow(); col++) { - sum += weights.get(col, 0) * (*feat)(col-1, cache); - } - double res = round(1.0 / (1 + exp(-sum))); - pt.second = (res != 1); - } - vec.erase(std::remove_if(vec.begin(), vec.end(), [](pair*, bool> p) { - return p.second; - }), vec.end()); -} - -template -Point* Trainer::closest(Point *p, vector *, bool> > &vec) const -{ - Point* best_pt = NULL; - double best_dist = 0; - for (auto& pt : vec) { - double sum = weights.get(0, 0); - double dist = pt.first->distance_d(*p); - if (best_pt == NULL || dist < best_dist) { - best_dist = dist; - best_pt = pt.first; - } - } - return best_pt; -} - -template -std::pair Trainer::generate_feat_mat(pair *, Point *> >, vector *, Point *> > > &data, int ncols) -{ - int nrows = data.first.size() + data.second.size(); - matrix::Matrix feat_mat(nrows, ncols); - matrix::Matrix labels(nrows, 1); -#pragma omp parallel for - for (int i = 0; i < data.first.size(); i++) { - auto kv = data.first[i]; - int row = i; - auto cache = feat->compute(*kv.first, *kv.second); - for (int col = 0; col < ncols; col++) { - - if (col == 0) { - feat_mat.set(row, col, 1); - } else { -// double val = ff[col-1](kv.first, 
kv.second); - ////#pragma omp critical - double val = (*feat)(col-1, cache); - feat_mat.set(row, col, val); - } - - } - ////#pragma omp critical - labels.set(row, 0, 1); - } -#pragma omp parallel for - for (int i = 0; i < data.second.size(); i++) { - auto kv = data.second[i]; - int row = data.first.size() + i; - auto cache = feat->compute(*kv.first, *kv.second); - for (int col = 0; col < ncols; col++) { - - if (col == 0) { - feat_mat.set(row, col, 1); - } else { -// double val = ff[col-1](kv.first, kv.second); - ////#pragma omp critical - double val = (*feat)(col-1, cache); - feat_mat.set(row, col, val); - } - - } - ////#pragma omp critical - labels.set(row, 0, -1); - } - return std::make_pair(feat_mat, labels); -} -template -double Trainer::train_n(pair *, Point *> >, vector *, Point *> > > &data, int ncols) -{ - std::cout << "done" << endl; - cout << "Training on " << ncols << " columns" << endl; - int nrows = data.first.size() + data.second.size(); - - matrix::Matrix feat_mat(nrows, ncols); - matrix::Matrix labels(nrows, 1); - double avg_label = 0; -#pragma omp parallel for - for (int i = 0; i < data.first.size(); i++) { - auto kv = data.first[i]; - int row = i; - auto cache = feat->compute(*kv.first, *kv.second); - for (int col = 0; col < ncols; col++) { - - if (col == 0) { - feat_mat.set(row, col, 1); - } else { -// double val = ff[col-1](kv.first, kv.second); - ////#pragma omp critical - double val = (*feat)(col-1, cache); - feat_mat.set(row, col, val); - } - - } - ////#pragma omp critical - labels.set(row, 0, 1); - } -#pragma omp parallel for - for (int i = 0; i < data.second.size(); i++) { - auto kv = data.second[i]; - int row = data.first.size() + i; - auto cache = feat->compute(*kv.first, *kv.second); - for (int col = 0; col < ncols; col++) { - - if (col == 0) { - feat_mat.set(row, col, 1); - } else { -// double val = ff[col-1](kv.first, kv.second); - ////#pragma omp critical - double val = (*feat)(col-1, cache); - feat_mat.set(row, col, val); - } - - } - 
////#pragma omp critical - labels.set(row, 0, -1); - } - for (int row = 0; row < nrows; row++) { - for (int col = 0; col < ncols; col++) { - double val = feat_mat.get(row, col); - std::cout << val << "\t"; - } - std::cout << endl; - } - glm.train(feat_mat, labels); - weights = glm.get_weights(); - #ifdef DEBUG - for (int i = 0; i < ncols; i++) { - cout << "weight: " << weights.get(i, 0) << endl; - - } - #endif - matrix::Matrix p = glm.predict(feat_mat); - for (int row = 0; row < nrows; row++) { - if (p.get(row, 0) == 0) { - p.set(row, 0, -1); - } - } - auto tup = glm.accuracy(labels, p); - return get<0>(tup); -} - -double random_between(double mute, double rng, double low, double high) -{ - double r_d = (double)rand() / RAND_MAX; - double mn = std::max(mute - rng, low); - double mx = std::min(mute + rng, high); - return r_d * (mx - mn) + mn; -} - -template -void Trainer::mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id) -{ - HandleSeq h(HandleSeq::BOTH); - ClusterFactory factory(k); - double inc = (id_end - id_begin) / num_seq; - std::string bin_seq = p->get_data_str(); - std::string seq; - for (auto c : bin_seq) { - switch (c) { - case 0: - seq += 'A'; - break; - case 1: - seq += 'C'; - break; - case 2: - seq += 'G'; - break; - case 3: - seq += 'T'; - break; - case 'N': - seq += 'C'; - break; - default: - cout << "Invalid character " << c << endl; - cout << "from sequence " << bin_seq << endl; - throw 3; - } - } - for (size_t i = 0; i < num_seq; i++) { - double iter_id = id_begin + inc * (i + 0.5); - double actual_id = random_between(iter_id, inc, id_begin, id_end); - int mut = round(100 - actual_id); - auto newseq = h.mutate(seq, mut); - std::string chrom; - std::string header = p->get_header(); - Point* new_pt = Loader::get_point(header, newseq.second, _id, k); - pra pr; - pr.first = p->clone(); - pr.second = new_pt; - pr.val = newseq.first; - if (pr.val > cutoff) { - 
pos_buf.push_back(pr); - } else { - neg_buf.push_back(pr); - } - } -} - -template -std::pair*,Point*> >, - vector*,Point*> > >, - std::pair*,Point*> >, - vector*,Point*> > > > -Trainer::new_get_labels(std::vector*> &points, size_t num_sample, double id, uintmax_t &_id) -{ - std::sort(points.begin(), points.end(), [](const Point* a, - const Point* b) -> bool { - return a->get_length() < b->get_length(); - }); - std::pair*,Point*> >, - vector*,Point*> > > training, testing; - num_sample = min(num_sample, points.size()); - vector*> f_points_tr, f_points_test; - size_t total_size = points.size(); - for (int i = 0; i < num_sample; i++) { - int i1 = floor((double)i * total_size / (2 * num_sample)); - int i2 = floor((i + 1) * (double)total_size / (2 * num_sample)); - f_points_tr.push_back(points.at(i1)); - f_points_test.push_back(points.at(i2)); - } - std::vector > pos_buf, neg_buf; - cout << "mutating sequences" << endl; - for (auto p : f_points_tr) { - mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); - mutate_seqs(p, 5, pos_buf, neg_buf, 40, 100 * id, _id); - } - size_t buf_size = std::min(pos_buf.size(), neg_buf.size()); - cout << "training +: " << pos_buf.size() << endl; - cout << "training -: " << neg_buf.size() << endl; - std::vector > > bins; - size_t num_bins; - for (int i = 0; i < 10; i++) { - double max_identity = id * 100 + (100 - 100.0 * id) * (i+1) / 10.0; - double min_identity = id * 100 + (100 - 100.0 * id) * i / 10.0; - cout << "I = " << i << " " << min_identity << " -> " << max_identity << endl; - bins.push_back(std::vector >()); - for (auto p : pos_buf) { - if (p.val > min_identity && p.val < max_identity) { - bins[i].push_back(p); - } - } - for (auto p : neg_buf) { - if (p.val > min_identity && p.val < max_identity) { - bins[i].push_back(p); - } - } - } - std::random_shuffle(pos_buf.begin(), pos_buf.end()); - std::random_shuffle(neg_buf.begin(), neg_buf.end()); - for (size_t i = 0; i < buf_size; i++) { - cout << "TR: P " << pos_buf[i].val << 
endl; - cout << "TR: N " << neg_buf[i].val << endl; - if (pos_buf[i].val > id) { - training.first.emplace_back(pos_buf[i].first, pos_buf[i].second); - } else { - training.second.emplace_back(pos_buf[i].first, pos_buf[i].second); - } - if (neg_buf[i].val > id) { - training.first.emplace_back(neg_buf[i].first, neg_buf[i].second); - } else { - training.second.emplace_back(neg_buf[i].first, neg_buf[i].second); - } - } - pos_buf.clear(); - neg_buf.clear(); - for (auto p : f_points_test) { - mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); - mutate_seqs(p, 5, pos_buf, neg_buf, 40, 100 * id, _id); - } - buf_size = std::min(pos_buf.size(), neg_buf.size()); - cout << "testing +: " << pos_buf.size() << endl; - cout << "testing -: " << neg_buf.size() << endl; - std::random_shuffle(pos_buf.begin(), pos_buf.end()); - std::random_shuffle(neg_buf.begin(), neg_buf.end()); - for (size_t i = 0; i < buf_size; i++) { - cout << "TE: P " << pos_buf[i].val << endl; - cout << "TE: N " << neg_buf[i].val << endl; - if (pos_buf[i].val > id) { - testing.first.emplace_back(pos_buf[i].first, pos_buf[i].second); - } else { - testing.second.emplace_back(pos_buf[i].first, pos_buf[i].second); - } - if (neg_buf[i].val > id) { - testing.first.emplace_back(neg_buf[i].first, neg_buf[i].second); - } else { - testing.second.emplace_back(neg_buf[i].first, neg_buf[i].second); - } - } - return make_pair(training, testing); -} -template -void Trainer::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, double acc_cutoff) -{ - - if (k != 0) { - std::cout << "Splitting data" << endl; - uintmax_t _id = points.size(); - Predictor pred(k, cutoff, PRED_MODE_CLASS, feat_type, - mut_type, min_n_feat, max_n_feat, min_id); - pred.train(points, points, _id, n_points); - delete feat; - auto pr = pred.get_class(); - feat = pr.first; - glm = pr.second; - weights = glm.get_weights(); - return; - } else { - feat->add_feature(FEAT_ALIGN, Combo::xy); -// 
feat->normalize(training.first); - feat->finalize(); - weights = matrix::Matrix(2, 1); - weights.set(0, 0, -1 * cutoff); - weights.set(1, 0, 1); - return; - } -} - -template -vector*, Point*> > Trainer::split() -{ - // n_points total per side - // max_pts_from_one on each side - auto cmp = [](const pair*,Point*> a, const pair*,Point*> b) { - return a.first->get_header().compare(b.first->get_header()) < 0 -|| - (a.first->get_header() == b.first->get_header() && a.second->get_header().compare(b.second->get_header()) < 0); - }; - set*, Point*>, decltype(cmp)> pairs(cmp); -// vector*, Point*> > pairs; - const size_t total_num_pairs = n_points * 2; - int aerr = 0; - int bandwidth = (1.0 - cutoff) * 10000; - vector*> indices; - std::sort(points.begin(), points.end(), [](const Point* a, - const Point* b) -> bool { - return a->get_length() < b->get_length(); - }); - Point *begin_pt = points[points.size()/2]; - - std::sort(points.begin(), points.end(), [&](const Point* a, - const Point* b) -> bool { - return a->distance(*begin_pt) < b->distance(*begin_pt); - }); - int num_iterations = ceil(((double)n_points) / max_pts_from_one) - 1; - for (int i = 0; i <= num_iterations; i++) { - int idx = i * (points.size()-1) / num_iterations; - indices.push_back(points[idx]); - } - cout << "Point pairs: " << indices.size() << endl; - size_t to_add_each = max_pts_from_one / 2; - Progress prog(indices.size(), "Sorting data"); -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < indices.size(); i++) { - vector*> pts = points; - Point* p = indices[i]; - std::sort(pts.begin(), pts.end(), [&](const Point* a, - const Point* b) { - return a->distance(*p) < b->distance(*p); - }); - // do binary search with alignment - size_t offset = pts.size() / 4; - size_t pivot = offset; - double closest_algn = 20000; - size_t best_pivot = 2 * offset; - for (pivot = 2 * offset; offset > 0; offset /= 2) { - double algn = align(p, pts[pivot]); - // cout << "Pivot: " << pivot << " point: " << 
pts[pivot]->get_header() << " sim: " << align(p, pts[pivot]) << endl; - if (fabs(algn - cutoff) < closest_algn) { - closest_algn = fabs(algn - cutoff); - best_pivot = pivot; - } - if (algn < cutoff) { - pivot -= offset; - } else if (algn > cutoff) { - pivot += offset; - } else { - break; - } - } -// cout << "Pivot: " << pivot << " point: " << pts[pivot]->get_header() << " sim: " << align(p, pts[pivot]) << endl; - // before: [0, pivot) size: to_add_each - // after: [pivot, size) size: to_add_each - double before_inc = (double)pivot / to_add_each; - double after_inc = ((double)(pts.size() - pivot)) / to_add_each; -#pragma omp critical - { - prog++; - if (before_inc < 1) { - aerr = 1; - } else if (after_inc < 1) { - aerr = -1; - } - } - double before_start = 0; - double after_start = pivot; - double top_start = 0; - size_t size_before = pairs.size(); - vector*,Point*> > buf; - // Adds points above cutoff by adding before_inc - for (int i = 0; i < to_add_each; i++) { - int idx = round(before_start); - int dist = pts[idx]->distance(*p); - // cout << p->get_header() << " " << pts[idx]->get_header() << " " << dist << endl; - auto pr = p->get_header().compare(pts[idx]->get_header()) < 0 ? make_pair(p, pts[idx]) : make_pair(pts[idx], p); - buf.push_back(pr); - before_start += before_inc; - } - // Adds points before cutoff by adding after_inc - for (int i = 0; i < to_add_each && round(after_start) < pts.size(); i++) { - int idx = round(after_start); - int dist = pts[idx]->distance(*p); - // cout << p->get_header() << " " << pts[idx]->get_header() << " " << dist << endl; - auto pr = p->get_header().compare(pts[idx]->get_header()) < 0 ? 
make_pair(p, pts[idx]) : make_pair(pts[idx], p); - buf.push_back(pr); - after_start += after_inc; - } -#pragma omp critical - { - // Adds buffer to total pairs - // for (auto p : buf) { -// pairs.push_back(p); -// } - pairs.insert(std::begin(buf), std::end(buf)); - } -// cout << "added " << pairs.size() - size_before << " pairs" << endl; - } - prog.end(); - if (aerr < 0) { - cerr << "Warning: Alignment may be too small for sampling" << endl; - } else if (aerr > 0) { - cerr << "Warning: Alignment may be too large for sampling" << endl; - } - int i = 0; - for (auto a : pairs) { - cout << "Before Pair: " << a.first->get_header() << ", " << a.second->get_header() << endl; - if (++i == 4) { - break; - } - } - return std::vector*,Point*> >(pairs.begin(), pairs.end()); -} -template -std::pair*, Point*>, double>, - std::map*, Point*>, double> > -Trainer::split_old() { - using train_map = std::map*, Point*>, double>; - std::pair split; - int bandwidth = (1.0 - cutoff) * 10000; - size_t last_cutoff = points.size() / 2; - while (split.first.size() < n_points) { - Point *p = points[last_cutoff]; - std::sort(points.begin(), points.end(), [&](const Point* a, - const Point* b) -> bool { - return a->distance(*p) < b->distance(*p); - }); - int b_cutoff = points.size() / 2; - for (int offset = b_cutoff; offset >= 1; offset /= 2) { - int dist = p->distance(*points[b_cutoff]); - if (dist < bandwidth) { - b_cutoff += offset; - } else if (dist > bandwidth) { - b_cutoff -= offset; - } else { - break; - } - } - size_t cutoff_index = points.size(); - const size_t count = split.first.size(); - - if (b_cutoff >= max_pts_from_one) { - double ratio = (double)b_cutoff / max_pts_from_one; - double sum = 0; - for (size_t q = 0; q < max_pts_from_one; q++) { - size_t i = round(sum); - if (i >= points.size()) { - cerr << "this shouldn't happen" << endl; - throw "this shouldn't happen"; - } - double alignment = align(p, points[i]); - if (alignment < cutoff) { - cutoff_index = i + 10; - break; - } - 
if (split.first.size() < n_points) { - split.first[make_pair(p, points[i])] = alignment; - } - sum += ratio; - } - } else { - for (size_t i = 1; i < cutoff_index; i++) { - double alignment = align(p, points[i]); - if (alignment < cutoff) { - cutoff_index = i + 10; - break; - } - if (split.first.size() < n_points) { - split.first[make_pair(p, points[i])] = alignment; - } - } - } - size_t similar_points_added = split.first.size() - count; - size_t available_points = points.size() - cutoff_index; - if (available_points == 0 || available_points <= similar_points_added) { - cerr << "change cutoff value, points are too similar" << endl; - throw "change cutoff value, points are too similar"; - } - double ratio = (double)(available_points - 1.0) / (double)similar_points_added; - double sum = 0; - for (size_t q = 0; q < similar_points_added; q++) { - size_t i = cutoff_index + round(sum); - if (i >= points.size()) { - break; - } - double alignment = align(p, points[i]); - split.second[make_pair(p, points[i])] = alignment; - sum += ratio; - } - if (split.first.size() != split.second.size()) { - cerr << "something happened"; - throw "something happened"; - } - last_cutoff = cutoff_index; - } - for (auto p : points) { - p->set_data_str(""); - } - return split; -} - - -int gcd(int a, int b) -{ - if (b <= 0) { - return a; - } - return gcd(b, a % b); -} -int gcd_vec(std::vector v) -{ - int ret = v[0]; - for (size_t i = 1; i < v.size(); i++) { - if (v[i] == 0) { - continue; - } - ret = gcd(ret, v[i]); - } - return ret; -} - -inline int sign(double x) { - return (x > 0) - (x < 0); -} -void scale(double (&mat)[4][4], double &sigma, double& epsilon) -{ - double scale_factor = 100000; - std::vector signs, scaled; - signs.push_back(sign(sigma)); - scaled.push_back(round(scale_factor * fabs(sigma))); - signs.push_back(sign(epsilon)); - scaled.push_back(round(scale_factor * fabs(epsilon))); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - signs.push_back(sign(mat[i][j])); 
- scaled.push_back(round(scale_factor * fabs(mat[i][j]))); - } - } - double common_div = gcd_vec(scaled); - sigma = signs[0] * scaled[0] / common_div; - epsilon = signs[1] * scaled[1] / common_div; - int count = 2; - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - mat[i][j] = signs[count] * scaled[count] / common_div; - count++; - } - } -} - -template -void Trainer::init(double (&matrix)[4][4], double sig, double eps) -{ - scale(matrix, sig, eps); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - mat[i][j] = (int)matrix[i][j]; - } - } - sigma = (int)sig; - eps = (int)eps; - // sf.emplace_back([](Point* a, Point *b) { - // return Feature::manhattan(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point *b) { - // return Feature::length_difference(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point *b) { - // return Feature::rree_k_r(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::length_difference(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::intersection(*a, *b); - // }, true); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::jenson_shannon(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::simratio(*a, *b); - // }, true); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::squaredchord(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::manhattan(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::pearson(*a, *b); - // }, true); - -} -template class Trainer; -template class Trainer; -template class Trainer; -template class Trainer; -template class Trainer; -template class Trainer; diff --git a/src/cluster/src/Trainer.h b/src/cluster/src/Trainer.h deleted file mode 100644 index 8801172..0000000 --- a/src/cluster/src/Trainer.h +++ /dev/null @@ -1,67 +0,0 @@ -/* -*- C++ -*- 
*/ -#ifndef TRAINER_H -#define TRAINER_H - -#include "Point.h" -#include "GLM.h" -#include "Feature.h" -#include "bvec.h" -#include "Center.h" -#include "LogTable.h" -#include - -template -class Trainer { -public: - Trainer(std::vector*> v, size_t num_points, size_t largest_count, double cutoff_, size_t max_pts_from_one_, double (&matrix)[4][4], double sig, double eps, int ksize) : points(v), n_points(num_points), cutoff(cutoff_), max_pts_from_one(max_pts_from_one_), k(ksize) { - init(matrix, sig, eps); - uintmax_t size = 1000 * 1000 * 10; - feat = new Feature(k); - }; - ~Trainer() { delete feat_mat; delete feat; } - std::pair*, Point*>, double>, - std::map*, Point*>, double> > split_old(); - vector*,Point*> > split(); - double train_n(pair*, - Point* - > >, - vector*, - Point*> > > &data, int ncols); - void train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, double acc_cutoff=97.5); - void mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id); - std::tuple*,double,size_t,size_t> get_close(Point*, bvec_iterator istart, bvec_iterator iend, bool& is_min) const; -// vector > get_close(Point*, const vector*,int> > &, bool& is_min) const; - std::pair*,Point*> >, - vector*,Point*> > >, - std::pair*,Point*> >, - vector*,Point*> > > > - new_get_labels(std::vector*> &points, size_t num_sample, double id, uintmax_t &_id); - void filter(Point*, vector*,bool> >&) const; - Point* closest(Point*, vector*,bool> >&) const; - long merge(vector > ¢ers, long current, long begin, long end) const; -// Point* merge(Point*, vector*,double> >&) const; -private: - matrix::GLM glm; - matrix::Matrix weights; - double align(Point* a, Point* b) const; - std::pair generate_feat_mat(pair*, - Point* - > >, - vector*, - Point*> > > &data, int ncols); - void init(double (&matrix)[4][4], double sig, double eps); - pair*, - Point* - > >, - vector*, - Point*> > > get_labels(vector*,Point*> >&, 
double cutoff) const; - Feature *feat; - int mat[4][4]; - int sigma, epsilon; - std::vector*> points; - matrix::Matrix *feat_mat = NULL; - size_t n_points, max_pts_from_one; - double cutoff; - int k; -}; -#endif diff --git a/src/cluster/src/needleman_wunsch.cpp b/src/cluster/src/needleman_wunsch.cpp deleted file mode 100644 index 46d0b5b..0000000 --- a/src/cluster/src/needleman_wunsch.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* -*- C++ -*- - * - * needleman_wunsch.cpp - * - * Author: Benjamin T James - */ -#include "needleman_wunsch.h" - - -//flags that can be combined -#define HORIZ 1 -#define VERT 2 -#define DIAG 4 -void needleman_wunsch::fill(int i, int j) -{ - if (i == 0 || j == 0) { - if (i == j) { - int offset = at(i, j); - score[offset] = 0; - direction[offset] = DIAG; // for backtracking - horiz_gap_len[offset] = 0; - vert_gap_len[offset] = 0; - } else if (i == 0) { - int offset = at(0, j); - int last_offset = at(0, j-1); - score[offset] = score[last_offset] + gap(j); - horiz_gap_len[offset] = 0; - vert_gap_len[offset] = j; - direction[offset] = VERT; - } else { // j == 0 - int offset = at(i, 0); - int last_offset = at(i-1, 0); - score[offset] = score[last_offset] + gap(i); - horiz_gap_len[offset] = i; - vert_gap_len[offset] = 0; - direction[offset] = HORIZ; - } - return; - } - int i_diag = at(i-1, j-1); - int i_horiz = at(i-1, j); - int i_vert = at(i, j-1); - int i_cur = at(i, j); - - int hlen = horiz_gap_len[i_horiz] + 1; - int vlen = vert_gap_len[i_vert] + 1; - - int diag_score = score[i_diag] + match_score(s1[i], s2[j]); - int horiz_score = score[i_horiz] + gap(hlen); - int vert_score = score[i_vert] + gap(vlen); - score[i_cur] = std::max(std::max(diag_score, horiz_score), vert_score); - direction[i_cur] = 0; - - // we could match multiple high scores - if (score[i_cur] == diag_score) { - direction[i_cur] |= DIAG; - } - if (score[i_cur] == vert_score) { - direction[i_cur] |= VERT; - vert_gap_len[i_cur] = vlen; - } else { - vert_gap_len[i_cur] = 0; - } - if 
(score[i_cur] == horiz_score) { - direction[i_cur] |= HORIZ; - horiz_gap_len[i_cur] = hlen; - } else { - horiz_gap_len[i_cur] = 0; - } -} - -std::pair -needleman_wunsch::backtrack() -{ - std::string a1 = "", a2 = ""; - int cur_i = l1 - 1; - int cur_j = l2 - 1; - while (cur_i >= 0 && cur_j >= 0) { - uint8_t dir = direction[at(cur_i, cur_j)]; - if (dir & DIAG) { - a1 += s1[cur_i--]; - a2 += s2[cur_j--]; - } else if (dir & HORIZ) { - a1 += s1[cur_i--]; - a2 += '-'; - } else if (dir & VERT) { - a1 += '-'; - a2 += s2[cur_j--]; - } - } - std::string r1(a1.rbegin(), a1.rend()); - std::string r2(a2.rbegin(), a2.rend()); - return std::make_pair(r1, r2); -} - - -std::pair -needleman_wunsch::align() -{ - for (int i = 0; i < l1; i++) { - for (int j = 0; j < l2; j++) { - fill(i, j); - } - } - return backtrack(); -} -double needleman_wunsch::identity(std::pair alignment) const -{ - int len = alignment.first.length(); - double count = 0; - for (int i = 0; i < len; i++) { - if (alignment.first[i] == alignment.second[i]) { - count++; - } - } - return 1.0 * count / len; -} - -int needleman_wunsch::gap(int gaplen) const -{ - return sigma + (gaplen - 1) * epsilon; -} - -int needleman_wunsch::match_score(char a, char b) const -{ - return a == b ? 
match : mismatch; -} - -needleman_wunsch::needleman_wunsch(const std::string &s1_, const std::string& s2_, int match_, int mismatch_, int sigma_, int epsilon_) -{ - int l1_ = s1_.length(); - int l2_ = s2_.length(); - if (l1_ >= l2_) { - l1 = l1_; - l2 = l2_; - s1 = s1_; - s2 = s2_; - } else { - l1 = l2_; - l2 = l1_; - s1 = s2_; - s2 = s1_; - } - sigma = -sigma_; - epsilon = -epsilon_; - match = match_; - mismatch = mismatch_; - int matlen = l1 * l2; - score = new int[matlen]; - direction = new uint8_t[matlen]; - horiz_gap_len = new int[matlen]; - vert_gap_len = new int[matlen]; -} diff --git a/src/cluster/src/needleman_wunsch.h b/src/cluster/src/needleman_wunsch.h deleted file mode 100644 index 031ea10..0000000 --- a/src/cluster/src/needleman_wunsch.h +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- C++ -*- - * - * needleman_wunsch.h - * - * Author: Benjamin T James - */ - -#ifndef NEEDLEMAN_WUNSCH_H -#define NEEDLEMAN_WUNSCH_H - -#include - -class needleman_wunsch { -public: - needleman_wunsch(const std::string& s1, const std::string& s2, int match_, int mismatch_, int sigma_, int epsilon_); - ~needleman_wunsch() { - delete[] score; - delete[] direction; - delete[] horiz_gap_len; - delete[] vert_gap_len; - } - double identity(std::pair p) const; - std::pair - align(); -private: - int gap(int gap_len) const; - int match_score(char a, char b) const; - inline int at(int a, int b) const { return a * l2 + b; }; - void fill(int,int); - std::pair backtrack(); - int match, mismatch; - int sigma, epsilon; - std::string s1, s2; - int l1, l2; - - int *score; - uint8_t *direction; - int *horiz_gap_len; - int *vert_gap_len; -}; - - -#endif diff --git a/src/clutil/Clock.cpp b/src/clutil/Clock.cpp new file mode 100644 index 0000000..07da83b --- /dev/null +++ b/src/clutil/Clock.cpp @@ -0,0 +1,19 @@ +/* -*- C++ -*- */ +/* + * Clock.cpp + * + * Author: Benjamin T James + */ + +#include "Clock.h" +#include +#include + +static const auto _begin = std::chrono::system_clock::now(); + +void 
Clock::stamp(std::string desc) +{ + auto end = std::chrono::system_clock::now(); + std::chrono::duration diff = end - _begin; + std::cout << "timestamp " << desc << " " << diff.count() << std::endl; +} diff --git a/src/clutil/Clock.h b/src/clutil/Clock.h new file mode 100644 index 0000000..b251d51 --- /dev/null +++ b/src/clutil/Clock.h @@ -0,0 +1,16 @@ +// -*- C++ -*- +/* + * Clock.h + * + * Author: Benjamin T James + */ + +#ifndef CLOCK_H +#define CLOCK_H +#include + +class Clock { +public: + static void stamp(std::string desc); +}; +#endif diff --git a/src/clutil/Datatype.cpp b/src/clutil/Datatype.cpp new file mode 100644 index 0000000..46fc67b --- /dev/null +++ b/src/clutil/Datatype.cpp @@ -0,0 +1,19 @@ +/* -*- C++ -*- */ +/* + * Datatype.cpp + * + * Author: Benjamin T James + */ + +#include "Datatype.h" +std::string _dt_datatype = ""; + +std::string Datatype::get() +{ + return _dt_datatype; +} + +void Datatype::set(std::string s) +{ + _dt_datatype = s; +} diff --git a/src/clutil/Datatype.h b/src/clutil/Datatype.h new file mode 100644 index 0000000..dd4df42 --- /dev/null +++ b/src/clutil/Datatype.h @@ -0,0 +1,17 @@ +// -*- C++ -*_ +/* + * Datatype.h + * + * Author: Benjamin T James + */ + +#ifndef DATATYPE_H +#define DATATYPE_H +#include + +class Datatype { +public: + static std::string get(); + static void set(std::string s); +}; +#endif diff --git a/src/cluster/src/DivergencePoint.cpp b/src/clutil/DivergencePoint.cpp similarity index 98% rename from src/cluster/src/DivergencePoint.cpp rename to src/clutil/DivergencePoint.cpp index 70e4e2d..d62996a 100644 --- a/src/cluster/src/DivergencePoint.cpp +++ b/src/clutil/DivergencePoint.cpp @@ -58,7 +58,7 @@ double DivergencePoint::distance_d(Point& p) const uint64_t dist = 0; uint64_t mag = 0; for (auto i = 0; i < points.size(); i++) { - dist += 2 * min(points[i],(T)c.points[i]); + dist += 2 * min(points[i],(T)round(c.points[i])); mag += points[i] + c.points[i]; } double frac = (double)dist / mag; @@ -99,8 +99,8 @@ 
template DivergencePoint::DivergencePoint(const std::vector& pts, uint64_t len) { mag = 0; - points = pts; for (unsigned int i = 0; i < pts.size(); i++) { + points.push_back(pts.at(i)); mag += pts.at(i); } // display(); diff --git a/src/cluster/src/DivergencePoint.h b/src/clutil/DivergencePoint.h similarity index 98% rename from src/cluster/src/DivergencePoint.h rename to src/clutil/DivergencePoint.h index 087bff1..68d0539 100644 --- a/src/cluster/src/DivergencePoint.h +++ b/src/clutil/DivergencePoint.h @@ -38,6 +38,7 @@ class DivergencePoint : public Point { d->set_id(get_id()); d->set_length(get_length()); d->set_stddev(get_stddev()); + d->set_data_str(Point::get_data_str()); return d; } DivergencePoint* create() const { diff --git a/src/cluster/src/Histogram.cpp b/src/clutil/Histogram.cpp similarity index 100% rename from src/cluster/src/Histogram.cpp rename to src/clutil/Histogram.cpp diff --git a/src/cluster/src/Histogram.h b/src/clutil/Histogram.h similarity index 100% rename from src/cluster/src/Histogram.h rename to src/clutil/Histogram.h diff --git a/src/clutil/LCG.h b/src/clutil/LCG.h new file mode 100644 index 0000000..8725771 --- /dev/null +++ b/src/clutil/LCG.h @@ -0,0 +1,51 @@ +// -*- C++ -*- +/* + * LCG.h + * + * Author: Benjamin T James + */ + +#ifndef LCG_H +#define LCG_H + +#include +#include +#include +#include + +class LCG { +public: + LCG(uint64_t seed_) { + seed = seed_; + } + + template + T randMod(T max) { + if (max == 0) { + return 0; + } else { + uint64_t x = random() % max; + return (T)x; + } + } + + uint64_t nextRandSeed() { + return random(); + } + double rand_between(double id, double range, double low, double high) { + uint64_t rnd = random(); + double res = (double)rnd / std::numeric_limits::max(); + double mn = std::max(id - range, low); + double mx = std::min(id + range, high); + return mn + (mx - mn) * res; + } + uint64_t random() { + // MMIX random, from 
https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use + // Should be thread safe + seed = seed * 6364136223846793005 + 1442695040888963407; + return seed; + } +private: + uint64_t seed; +}; +#endif diff --git a/src/clutil/Loader.cpp b/src/clutil/Loader.cpp new file mode 100644 index 0000000..c15e3b3 --- /dev/null +++ b/src/clutil/Loader.cpp @@ -0,0 +1,223 @@ +/* -*- C++ -*- + * + * Loader.cpp + * + * Author: Benjamin T James + * + * Class which can 'preload' chunks of sequences from a file list, + * and then count the k-mers separately, which can be done in + * multiple threads + */ +#include "Loader.h" +#include "Datatype.h" + +static uint64_t num_overflow = 0; +std::string next_histogram(std::string cur_type) +{ + if (cur_type == "uint8_t") { + return "uint16_t"; + } else if (cur_type == "uint16_t") { + return "uint32_t"; + } else { + return "uint64_t"; + } +} + +template +std::string Loader::get_warning() +{ + if (num_overflow == 0) { + return ""; + } else { + std::ostringstream oss; + oss << "For " << num_overflow << " sequences, the histogram type " << Datatype::get() << " was too small for holding sequences." 
<< endl; + oss << "Performance may be slightly hindered, but can be improved by increasing the integral type (--datatype " << next_histogram(Datatype::get()) << ")" << endl; + return oss.str(); + } +} + + + +template +void Loader::fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values) +{ + const int k = table.getK(); + auto segment = chrom->getSegment(); + const char *seg_bases = chrom->getBase()->c_str(); + for (vector *v : *segment) { + int start = v->at(0); + int end = v->at(1); + + // Hani Z Girgis added the following line + // It is possible + if(end - start + 1 >= k){ + int r = table.wholesaleIncrementNoOverflow(seg_bases, start, end - k + 1); + if (r == -1) { + num_overflow++; + // #pragma omp critical + // { + // std::ostringstream oss; + // oss << "In header \"" << chrom->getHeader() << "\"" << endl; + // oss << "Histogram type " << Runner::get_datatype() << " is too small for holding sequences." << endl; + // oss << "Performance may be slightly hindered, but can be improved by increasing the integral type (--datatype " << next_histogram(Runner::get_datatype()) << ")" << endl; + // _loader_warning = oss.str(); + // cerr << get_warning() << endl; + // } + } + } + } + std::string header = chrom->getHeader(); + header = header.substr(1, header.find(' ')-1); + // Hani Z. Girgis added the following lines on 10/3/2018 + // This should result in significant speed up. + unsigned long tableSize = table.getMaxTableSize(); + values.reserve(values.size() + tableSize); + const V * valueArray = table.getValues(); + + copy(&valueArray[0], &valueArray[tableSize], back_inserter(values)); + + // Commented out by Hani Z. 
Girgis on 10/3/2018 and replaced by the code above + // std::vector *keys = table.getKeys(); + // for (std::string str : *keys) { + // values.push_back(table.valueOf(str.c_str())); + // } + // keys->clear(); + // delete keys; +} + +template +bool Loader::done() const +{ + return file_idx == files.size(); +} + +template +void Loader::preload(int tid) +{ + if (file_idx == files.size()) { + return; + } + for (uint64_t j = 0; j < chunk_size; j++) { + auto chrom = next(); + if (chrom.first == "") { + return; + } + cache_list.at(tid).emplace_back(chrom.first, chrom.second); + } +} + + +// Modified by Hani Z. Girgis on Oct 2, 2018 +template +Point* Loader::get_point(std::string header, const std::string &base, uintmax_t& id, int k, bool set_seq) +{ + ostringstream obase; + for (int i = 0; i < base.length(); i++) { + if (base[i] == 'A' || base[i] == 'C' || + base[i] == 'G' || base[i] == 'T') { + obase << base[i]; + } + } + ChromosomeOneDigit * chrom; + if(Util::isDna){ + chrom = new ChromosomeOneDigitDna(); + }else{ + chrom = new ChromosomeOneDigitProtein(); + } + + chrom->setHeader(header); + chrom->appendToSequence(obase.str()); + chrom->finalize(); + Point *p = Loader::get_point(chrom, id, k, set_seq); + delete chrom; + return p; +} + +// Modified by Hani Z. Girgis on Oct 2, 2018 +template +Point* Loader::get_point(ChromosomeOneDigit* chrom, uintmax_t& id, int k, bool set_seq) +{ + + KmerHashTable table(k, 1); + // Hani Z. 
Girgis changed the following line + // The table_k1 was initialized from 0 now it is 1 + KmerHashTable table_k1(1, 1); + std::vector values; + vector values_k1; + // values.clear(); + + Loader::fill_table(table, chrom, values); + Loader::fill_table(table_k1, chrom, values_k1); +// int tmplate = get_template(chrom->getHeader(), templates); + Point *p = new DivergencePoint(values, chrom->size()); +// cout << "mag: " << ((DivergencePoint*)p)->getPseudoMagnitude() << std::endl; + p->set_1mers(values_k1); + p->set_header(chrom->getHeader()); + p->set_length(chrom->getEffectiveSize()); + if (set_seq) { + p->set_data_str(*chrom->getBase()); + } + // Added by Hani Z. Girgis on Oct 7 2018 + p->setK(k); + DivergencePoint* q = dynamic_cast*>(p); + const auto N = q->points.size(); + double aq = (double) q->getPseudoMagnitude() / N; + double sq = 0; + for (auto i = 0; i < N; i++) { + double qdiff = q->points[i] - aq; + sq += qdiff * qdiff; + } + sq = sqrt(sq / N); + q->set_stddev(sq); + p->set_id(id); + #pragma omp atomic + id++; + + // Clean + + return p; +} + + + + + + + +template +std::vector*> Loader::load_next(int tid) +{ + std::vector*> points; + for (size_t i = 0; i < cache_list.at(tid).size(); i++) { + auto pr = cache_list.at(tid).at(i); + Point* p = get_point(pr.first, *pr.second, id_list.at(tid), k); + points.push_back(p); + delete pr.second; + } + cache_list.at(tid).clear(); + return points; +} + +template +std::pair Loader::next() +{ + auto n = maker->next(); + if (n.first != "") { + return n; + } + delete maker; + maker = NULL; + file_idx++; + if (file_idx >= files.size()) { + return n; + } + maker = new SingleFileLoader(files.at(file_idx)); + return maker->next(); +} + +template class Loader; +template class Loader; +template class Loader; +template class Loader; +template class Loader; +template class Loader; diff --git a/src/cluster/src/Loader.h b/src/clutil/Loader.h similarity index 70% rename from src/cluster/src/Loader.h rename to src/clutil/Loader.h index 
28da845..ec3f569 100644 --- a/src/cluster/src/Loader.h +++ b/src/clutil/Loader.h @@ -11,9 +11,18 @@ #ifndef LOADER_H #define LOADER_H -#include "Point.h" +#include + #include "SingleFileLoader.h" -#include "ClusterFactory.h" +#include "Point.h" +#include "DivergencePoint.h" +#include "../nonltr/KmerHashTable.h" +// Add by Hani Z. Girgis, PhD on Oct 2, 2018 +#include "../nonltr/ChromosomeOneDigit.h" +#include "../nonltr/ChromosomeOneDigitDna.h" +#include "../nonltr/ChromosomeOneDigitProtein.h" + + template class Loader { @@ -41,6 +50,9 @@ class Loader { }; ~Loader() { + if (get_warning() != "") { + cerr << get_warning() << endl; + } cache_list.clear(); id_list.clear(); if (maker != NULL) { @@ -55,7 +67,11 @@ class Loader { // multi-thread accessible std::vector*> load_next(int tid); - static Point* get_point(std::string header, const std::string &base, uintmax_t& id, int k); + static Point* get_point(std::string header, const std::string &base, uintmax_t& id, int k, bool set_seq=true); + static Point* get_point(ChromosomeOneDigit* dna, uintmax_t& id, int k, bool set_seq=true); + + static void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values); + static std::string get_warning(); private: std::pair next(); @@ -69,5 +85,7 @@ class Loader { std::vector files; size_t file_idx = 0; SingleFileLoader *maker = NULL; + }; + #endif diff --git a/src/cluster/src/Point.h b/src/clutil/Point.h similarity index 76% rename from src/cluster/src/Point.h rename to src/clutil/Point.h index a70bc20..4aac8ff 100644 --- a/src/cluster/src/Point.h +++ b/src/clutil/Point.h @@ -13,7 +13,7 @@ #define POINT_H #include -#include "../../nonltr/ChromosomeOneDigit.h" +#include "../nonltr/ChromosomeOneDigit.h" /* * Pure virtual class that defines behavior for @@ -51,33 +51,48 @@ class Point { virtual const vector& get_data() const = 0; - void set_header(const std::string c) { header = c; }; + void set_header(const std::string c) { header = string(c); }; const std::string 
get_header() const { return header; }; void set_data_str(const std::string& c) { data = c; }; const std::string & get_data_str() const { return data; }; void set_1mers(const vector &vec) { - for (auto i = 0; i < 4; i++) { - one_mers[i] = vec[i]; - } + // for (auto i = 0; i < Util::getAlphabetSize(); i++) { + // one_mers[i] = vec[i]; + // } + one_mers = vector(vec); } + vector get_1mers() const { - vector vec; - for (auto i = 0; i < 4; i++) { - vec.push_back(one_mers[i]); - } - return vec; + // vector vec; + // for (auto i = 0; i < Util::getAlphabetSize(); i++) { + // vec.push_back(one_mers[i]); + // } + // return vec; + return one_mers; } virtual unsigned long size() const = 0; virtual void set_id(uintmax_t c_id) = 0;//{ id = c_id; }; virtual const uintmax_t get_id() const = 0;//{ return id; }; virtual void set_length(unsigned long len) = 0; virtual unsigned long get_length() const = 0; + + // Added by Hani Z. Girgis on Oct 7 2018 + int getK(){ + return k; + } + void setK(int k){ + this->k = k; + } + private: - uint64_t one_mers[4]; - std::string header; + vector one_mers; + std::string header; std::string data; + // Added by Hani Z. 
Girgis on Oct 7 2018 + // The k in k-mer used to build the table + int k; }; #endif diff --git a/src/clutil/Progress.cpp b/src/clutil/Progress.cpp new file mode 100644 index 0000000..138763b --- /dev/null +++ b/src/clutil/Progress.cpp @@ -0,0 +1,79 @@ +/* -*- C++ -*- + * + * Progress.cpp + * + * Author: Benjamin T James + * + * Progress bar that uses carriage return '\r' + * to seek to the beginning of a line to redraw + */ +#include "Progress.h" +#include +Progress::Progress(long num, std::string prefix_) +{ + pmax = num; + ended = 0; + pcur = 0; + old_prog = -1; + prefix = prefix_; + barWidth = 70 - (prefix.size()+1); + print(); +} + +void Progress::print() +{ + #ifndef NOPROG + double prog = (double)pcur / pmax; + if (old_prog != int(prog * 100)) { + std::cout << prefix << " ["; + int pos = barWidth * prog; + for (int i = 0; i < barWidth; i++) { + if (i < pos) { + std::cout << "="; + } else if (i == pos) { + std::cout << ">"; + } else { + std::cout << " "; + } + } + std::cout << "] " << int(prog * 100.0) << " %\r"; + std::cout.flush(); + } + old_prog = int(prog * 100); + #endif +} + +void Progress::end() +{ + if (!ended) { + pcur = pmax; + print(); + std::cout << std::endl; + } + ended = true; +} + + +void Progress::set(int num) +{ + pcur = num; + print(); +} + +void Progress::operator++() +{ + pcur++; + print(); +} +void Progress::operator++(int) +{ + print(); + pcur++; +} + + +void Progress::operator+=(size_t num) +{ + pcur += num; + print(); +} diff --git a/src/cluster/src/Progress.h b/src/clutil/Progress.h similarity index 75% rename from src/cluster/src/Progress.h rename to src/clutil/Progress.h index f59d948..fb7424b 100644 --- a/src/cluster/src/Progress.h +++ b/src/clutil/Progress.h @@ -3,6 +3,10 @@ * Progress.h * * Author: Benjamin T James + * + * Progress bar that uses carriage return '\r' + * to seek to the beginning of a line to redraw + * */ #include #ifndef PROGRESS_H @@ -16,14 +20,14 @@ class Progress { void operator++(); void operator++(int); void 
operator+=(size_t); + void set(int); private: void print(); long pmax; long pcur; + long old_prog; bool ended; std::string prefix; int barWidth; - - std::string last; }; #endif diff --git a/src/clutil/Random.h b/src/clutil/Random.h new file mode 100644 index 0000000..52e1274 --- /dev/null +++ b/src/clutil/Random.h @@ -0,0 +1,61 @@ +// -*- C++ -*- +/* + * Random.h + * + * Author: Benjamin T James + */ + +#ifndef RANDOM_H +#define RANDOM_H +#include +#include +class Random { +public: + Random(std::random_device::result_type seed=0xAA) : mt(seed) {} + + template + T randMod(T max) { + T res; +#pragma omp critical + { + if (max == 0) { + res = 0; + } else { + std::uniform_int_distribution distribution(0, max-1); + res = distribution(mt); + } + } + return res; + } + + double random() { + double res = 0; + #pragma omp critical + { + std::uniform_real_distribution distribution(0.0, 1.0); + res = distribution(mt); + } + return res; + } + double rand_between(double id, double range, double low, double high) { + double res = 0; + #pragma omp critical + { + double mn = std::max(id - range, low); + double mx = std::min(id + range, high); + std::uniform_real_distribution distribution(mn, mx); + + res = distribution(mt); + } + return res; + } + std::random_device::result_type nextRandSeed() { + using rt = std::random_device::result_type; + return randMod(std::numeric_limits::max()); + } + std::mt19937& gen() { return mt; } +private: + std::mt19937 mt; + +}; +#endif diff --git a/src/cluster/src/SingleFileLoader.cpp b/src/clutil/SingleFileLoader.cpp similarity index 67% rename from src/cluster/src/SingleFileLoader.cpp rename to src/clutil/SingleFileLoader.cpp index e62715f..9b61024 100644 --- a/src/cluster/src/SingleFileLoader.cpp +++ b/src/clutil/SingleFileLoader.cpp @@ -82,3 +82,42 @@ std::pair SingleFileLoader::next() // std::cout << "next(): " << diff / CLOCKS_PER_SEC << std::endl; return ret; } +ChromosomeOneDigitDna* SingleFileLoader::nextChrom() +{ + ChromosomeOneDigitDna* 
ret = NULL; + if (!in->good()) { + return ret; + } + if (is_first) { + safe_getline(*in, buffer); + is_first = false; + } + do { + if (buffer[0] == '>') { + if (ret != NULL) { + ret->finalize(); + return ret; + } + ret = new ChromosomeOneDigitDna(); + ret->setHeader(buffer); + } else if (buffer[0] == ' ' || buffer[0] == '\t') { + bool all_spaces = true; + for (auto c : buffer) { + if (c != ' ' && c != '\t') { + all_spaces = false; + } + } + if (!all_spaces) { + std::ostringstream oss; + oss << ret->getHeader() << buffer; + std::string new_header = oss.str(); + ret->setHeader(new_header); + } + } else { + ret->appendToSequence(buffer); + } + safe_getline(*in, buffer); + } while (in->good()); + ret->finalize(); + return ret; +} diff --git a/src/cluster/src/SingleFileLoader.h b/src/clutil/SingleFileLoader.h similarity index 85% rename from src/cluster/src/SingleFileLoader.h rename to src/clutil/SingleFileLoader.h index d6b3c5d..d944a30 100644 --- a/src/cluster/src/SingleFileLoader.h +++ b/src/clutil/SingleFileLoader.h @@ -11,7 +11,7 @@ #define SINGLEFILELOADER_H #include - +#include "../nonltr/ChromosomeOneDigitDna.h" class SingleFileLoader { public: SingleFileLoader(std::string file); @@ -21,6 +21,7 @@ class SingleFileLoader { } } std::pair next(); + ChromosomeOneDigitDna* nextChrom(); private: std::ifstream *in; std::string buffer; diff --git a/src/fastcar/FC_Runner.cpp b/src/fastcar/FC_Runner.cpp new file mode 100644 index 0000000..34bd459 --- /dev/null +++ b/src/fastcar/FC_Runner.cpp @@ -0,0 +1,635 @@ +/* -*- C++ -*- + * + * Runner.cpp + * + * Author: Benjamin T James + * + * Runner class that parses options and controls + * the process of the program. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../nonltr/ChromListMaker.h" +#include "../clutil/DivergencePoint.h" +#include "FC_Runner.h" +#include "../predict/Predictor.h" +#include "../clutil/Loader.h" +#include "../clutil/Progress.h" +#include "../clutil/Datatype.h" +#include + + +Runner::Runner(int argc, char **argv) +{ + get_opts(argc, argv); + srand(10); +} + +int parseLine(char* line) { + int i = strlen(line); + const char* p = line; + while (*p < '0' || *p > '9') p++; + line[i-3] = '\0'; + i = atoi(p); + return i; +} + +void mem_used(std::string prefix) +{ + struct sysinfo memInfo; + sysinfo(&memInfo); + FILE* file = fopen("/proc/self/status", "r"); + int result = -1; + char line[128]; + while (fgets(line, 128, file)) { + if (strncmp(line, "VmSize:", 7) == 0) { + result = parseLine(line); + break; + } + } + fclose(file); + cout << prefix << ": used memory: " << result << " KB" << endl; +} + +int Runner::run() +{ + if (pred64) { + k = pred64->get_k(); + } else if (k == -1) { + uintmax_t total_length = 0; + uintmax_t total_num_seq = 0; + largest_count = 0; + Progress progress(files.size(), "Reading in sequences"); + uintmax_t num_seq = 10000; + for (auto i = 0; i < files.size(); i++) { + auto f = files.at(i); + SingleFileLoader maker(f); + + progress++; + uint64_t local_largest_count = 0; + std::pair pr; + while ((pr = maker.next()).first != "" && total_num_seq++ < num_seq) { + total_length += pr.second->length(); + } + } + progress.end(); + double avg_length = (double)total_length / total_num_seq; + k = std::max((int)(ceil(log(avg_length) / log(4)) - 1), 2); + } + cout << "K: " << k << endl; +// #pragma omp parallel for reduction(max:largest_count) +// for (size_t i = 0; i < sequences.size(); i++) { +// std::vector values; +// KmerHashTable table(k, 1); +// ChromosomeOneDigitDna chrom; +// chrom.setSequence(*sequences[i].second); +// chrom.setHeader(sequences[i].first); +// chrom.finalize(); +// 
fill_table(table, &chrom, values); +// uint64_t l_count = 0; +// for (auto elt : values) { +// if (elt > l_count) { +// l_count = elt; +// } +// } +// if (l_count > largest_count) { +// largest_count = l_count; +// } +// values.clear(); +// } +// largest_count *= 2; + uint64_t cap = 10000; + std::vector sequences(cap); + if (pred64 == NULL || Datatype::get() == "") { + uint64_t idx = 0; + Progress progress(cap, "Reading in sequences"); + uint64_t largest_count = 0; + + + for (auto i = 0; i < files.size(); i++) { + auto f = files.at(i); + SingleFileLoader maker(f); + ChromosomeOneDigitDna* chrom = NULL; + while ((chrom = maker.nextChrom()) != NULL && idx < cap) { + + sequences[idx] = chrom; + idx++; + progress++; + } + } + sequences.resize(idx); + +#pragma omp parallel for reduction(max:largest_count) + for (int i = 0; i < sequences.size(); i++) { + auto chrom = sequences[i]; + std::vector values; + KmerHashTable table(k, 1); + Loader::fill_table(table, chrom, values); + uint64_t l_count = *std::max_element(std::begin(values), std::end(values)); + if (l_count > largest_count) { + largest_count = l_count; + } + } + progress.end(); + } else if (pred64 != NULL) { + sequences.clear(); + Datatype::set(pred64->get_datatype()); + similarity = pred64->get_id(); + } + if (Datatype::get() != "") { + std::string type = Datatype::get(); + if (type == "uint8_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint16_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint32_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint64_t") { + largest_count = std::numeric_limits::max(); + } + } + if (largest_count <= std::numeric_limits::max()) { + Datatype::set("uint8_t"); + cout << "Using 8 bit histograms" << endl; + return do_run(sequences); + } else if (largest_count <= std::numeric_limits::max()) { + Datatype::set("uint16_t"); + cout << "Using 16 bit histograms" << endl; + return do_run(sequences); + } else if 
(largest_count <= std::numeric_limits::max()){ + Datatype::set("uint32_t"); + cout << "Using 32 bit histograms" << endl; + return do_run(sequences); + } else if (largest_count <= std::numeric_limits::max()) { + Datatype::set("uint64_t"); + cout << "Using 64 bit histograms" << endl; + return do_run(sequences); + } else { + throw "Too big sequence"; + } +} + + +void Runner::usage(std::string progname) const +{ + int num_threads = omp_get_max_threads(); + std::cout << "Usage: " << progname << " *.fasta --query queryFile.fasta --id 0.90 [optional_arguments]" << std::endl << std::endl; + std::cout << "Options: " << std::endl; + std::cout << "\t" << "--id "<<"\t" <<"identityValue" << "\t\t" << "Use this alignment identity (0.0 to 1.0) for classification" << std::endl; + std::cout << "\t" << "-q|--query "<<"\t" <<"queryFile.fasta" << "\t\t" << "Run the database against this query file" << std::endl; + std::cout << "\t" << "-k|--kmer "<<"\t" << "N"<<"\t\t\t" << "Usually calculated by going through the data and finding the ceil(log_4(Length_avg))-1,"<< std::endl; + std::cout << "\t\t\t\t\t\t " << "so if provided, it can save computational time. Increasing the k-mer increases memory usage four-fold."<< std::endl; + std::cout << "\t" << "--datatype "<<"\t" <<"uintX_t" << "\t\t\t" << "If provided, instead of running through the data another time," << std::endl; + std::cout << "\t\t\t\t\t\t " << "provide the maximum data type to not overflow, one of {uint8_t, uint16_t, uint32_t, uint64_t}" << std::endl; + std::cout << "\t" << "-c|--chunk "<<"\t" << chunk_size << "\t\t\t" << "Process N (a positive integer number) sequences at once in the multithreading model." 
<< std::endl; + std::cout << "\t" << "--dump "<<"\t" <<"weights.txt" << "\t\t" << "Instead of running, only train the model(s) and dump the weights" << std::endl; + std::cout << "\t" << "--no-format "<<"\t\t\t\t" << "Print the full header instead of the abbreviated header when printing output" << std::endl; + + + std::cout << "\t" << "-o|--output "<<"\t" <<"output.search" << "\t\t" << "Output file, to which numbers 0 through [num_threads] are appended. Each file contains data computed by each thread." << std::endl; + std::cout << "\t" << "-r|--recover"<<"\t" <<"weights.txt" << "\t\t" << "Instead of training, use a pre-computed weights file to avoid re-training" << std::endl; + std::cout << "\t" << "-f|--feat "<<"\t" <<"fast" << "\t\t\t"<<"Use a small,fast set of possible features (fast) or a larger, slower-to-train set of possible features (slow)"<= 1) { + throw std::invalid_argument(""); + } + } catch(std::exception e) { + cerr << "Similarity must be between 0 and 1" << endl; + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "-c" || arg == "--chunk") && i + 1 < argc) { + chunk_size = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (chunk_size <= 0) { + fprintf(stderr, "Chunk size must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "--dump") && i + 1 < argc) { + dump_str = argv[++i]; + dump = true; + } else if (arg == "--noformat" || arg == "--no-format") { + format = false; + } else if ((arg == "--datatype") && i + 1 < argc) { + std::string val = argv[++i]; + if (val == "uint8_t" || val == "8" || val == "uint8") { + Datatype::set("uint8_t"); + } else if (val == "uint16_t" || val == "16" || val == "uint16") { + Datatype::set("uint16_t"); + } else if (val == "uint32_t" || val == "32" || val == "uint32") { + Datatype::set("uint32_t"); + } else if (val == "uint64_t" || val == "64" || val == "uint64") { + Datatype::set("uint64_t"); + } else { + cerr << "Histogram data type must have a 
valid data type or size: one of 8, 16, 32, 64" << endl; + exit(EXIT_FAILURE); + } + } else if ((arg == "-k" || arg == "--kmer") && i + 1 < argc) { + k = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (k <= 0) { + fprintf(stderr, "K must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + align = false; + i++; + } else if ((arg == "-o" || arg == "--output") && i + 1 < argc) { + output = string(argv[i+1]); + i++; + } else if ((arg == "-q" || arg == "--query") && i + 1 < argc) { + char* qfile = argv[++i]; + struct stat st; + stat(qfile, &st); + if (S_ISREG(st.st_mode)) { + qfiles.emplace_back(qfile); + } else { + usage(*argv); + exit(EXIT_FAILURE); + } + } else if ((arg == "-r" || arg == "--recover") && i + 1 < argc) { + recover = true; + dump_str = argv[++i]; + pred64 = new Predictor(dump_str); + similarity = pred64->get_id(); + k = pred64->get_k(); + } else if ((arg == "-f" || arg == "--feat") && i + 1 < argc) { + std::string val = argv[++i]; + if (val == "fast") { + feats = PRED_FEAT_FAST; + } else if (val == "slow") { + feats = PRED_FEAT_FAST | PRED_FEAT_DIV; + } else { + cerr << "Features must be either \"fast\" or \"slow\"" << endl; + } + } else if ((arg == "-m" || arg == "--mode") && i + 1 < argc) { + std::string val = argv[++i]; + if (val == "c") { + mode |= PRED_MODE_CLASS; + } else if (val == "r") { + mode |= PRED_MODE_REGR; + } else if (val == "cr" || val == "rc") { + mode |= PRED_MODE_CLASS | PRED_MODE_REGR; + } else { + cerr << "Mode must be either c, r, or a combination" << endl; + exit(EXIT_FAILURE); + } + } else if ((arg == "-s" || arg == "--sample") && i + 1 < argc) { + sample_size = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (sample_size <= 0) { + fprintf(stderr, "Sample size must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "--mut-type") && i + 1 < argc) { + std::string opt = argv[i+1]; + if (opt == "all") { + 
mut_type = HandleSeq::BOTH | HandleSeq::ATYPICAL; + } else if (opt == "both") { + mut_type = HandleSeq::BOTH; + } else if (opt == "snp" || opt == "single") { + mut_type = HandleSeq::SINGLE; + } else if (opt == "nonsingle-typical") { + mut_type = HandleSeq::NON_SINGLE; + } else if (opt == "nonsingle-all") { + mut_type = HandleSeq::NON_SINGLE | HandleSeq::ATYPICAL; + } else if (opt == "all-but-reversion") { + mut_type = HandleSeq::BOTH | HandleSeq::TRANSLOCATION; + } else if (opt == "all-but-translocation") { + mut_type = HandleSeq::BOTH | HandleSeq::REVERSION; + } else { + cerr << "Options for mutation type are \"single\", \"nonsingle-typical\", \"both\" (for single and nonsingle-typical), \"nonsingle-all\", and \"all\" (single, nonsingle, and atypical nonsingle)." << endl; + exit(1); + } + i++; + } else if ((arg == "-t" || arg == "--threads") && i + 1 < argc) { + try { + std::string opt = argv[i+1]; + int threads = std::stoi(opt); + if (threads <= 0) { + throw std::invalid_argument(""); + } + #ifdef _OPENMP + omp_set_num_threads(threads); + #endif + } catch (std::exception e) { + cerr << "Number of threads must be greater than 0." 
<< endl; + exit(1); + } + + i++; + + } else if ((arg == "-h") || (arg == "--help")) { + usage(*argv); + exit(EXIT_FAILURE); + } else { + struct stat st; + if (stat(argv[i], &st) == 0 && S_ISREG(st.st_mode)) { + files.push_back(argv[i]); + } else { + usage(*argv); + exit(EXIT_FAILURE); + } + } + } + if (files.empty()) { + usage(*argv); + exit(EXIT_FAILURE); + } +} + + +double global_mat[4][4] = {{1, -1, -1, -1}, + {-1, 1, -1, -1}, + {-1, -1, 1, -1}, + {-1, -1, -1, 1}}; +double global_sigma = -2; +double global_epsilon = -1; + +template +long bin_search(const std::vector*> &points, size_t begin, size_t last, size_t length) +{ + if (last < begin) { + return 0; + } + size_t idx = begin + (last - begin) / 2; + if (points.at(idx)->get_length() == length) { + while (idx > 0 && points[idx-1]->get_length() == length) { + idx--; + } + return idx; + } else if (points.at(idx)->get_length() > length) { + if (begin == idx) { return idx; } + return bin_search(points, begin, idx-1, length); + } else { + return bin_search(points, idx+1, last, length); + } +} + +std::string format_header(std::string hdr) +{ + long len = hdr.length(); + long b_idx = 0; + if (hdr[0] == '>') { + b_idx++; + } + for (long i = b_idx; i < len; i++) { + if (hdr[i] == ' ' || hdr[i] == '\t') { + len = i + 1; + break; + } + } + return hdr.substr(b_idx, len - b_idx); +} + +template +void work(const std::vector*> &queries, const std::vector*> &pts, double similarity, Predictor* pred, std::string delim, std::ofstream &out, uintmax_t &num_pred_pos, bool format) +{ + if (pts.empty()) { + return; + } + uint8_t mode = pred->get_mode(); + for (auto query : queries) { + size_t q_len = query->get_length(); + size_t begin_length = q_len * similarity; + size_t end_length = q_len / similarity; + size_t start = bin_search(pts, 0, pts.size()-1, + begin_length); + + for (size_t i = start; + i < pts.size() && pts[i]->get_length() <= end_length; + i++) { + double sim = 0.0; + bool cls = true; + + if (mode & PRED_MODE_CLASS) { + 
cls = pred->close(pts[i], query); + + } + if (!cls) { + continue; + } + num_pred_pos++; + if (mode & PRED_MODE_REGR) { + sim = pred->similarity(pts[i], query); + } else { + sim = 1; + } + if (mode & PRED_MODE_CLASS) { +// sim = (sim > similarity) ? sim : 0; + } + if (sim > 0) { + if (format) { + out << format_header(query->get_header()) << delim << format_header(pts[i]->get_header()) << delim << 100 * sim << endl; + } else { + out << query->get_header() << delim << pts[i]->get_header() << delim << 100 * sim << endl; + } + } + } + } +} + +template +int Runner::do_run(std::vector &seqs) +{ + using DNA=ChromosomeOneDigit; + using pvec = vector *>; + using pmap = map*, pvec*>; + srand(0xFF); + mem_used("before do_run"); + size_t num_points = 0; + uintmax_t _id = 0; + + + + + // Sorting all sequences based on length + std::sort(seqs.begin(), seqs.end(), [](DNA* a, DNA* b) { + return a->getBase()->length() < b->getBase()->length(); + }); + cout << "sample_size: " << sample_size << endl; + double increment = std::max(1.0, (double)seqs.size() / sample_size); + for (double i = 0; round(i) < seqs.size(); i += increment) { + indices.push_back(round(i)); + // cout << "index: " << round(i) << " length: " << seqs.at(round(i)).second->length() << endl; + } + std::vector*> trpoints(indices.size()); + #pragma omp parallel for + for (size_t i = 0; i < indices.size(); i++) { + auto chrom = seqs.at(indices.at(i)); + Point* p = Loader::get_point(chrom, _id, k); + trpoints[i] = p; + } + for (auto p : seqs) { + delete p; + } + seqs.clear(); + + indices.clear(); + mem_used("after selection"); + cout << "TRpoints.size(): " << trpoints.size() << endl; + + // std::sort(trpoints.begin(), trpoints.end(), [](const Point* a, const Point* b) { + // return a->get_length() < b->get_length(); }); + + int n_threads = omp_get_max_threads(); + Predictor *pred = NULL; + if (recover) { + pred = new Predictor(dump_str); + + } else { + if (mode == 0) { + cout << "No mode specified, using regression and 
classification by default" << endl; + mode = PRED_MODE_REGR | PRED_MODE_CLASS; + } + if (feats == 0) { + cout << "No feature set specified, using fast features by default" << endl; + feats = PRED_FEAT_FAST; + } + if ((mode & PRED_MODE_CLASS) == PRED_MODE_CLASS && similarity < 0) { + cout << "Classification specified, but no identity score given. Please supply a cutoff with \"--id\"" << endl; + exit(EXIT_FAILURE); + } else if (similarity < 0) { + similarity = 0.9; + } + + pred = new Predictor(k, similarity, mode, feats, mut_type, 4); + auto before = clock(); + mem_used("before predictor training"); + pred->train(trpoints, _id, 10, sample_size); + + double elapsed = (clock() - before); + elapsed /= CLOCKS_PER_SEC; + cout << "Training time: " << elapsed << endl; + for (auto p : trpoints) { + delete p; + } + trpoints.clear(); + if (dump) { + pred->save(dump_str, Datatype::get()); + exit(0); + } + } + mem_used("after predictor training"); + + std::vector output_list; + for (int i = 0; i < n_threads; i++) { + std::ostringstream oss; + oss << output << i; + output_list.emplace_back(oss.str()); + } + + + string delim = "\t"; + if (!format) { + delim = "!"; + } + uint64_t query_id_start = num_points; + int num_query = num_points; + Loader qloader(qfiles, n_threads * num_points, chunk_size, 1, k, query_id_start); + mem_used("before loop"); + uintmax_t num_pred_pos = 0; + while (!qloader.done()) { + qloader.preload(0); + auto queries = qloader.load_next(0); + Loader loader(files, 0, chunk_size, n_threads, k); + + + while (!loader.done()) { + int n_iter = n_threads; + mem_used("during inner loop"); + for (int h = 0; h < n_iter; h++) { + loader.preload(h); + } + #pragma omp parallel for + for (int h = 0; h < n_iter; h++) { + int tid = omp_get_thread_num(); + auto pts = loader.load_next(tid); + std::sort(std::begin(pts), std::end(pts), [](Point*a, Point*b) { + return a->get_length() < b->get_length(); + }); + work(queries, pts, similarity, pred, delim, output_list[tid], 
num_pred_pos, format); + for (auto p : pts) { + delete p; + } + } + } + + for (auto q : queries) { + delete q; + } + mem_used("mid loop"); + } + mem_used("after loop"); + cout << "# of predicted positive: " << num_pred_pos << endl; + std::string warn = Loader::get_warning(); + if (warn != "") { + cout << warn << endl; + } + return 0; +} + + +template +void Runner::print_output(const map*, vector*>*> &partition) const +{ + cout << "Printing output" << endl; + std::ofstream ofs; + ofs.open(output, std::ofstream::out); + int counter = 0; + for (auto const& kv : partition) { + if (kv.second->size() == 0) { + continue; + } + ofs << ">Cluster " << counter << endl; + int pt = 0; + for (auto p : *kv.second) { + string s = p->get_header(); + ofs << pt << "\t" << p->get_length() << "nt, " << s << "... " << endl; + pt++; + } + counter++; + } + ofs.close(); +} diff --git a/src/fastcar/FC_Runner.h b/src/fastcar/FC_Runner.h new file mode 100644 index 0000000..54b851a --- /dev/null +++ b/src/fastcar/FC_Runner.h @@ -0,0 +1,53 @@ +/* -*- C++ -*- + * + * Runner.h + * + * Author: Benjamin T James + * + * Runner class, sets default params + * and runs program + */ +#ifndef FC_RUNNER_H +#define FC_RUNNER_H + +#include +#include +#include +#include "../clutil/Point.h" +#include "../predict/Predictor.h" +#include "../predict/HandleSeq.h" +#include "../nonltr/ChromosomeOneDigitDna.h" +using namespace std; + +class Runner { +public: + Runner(int argc, char** argv); + ~Runner() { indices.clear(); files.clear(); qfiles.clear(); if (pred64) {delete pred64;}}; + int run(); +private: + void usage(std::string progname) const; + template int do_run(std::vector &sequences); + template void print_output(const map*, vector*>*> &m) const; + int k = -1; + int bandwidth; + double similarity = -1; + long largest_count = 0; + bool align = false; + bool recover = false; + int sample_size = 300; + int mut_type = HandleSeq::SINGLE; + uint8_t mode = 0; + uint64_t feats = 0; + uint64_t chunk_size = 10000; + 
std::vector files, qfiles; + std::vector indices; + bool dump = false; + bool format = true; + string output = "output.search"; + string dump_str = "weights.txt"; + void get_opts(int argc, char** argv); + Predictor *pred64 = NULL; + + +}; +#endif diff --git a/src/fastcar/fastcar.cpp b/src/fastcar/fastcar.cpp new file mode 100644 index 0000000..c4f81fa --- /dev/null +++ b/src/fastcar/fastcar.cpp @@ -0,0 +1,12 @@ +/* -*- C++ -*- + * + * main.cpp + * + * Author: Benjamin T James + */ +#include "FC_Runner.h" +int main(int argc, char **argv) +{ + Runner runner(argc, argv); + return runner.run(); +} diff --git a/src/nonltr/ChromListMaker.cpp b/src/nonltr/ChromListMaker.cpp index e684c3a..5857c07 100644 --- a/src/nonltr/ChromListMaker.cpp +++ b/src/nonltr/ChromListMaker.cpp @@ -9,8 +9,9 @@ namespace nonltr { -ChromListMaker::ChromListMaker(string seqFileIn) { +ChromListMaker::ChromListMaker(string seqFileIn, bool is_oneseq_) { seqFile = seqFileIn; + is_oneseq = is_oneseq_; chromList = new vector(); } @@ -50,36 +51,110 @@ const vector * ChromListMaker::makeChromList() { ifstream in(seqFile.c_str()); bool isFirst = true; Chromosome * chrom; - + vector size_list = getSize(); + uint64_t cur_seq = 0; + if (is_oneseq) { + uint64_t sum = 0; + for (uint64_t len : size_list) { + sum += len + 50; + } + size_list.clear(); + size_list.push_back(sum); + } while (in.good()) { string line; safe_getline(in, line); if (line[0] == '>') { if (!isFirst) { - chrom->finalize(); - chromList->push_back(chrom); + if (is_oneseq) { + std::string interseq(50, 'N'); + // chrom->insert(interseq); + chrom->appendToSequence(interseq); + } else { + chrom->finalize(); + chromList->push_back(chrom); + chrom = new Chromosome(size_list.at(cur_seq++)); + chrom->setHeader(line); + } } else { isFirst = false; + chrom = new Chromosome(size_list.at(cur_seq++)); + chrom->setHeader(line); } + } else if (line[0] == ' ' || line[0] == '\t') { + } else { + // chrom->insert(line); + chrom->appendToSequence(line); + } + 
} + chrom->finalize(); + chromList->push_back(chrom); + in.close(); - chrom = new Chromosome(); - chrom->setHeader(line); + return chromList; +} + +const vector ChromListMaker::getSize() { + ifstream in(seqFile.c_str()); + vector size_list; + uint64_t current_size = 0; + while (in.good()) { + string line; + safe_getline(in, line); + if (line[0] == '>') { + if (current_size > 0) { + size_list.push_back(current_size); + } + current_size = 0; } else if (line[0] == ' ' || line[0] == '\t') { - bool all_spaces = true; - for (auto c : line) { - if (c != ' ' && c != '\t') { - all_spaces = false; + } else { + current_size += line.length(); + } + } + size_list.push_back(current_size); + return size_list; +} +const vector * ChromListMaker::makeChromOneDigitDnaList() { + ifstream in(seqFile.c_str()); + bool isFirst = true; + ChromosomeOneDigitDna * chrom; + vector size_list = getSize(); + uint64_t cur_seq = 0; + if (is_oneseq) { + uint64_t sum = 0; + for (uint64_t len : size_list) { + sum += len + 50; + } + if (sum > 0) { + sum -= 50; + } + size_list.clear(); + size_list.push_back(sum); + } + while (in.good()) { + string line; + safe_getline(in, line); + if (line[0] == '>') { + if (!isFirst) { + if (is_oneseq) { + std::string interseq(50, 'N'); + chrom->insert(interseq); + } else { + chrom->finalize(); + chromList->push_back(chrom); + chrom = new ChromosomeOneDigitDna(size_list.at(cur_seq++)); + chrom->setHeader(line); } + } else { + isFirst = false; + chrom = new ChromosomeOneDigitDna(size_list.at(cur_seq++)); + chrom->setHeader(line); + } - if (all_spaces) { - continue; - } - std::ostringstream oss; - oss << chrom->getHeader() << line; - std::string new_header = oss.str(); - chrom->setHeader(new_header); + } else if (line[0] == ' ' || line[0] == '\t') { } else { - chrom->appendToSequence(line); + chrom->insert(line); +// chrom->appendToSequence(line); } } chrom->finalize(); @@ -89,10 +164,10 @@ const vector * ChromListMaker::makeChromList() { return chromList; } -const 
vector * ChromListMaker::makeChromOneDigitList() { +const vector * ChromListMaker::makeChromOneDigitProteinList() { ifstream in(seqFile.c_str()); bool isFirst = true; - ChromosomeOneDigit * chrom; + ChromosomeOneDigitProtein * chrom; while (in.good()) { string line; @@ -105,7 +180,7 @@ const vector * ChromListMaker::makeChromOneDigitList() { isFirst = false; } - chrom = new ChromosomeOneDigit(); + chrom = new ChromosomeOneDigitProtein(); chrom->setHeader(line); } else { chrom->appendToSequence(line); diff --git a/src/nonltr/ChromListMaker.h b/src/nonltr/ChromListMaker.h index a60fe2f..1a9d771 100644 --- a/src/nonltr/ChromListMaker.h +++ b/src/nonltr/ChromListMaker.h @@ -1,8 +1,9 @@ /* * ChromListMaker.h * - * Created on: Mar 13, 2014 - * Author: Hani Zakaria Girgis, PhD + * Created on: Mar 13, 2014 + * Modified on: Oct 2, 2018 + * Author: Hani Zakaria Girgis, PhD */ #ifndef CHROMLISTMAKER_H_ @@ -12,7 +13,8 @@ #include #include "Chromosome.h" -#include "ChromosomeOneDigit.h" +#include "ChromosomeOneDigitDna.h" +#include "ChromosomeOneDigitProtein.h" #include "../utility/Util.h" @@ -25,12 +27,14 @@ class ChromListMaker { private: vector * chromList; string seqFile; - + bool is_oneseq; public: - ChromListMaker(string); + ChromListMaker(string, bool is_oneseq_=false); virtual ~ChromListMaker(); + const vector getSize(); const vector * makeChromList(); - const vector * makeChromOneDigitList(); + const vector * makeChromOneDigitDnaList(); + const vector * makeChromOneDigitProteinList(); }; diff --git a/src/nonltr/Chromosome.cpp b/src/nonltr/Chromosome.cpp index 2bea802..7a2f53a 100644 --- a/src/nonltr/Chromosome.cpp +++ b/src/nonltr/Chromosome.cpp @@ -5,6 +5,7 @@ * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH */ #include "Chromosome.h" +#include Chromosome::Chromosome() { header = string(""); @@ -14,6 +15,14 @@ Chromosome::Chromosome() { isFinalized = false; } +Chromosome::Chromosome(uint64_t size) { + header = string(""); + base = string(size, ' '); + str_len = 0; + 
isHeaderReady = false; + isBaseReady = false; + isFinalized = false; +} Chromosome::Chromosome(string fileName) { chromFile = fileName; readFasta(); @@ -32,6 +41,12 @@ Chromosome::Chromosome(string fileName, int len) { help(len, true); } +Chromosome::Chromosome(string fileName, int len, int maxLength) { + chromFile = fileName; + readFasta(maxLength); + help(len, true); +} + Chromosome::Chromosome(string &seq, string &info) { header = info; base = seq; @@ -81,6 +96,22 @@ void Chromosome::appendToSequence(const string& line) { } } +void Chromosome::insert(const string& line) { + if (isFinalized) { + string msg("This chromosome has been finalized. "); + msg.append("The sequence cannot be modified."); + throw InvalidOperationException(msg); + } else { + + memcpy((char*)base.c_str() + str_len, + line.c_str(), + line.length()); + str_len += line.length(); + isBaseReady = true; + } +} + + void Chromosome::finalize() { if (isFinalized) { string msg("This chromosome has been already finalized. "); @@ -97,26 +128,52 @@ void Chromosome::finalize() { } void Chromosome::help(int len, bool canMerge) { + canClean = true; + effectiveSize = 0; segLength = len; segment = new vector *>(); -// segment->reserve(100); + //segment->reserve(100); toUpperCase(); - removeN(); - if (canMerge) { + + if(Util::isDna){ + baseCount = new vector(4, 0); + makeBaseCount(); + } + + removeAmbiguous(); + + if (Util::isDna && (canMerge && base.size() > 20)) { mergeSegments(); } + makeSegmentList(); calculateEffectiveSize(); + } Chromosome::~Chromosome() { base.clear(); - Util::deleteInVector(segment); - segment->clear(); - delete segment; + //cerr << "~Chromosome() 1" << endl; + + if (canClean) { + while (!segment->empty()) { + segment->back()->clear(); + delete segment->back(); + segment->pop_back(); + } + segment->clear(); + + // Util::deleteInVector(segment); + delete segment; + if(Util::isDna){ + baseCount->clear(); + delete baseCount; + } + } + //cerr << "~Chromosome() 2" << endl; } void 
Chromosome::readFasta() { @@ -125,6 +182,14 @@ void Chromosome::readFasta() { base = string(""); ifstream in(chromFile.c_str()); + if (in.fail()) { + string msg("Cannot open "); + msg.append(chromFile); + msg.append(". System code is: "); + msg.append(Util::int2string(errno)); + throw InvalidInputException(msg); + } + while (in.good()) { string line; getline(in, line); @@ -147,6 +212,42 @@ void Chromosome::readFasta() { in.close(); } +void Chromosome::readFasta(int maxLength) { + bool isFirst = true; + header = string(""); + base = string(""); + + ifstream in(chromFile.c_str()); + if (in.fail()) { + string msg("Cannot open "); + msg.append(chromFile); + msg.append(". System code is: "); + msg.append(Util::int2string(errno)); + throw InvalidInputException(msg); + } + + while (in.good() && base.size() < maxLength) { + string line; + getline(in, line); + if (line[0] == '>') { + if (!isFirst) { + string msg = "Chromosome file: "; + msg = msg + chromFile; + msg = + msg + + " must have one sequence only. But it has more than one."; + throw InvalidInputException(msg); + } else { + header = line; + isFirst = false; + } + } else { + base.append(line); + } + } + in.close(); +} + /** * Convert alphabet to upper case if it has not been done before **/ @@ -159,20 +260,21 @@ void Chromosome::toUpperCase() { /** * Segment coordinates are inclusive [s,e] **/ -void Chromosome::removeN() { +void Chromosome::removeAmbiguous() { // Store non-N index int start = -1; + char uncertainChar = Util::isDna? 
'N' : 'X'; for (int i = 0; i < base.size(); i++) { - if (base[i] != 'N' && start == -1) { + if (base[i] != uncertainChar && start == -1) { start = i; - } else if (base[i] == 'N' && start != -1) { + } else if (base[i] == uncertainChar && start != -1) { vector * v = new vector(); v->push_back(start); v->push_back(i - 1); segment->push_back(v); start = -1; - } else if (i == base.size() - 1 && base[i] != 'N' && start != -1) { + } else if (i == base.size() - 1 && base[i] != uncertainChar && start != -1) { vector * v = new vector(); v->push_back(start); v->push_back(i); @@ -181,48 +283,73 @@ void Chromosome::removeN() { start = -1; } } + + // Test code + // for(auto seg : *segment){ + // cerr << seg->at(0) << "-" << seg->at(1) << endl; + // } } /** + * Applied to DNA only--not proteins. * If the gap between two consecutive segments is less than 10 bp. * Segments that are shorter than 20 bp are not added. */ void Chromosome::mergeSegments() { - vector *> * mSegment = new vector *>(); - int s = segment->at(0)->at(0); - int e = segment->at(0)->at(1); + // cout << "Segment size is " << segment->size() << endl; + // cout << base << endl; - for (int i = 1; i < segment->size(); i++) { - int s1 = segment->at(i)->at(0); - int e1 = segment->at(i)->at(1); + if (segment->size() > 0) { + vector *> * mSegment = new vector *>(); + int s = segment->at(0)->at(0); + int e = segment->at(0)->at(1); - if (s1 - e < 10) { - e = e1; - } else { - if (e - s + 1 >= 20) { - vector * seg = new vector(); - seg->push_back(s); - seg->push_back(e); - mSegment->push_back(seg); + for (int i = 1; i < segment->size(); i++) { + int s1 = segment->at(i)->at(0); + int e1 = segment->at(i)->at(1); + + /* + if(e1 - s1 + 1 <= 2000){ + cout << "s1:" << s1 << " e1: " << e1 << endl; + } + */ + + if (s1 - e < 10) { + e = e1; + } else { + if (e - s + 1 >= 20) { + vector * seg = new vector(); + seg->push_back(s); + seg->push_back(e); + mSegment->push_back(seg); + } + + // Test start + /* + if (e - s + 1 <= 100) { + cout 
<< "Removing: " << base.substr(s, e - s + 1) << endl; + } + */ + // Test end + s = s1; + e = e1; } + } - s = s1; - e = e1; + // Handle the last index + if (e - s + 1 >= 20) { + vector * seg = new vector(); + seg->push_back(s); + seg->push_back(e); + mSegment->push_back(seg); } - } - // Handle the last index - if (e - s + 1 >= 20) { - vector * seg = new vector(); - seg->push_back(s); - seg->push_back(e); - mSegment->push_back(seg); + Util::deleteInVector(segment); + segment->clear(); + delete segment; + segment = mSegment; } - - Util::deleteInVector(segment); - segment->clear(); - segment = mSegment; } void Chromosome::makeSegmentList() { @@ -261,14 +388,22 @@ const string* Chromosome::getBase() { return &base; } +string& Chromosome::getBaseRef() { + return base; +} + +string& Chromosome::getHeaderRef() { + return header; +} + const vector *> * Chromosome::getSegment() { return segment; } -void Chromosome::printSegmentList(){ +void Chromosome::printSegmentList() { int l = segment->size(); cout << "Segment list size = " << l << endl; - for(int i = 0; i < l; i++){ + for (int i = 0; i < l; i++) { cout << segment->at(i)->at(0) << "\t"; cout << segment->at(i)->at(1) << endl; } @@ -296,6 +431,11 @@ int Chromosome::getEffectiveSize() { } int Chromosome::getGcContent() { + if(!Util::isDna){ + cerr << "Calculating GC content on a protein sequence is not allowed." << endl; + throw std::exception(); + } + int gc = 0; int size = base.size(); for (int i = 0; i < size; i++) { @@ -306,3 +446,37 @@ int Chromosome::getGcContent() { } return gc; } + +void Chromosome::makeBaseCount() { + if(!Util::isDna){ + cerr << "Counting nucleotides in a protein sequence is not allowed." 
<< endl; + throw std::exception(); + } + + int size = base.size(); + for (int i = 0; i < size; i++) { + switch (base.at(i)) { + case 'A': + baseCount->at(0)++; + break; +; case 'C': + baseCount->at(1)++; + break; + case 'G': + baseCount->at(2)++; + break; + case 'T': + baseCount->at(3)++; + break; + } + } +} + +vector * Chromosome::getBaseCount() { + if(!Util::isDna){ + cerr << "Counting nucleotides in a protein sequence is not allowed." << endl; + throw std::exception(); + } + + return baseCount; +} diff --git a/src/nonltr/Chromosome.h b/src/nonltr/Chromosome.h index 0632458..adb42c2 100644 --- a/src/nonltr/Chromosome.h +++ b/src/nonltr/Chromosome.h @@ -27,9 +27,11 @@ namespace nonltr { class Chromosome: public IChromosome { public: Chromosome(); + Chromosome(uint64_t); Chromosome(string); Chromosome(string, bool); Chromosome(string, int); + Chromosome(string, int, int); Chromosome(string &, string&); Chromosome(string &, string&, int); @@ -37,6 +39,9 @@ class Chromosome: public IChromosome { virtual ~Chromosome(); + virtual string& getBaseRef(); + virtual string& getHeaderRef(); + virtual const string* getBase(); virtual const vector *> * getSegment(); virtual void printSegmentList(); @@ -47,19 +52,23 @@ class Chromosome: public IChromosome { virtual void setSequence(string&); virtual void appendToSequence(const string&); virtual void finalize(); - + virtual vector * getBaseCount(); + virtual void insert(const string&); protected: string chromFile; string header; string base; + int str_len; + int effectiveSize; int segLength; vector *> * segment; void readFasta(); + void readFasta(int); void toUpperCase(); - void removeN(); + void removeAmbiguous(); void mergeSegments(); virtual void help(int, bool); void makeSegmentList(); @@ -69,9 +78,11 @@ class Chromosome: public IChromosome { bool isHeaderReady; bool isBaseReady; bool isFinalized; + bool canClean = false; void reverseSegments(); - + void makeBaseCount(); + vector * baseCount; }; } diff --git 
a/src/nonltr/ChromosomeOneDigit.cpp b/src/nonltr/ChromosomeOneDigit.cpp index 9af2c51..2783d7a 100644 --- a/src/nonltr/ChromosomeOneDigit.cpp +++ b/src/nonltr/ChromosomeOneDigit.cpp @@ -3,25 +3,10 @@ * * Created on: Jul 31, 2012 * Author: Hani Zakaria Girgis, PhD at the NCB1/NLM/NIH - * A A - * T T - * G G - * C C - * R G or A - * Y T or C - * M A or C - * K G or T - * S G or C - * W A or T - * H A or C or T - * B G or T or C - * V G or C or A - * D G or T or A - * N G or T or A or C */ #include #include - +#include #include "Chromosome.h" #include "ChromosomeOneDigit.h" #include "../exception/InvalidInputException.h" @@ -32,6 +17,12 @@ namespace nonltr { ChromosomeOneDigit::ChromosomeOneDigit() : Chromosome() { + //cout << "The no args constructor is called" << endl; +} + +ChromosomeOneDigit::ChromosomeOneDigit(uint64_t s) : + Chromosome(s) { + //cout << "The no args constructor is called" << endl; } ChromosomeOneDigit::ChromosomeOneDigit(string fileName) : @@ -39,16 +30,22 @@ ChromosomeOneDigit::ChromosomeOneDigit(string fileName) : help(); } -ChromosomeOneDigit::ChromosomeOneDigit(string seq, string info) : +ChromosomeOneDigit::ChromosomeOneDigit(string fileName, int segmentLength, + int maxLength) : + Chromosome(fileName, segmentLength, maxLength) { + help(); +} + +ChromosomeOneDigit::ChromosomeOneDigit(string& seq, string& info) : Chromosome(seq, info) { + //cout << "Two string constructor is called" << endl; help(); } -void ChromosomeOneDigit::help() { - // Build codes - buildCodes(); - // Modify the sequence in the super class - encodeNucleotides(); +ChromosomeOneDigit::ChromosomeOneDigit(string& seq, string& info, int length) : + Chromosome(seq, info, length) { + //cout << "Two string constructor is called" << endl; + help(); } void ChromosomeOneDigit::finalize() { @@ -56,190 +53,82 @@ void ChromosomeOneDigit::finalize() { help(); } -void ChromosomeOneDigit::buildCodes() { +void ChromosomeOneDigit::help() { + // Can delete the codes + canClean = true; + // 
Make map codes = new map(); - // Certain nucleotides - codes->insert(map::value_type('A', (char) 0)); - codes->insert(map::value_type('C', (char) 1)); - codes->insert(map::value_type('G', (char) 2)); - codes->insert(map::value_type('T', (char) 3)); - - // Common uncertain nucleotide - // codes->insert(map::value_type('N', (char) 4)); - - // Uncertain nucleotides - codes->insert(map::value_type('R', codes->at('G'))); - codes->insert(map::value_type('Y', codes->at('C'))); - codes->insert(map::value_type('M', codes->at('A'))); - codes->insert(map::value_type('K', codes->at('T'))); - codes->insert(map::value_type('S', codes->at('G'))); - codes->insert(map::value_type('W', codes->at('T'))); - codes->insert(map::value_type('H', codes->at('C'))); - codes->insert(map::value_type('B', codes->at('T'))); - codes->insert(map::value_type('V', codes->at('A'))); - codes->insert(map::value_type('D', codes->at('T'))); - codes->insert(map::value_type('N', codes->at('C'))); - codes->insert(map::value_type('X', codes->at('G'))); + // Build codes + buildCodes(); + // Modify the sequence in the super class + encode(); } ChromosomeOneDigit::~ChromosomeOneDigit() { - codes->clear(); - delete codes; -} - -/** - * This method converts nucleotides in the segments to single digit codes - */ -void ChromosomeOneDigit::encodeNucleotides() { - - for (int s = 0; s < segment->size(); s++) { - int segStart = segment->at(s)->at(0); - int segEnd = segment->at(s)->at(1); - for (int i = segStart; i <= segEnd; i++) { - if (codes->count(base[i]) > 0) { - base[i] = codes->at(base[i]); - } else { - string msg = "Invalid nucleotide: "; - msg.append(1, base[i]); - throw InvalidInputException(msg); - } - } - } - - // Digitize skipped segments - int segNum = segment->size(); - if(segNum > 0){ - // The first interval - before the first segment - int segStart = 0; - int segEnd = segment->at(0)->at(0)-1; - - for (int s = 0; s <= segNum; s++) { - for (int i = segStart; i <= segEnd; i++) { - char c = base[i]; - if(c 
!= 'N'){ - if (codes->count(c) > 0) { - base[i] = codes->at(c); - } else { - string msg = "Invalid nucleotide: "; - msg.append(1, c); - throw InvalidInputException(msg); - } + if (canClean) { + codes->clear(); + delete codes; } - } - - // The regular intervals between two segments - if(s < segNum-1){ - segStart = segment->at(s)->at(1)+1; - segEnd = segment->at(s+1)->at(0)-1; - } - // The last interval - after the last segment - else if(s == segNum - 1){ - segStart = segment->at(s)->at(1)+1; - segEnd = base.size()-1; - } - } - } } -/* -void ChromosomeOneDigit::encodeNucleotides() { - int seqLen = base.size(); - - for (int i = 0; i < seqLen; i++) { - if (codes->count(base[i]) > 0) { - base[i] = codes->at(base[i]); - } else { - string msg = "Invalid nucleotide: "; - msg.append(1, base[i]); - throw InvalidInputException(msg); - } - } - -} -*/ - /** - * Cannot be called on already finalized object. - */ -void ChromosomeOneDigit::makeR() { - //cout << "Making reverse ..." << endl; - makeReverse(); - reverseSegments(); -} - -/** - * Cannot be called on already finalized object. + * This method converts nucleotides in the segments to single digit codes */ -void ChromosomeOneDigit::makeRC() { - //cout << "Making reverse complement ..." 
<< endl; - makeComplement(); - makeReverse(); - reverseSegments(); -} - -void ChromosomeOneDigit::makeComplement() { - map complement; - - // Certain nucleotides - complement.insert(map::value_type((char) 0, (char) 3)); - complement.insert(map::value_type((char) 1, (char) 2)); - complement.insert(map::value_type((char) 2, (char) 1)); - complement.insert(map::value_type((char) 3, (char) 0)); - - // Unknown nucleotide - complement.insert(map::value_type('N', 'N')); - // complement.insert(map::value_type((char) 4, (char) 4)); - - // Convert a sequence to its complement - int seqLen = base.size(); - for (int i = 0; i < seqLen; i++) { - if (complement.count(base[i]) > 0) { - base[i] = complement.at(base[i]); - } else { - cerr << "Error: The digit " << (char) base[i]; - cerr << " does not represent a base." << endl; - exit(2); +void ChromosomeOneDigit::encode() { + + for (int s = 0; s < segment->size(); s++) { + int segStart = segment->at(s)->at(0); + int segEnd = segment->at(s)->at(1); + for (int i = segStart; i <= segEnd; i++) { + + if (codes->count(base[i]) > 0) { + base[i] = codes->at(base[i]); + } else { + string msg = "Invalid nucleotide: "; + std::ostringstream oss; + int b_int = base[i]; + oss << msg << b_int; + throw InvalidInputException(oss.str()); + } } } -} - -void ChromosomeOneDigit::makeReverse() { - int last = base.size() - 1; - - // Last index to be switched - int middle = base.size() / 2; - - for (int i = 0; i < middle; i++) { - char temp = base[last - i]; - base[last - i] = base[i]; - base[i] = temp; - } -} -void ChromosomeOneDigit::reverseSegments() { + // Digitize skipped segments + char uncertainChar = Util::isDna? 
'N' : 'X'; int segNum = segment->size(); - int lastBase = size() - 1; - - // Calculate the coordinate on the main strand - for (int i = 0; i < segNum; i++) { - vector * seg = segment->at(i); - - int s = lastBase - seg->at(1); - int e = lastBase - seg->at(0); - seg->clear(); - seg->push_back(s); - seg->push_back(e); - } - - // Reverse the regions within the list - int lastRegion = segNum - 1; - int middle = segNum / 2; - for (int i = 0; i < middle; i++) { - vector * temp = segment->at(lastRegion - i); - (*segment)[lastRegion - i] = segment->at(i); - (*segment)[i] = temp; + if (segNum > 0) { + // The first interval - before the first segment + int segStart = 0; + int segEnd = segment->at(0)->at(0) - 1; + + for (int s = 0; s <= segNum; s++) { + for (int i = segStart; i <= segEnd; i++) { + char c = base[i]; + + if (c != uncertainChar) { + if (codes->count(c) > 0) { + base[i] = codes->at(c); + } else { + string msg = "ChromosomeOneDigit::encode() found invalid letter: "; + msg.append(1, c); + throw InvalidInputException(msg); + } + } + } + + // The regular intervals between two segments + if (s < segNum - 1) { + segStart = segment->at(s)->at(1) + 1; + segEnd = segment->at(s + 1)->at(0) - 1; + } + // The last interval - after the last segment + else if (s == segNum - 1) { + segStart = segment->at(s)->at(1) + 1; + segEnd = base.size() - 1; + } + } } } diff --git a/src/nonltr/ChromosomeOneDigit.h b/src/nonltr/ChromosomeOneDigit.h index 384698f..19875eb 100644 --- a/src/nonltr/ChromosomeOneDigit.h +++ b/src/nonltr/ChromosomeOneDigit.h @@ -15,28 +15,28 @@ namespace nonltr { class ChromosomeOneDigit: public Chromosome { private: - /* Fields */ - map * codes; - - /* Methods */ + void encode(); void help(); - void buildCodes(); - void encodeNucleotides(); - void makeReverse(); - void makeComplement(); - void reverseSegments(); + +protected: + bool canClean = false; + map * codes; + virtual void buildCodes() = 0; + public: /* Methods */ ChromosomeOneDigit(); + 
ChromosomeOneDigit(uint64_t); ChromosomeOneDigit(string); - ChromosomeOneDigit(string, string); + ChromosomeOneDigit(string, int, int); + ChromosomeOneDigit(string&, string&); + ChromosomeOneDigit(string&, string&, int); virtual ~ChromosomeOneDigit(); virtual void finalize(); - void makeR(); - void makeRC(); + }; } diff --git a/src/nonltr/ChromosomeOneDigitDna.cpp b/src/nonltr/ChromosomeOneDigitDna.cpp new file mode 100644 index 0000000..9f8bbf7 --- /dev/null +++ b/src/nonltr/ChromosomeOneDigitDna.cpp @@ -0,0 +1,154 @@ +#include "ChromosomeOneDigitDna.h" + +namespace nonltr{ + +ChromosomeOneDigitDna::ChromosomeOneDigitDna() : ChromosomeOneDigit() {} +ChromosomeOneDigitDna::ChromosomeOneDigitDna(uint64_t s) : ChromosomeOneDigit(s) {} + +ChromosomeOneDigitDna::ChromosomeOneDigitDna(string fileName) : + ChromosomeOneDigit(fileName){ + +} + +ChromosomeOneDigitDna::ChromosomeOneDigitDna(string fileName, int segmentLength, int maxLength) : + ChromosomeOneDigit(fileName, segmentLength, maxLength) { + +} + +ChromosomeOneDigitDna::ChromosomeOneDigitDna(string& seq, string& info) : + ChromosomeOneDigit(seq, info){ + +} + +ChromosomeOneDigitDna::ChromosomeOneDigitDna(string& seq, string& info, int length) : + ChromosomeOneDigit(seq, info, length) { +} + +ChromosomeOneDigitDna::~ChromosomeOneDigitDna(){ + +} + +/** + * A A + * T T + * G G + * C C + * R G or A + * Y T or C + * M A or C + * K G or T + * S G or C + * W A or T + * H A or C or T + * B G or T or C + * V G or C or A + * D G or T or A + * N G or T or A or C + */ +void ChromosomeOneDigitDna::buildCodes() { + // Certain nucleotides + codes->insert(map::value_type('A', (char) 0)); + codes->insert(map::value_type('C', (char) 1)); + codes->insert(map::value_type('G', (char) 2)); + codes->insert(map::value_type('T', (char) 3)); + + // Uncertain nucleotides + codes->insert(map::value_type('R', codes->at('G'))); + codes->insert(map::value_type('Y', codes->at('C'))); + codes->insert(map::value_type('M', codes->at('A'))); + 
codes->insert(map::value_type('K', codes->at('T'))); + codes->insert(map::value_type('S', codes->at('G'))); + codes->insert(map::value_type('W', codes->at('T'))); + codes->insert(map::value_type('H', codes->at('C'))); + codes->insert(map::value_type('B', codes->at('T'))); + codes->insert(map::value_type('V', codes->at('A'))); + codes->insert(map::value_type('D', codes->at('T'))); + codes->insert(map::value_type('N', codes->at('C'))); + codes->insert(map::value_type('X', codes->at('G'))); +} + +/** + * Cannot be called on already finalized object. + */ +void ChromosomeOneDigitDna::makeR() { + //cout << "Making reverse ..." << endl; + makeReverse(); + reverseSegments(); +} + +/** + * Cannot be called on already finalized object. + */ +void ChromosomeOneDigitDna::makeRC() { + //cout << "Making reverse complement ..." << endl; + makeComplement(); + makeReverse(); + reverseSegments(); +} + +void ChromosomeOneDigitDna::makeComplement() { + map complement; + + // Certain nucleotides + complement.insert(map::value_type((char) 0, (char) 3)); + complement.insert(map::value_type((char) 1, (char) 2)); + complement.insert(map::value_type((char) 2, (char) 1)); + complement.insert(map::value_type((char) 3, (char) 0)); + + // Unknown nucleotide + complement.insert(map::value_type('N', 'N')); + // complement.insert(map::value_type((char) 4, (char) 4)); + + // Convert a sequence to its complement + int seqLen = base.size(); + for (int i = 0; i < seqLen; i++) { + if (complement.count(base[i]) > 0) { + base[i] = complement.at(base[i]); + } else { + cerr << "Error: The digit " << (char) base[i]; + cerr << " does not represent a base." 
<< endl; + exit(2); + } + } +} + +void ChromosomeOneDigitDna::makeReverse() { + int last = base.size() - 1; + + // Last index to be switched + int middle = base.size() / 2; + + for (int i = 0; i < middle; i++) { + char temp = base[last - i]; + base[last - i] = base[i]; + base[i] = temp; + } +} + +void ChromosomeOneDigitDna::reverseSegments() { + int segNum = segment->size(); + int lastBase = size() - 1; + + // Calculate the coordinate on the main strand + for (int i = 0; i < segNum; i++) { + vector * seg = segment->at(i); + + int s = lastBase - seg->at(1); + int e = lastBase - seg->at(0); + seg->clear(); + seg->push_back(s); + seg->push_back(e); + } + + // Reverse the regions within the list + int lastRegion = segNum - 1; + int middle = segNum / 2; + for (int i = 0; i < middle; i++) { + vector * temp = segment->at(lastRegion - i); + (*segment)[lastRegion - i] = segment->at(i); + (*segment)[i] = temp; + } +} + + +} diff --git a/src/nonltr/ChromosomeOneDigitDna.h b/src/nonltr/ChromosomeOneDigitDna.h new file mode 100644 index 0000000..7bd9dc7 --- /dev/null +++ b/src/nonltr/ChromosomeOneDigitDna.h @@ -0,0 +1,36 @@ +/* + * ChromosomeOneDigitDna.h + * Created on: September 28, 2018 + * Author: Hani Z. 
Girgis, PhD + */ + + #ifndef HROMOSOMEONEDIGITDNA_H_ + #define HROMOSOMEONEDIGITDNA_H_ + +#include "ChromosomeOneDigit.h" + +namespace nonltr{ + class ChromosomeOneDigitDna: public ChromosomeOneDigit{ + private: + void makeReverse(); + void makeComplement(); + void reverseSegments(); + + protected: + virtual void buildCodes(); + + public: + ChromosomeOneDigitDna(); + ChromosomeOneDigitDna(uint64_t); + ChromosomeOneDigitDna(string); + ChromosomeOneDigitDna(string, int, int); + ChromosomeOneDigitDna(string&, string&); + ChromosomeOneDigitDna(string&, string&, int); + virtual ~ChromosomeOneDigitDna(); + + void makeR(); + void makeRC(); + }; +} + +#endif diff --git a/src/nonltr/ChromosomeOneDigitProtein.cpp b/src/nonltr/ChromosomeOneDigitProtein.cpp new file mode 100644 index 0000000..7add5af --- /dev/null +++ b/src/nonltr/ChromosomeOneDigitProtein.cpp @@ -0,0 +1,64 @@ +#include "ChromosomeOneDigitProtein.h" + +namespace nonltr{ + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein() : + ChromosomeOneDigit() { + +} + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string fileName) : + ChromosomeOneDigit(fileName){ + +} + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string fileName, int segmentLength, int maxLength) : + ChromosomeOneDigit(fileName, segmentLength, maxLength) { + +} + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string& seq, string& info) : + ChromosomeOneDigit(seq, info){ + +} + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string& seq, string& info, int length) : + ChromosomeOneDigit(seq, info, length) { +} + +ChromosomeOneDigitProtein::~ChromosomeOneDigitProtein(){ + +} + +void ChromosomeOneDigitProtein::buildCodes() { + // https://en.wikipedia.org/wiki/Proteinogenic_amino_acid + codes->insert(map::value_type('A', (char) 0)); + codes->insert(map::value_type('C', (char) 1)); + codes->insert(map::value_type('D', (char) 2)); + codes->insert(map::value_type('E', (char) 3)); + codes->insert(map::value_type('F', (char) 4)); 
+ codes->insert(map::value_type('G', (char) 5)); + codes->insert(map::value_type('H', (char) 6)); + codes->insert(map::value_type('I', (char) 7)); + codes->insert(map::value_type('K', (char) 8)); + codes->insert(map::value_type('L', (char) 9)); + codes->insert(map::value_type('M', (char) 10)); + codes->insert(map::value_type('N', (char) 11)); + codes->insert(map::value_type('O', (char) 12)); + codes->insert(map::value_type('P', (char) 13)); + codes->insert(map::value_type('Q', (char) 14)); + codes->insert(map::value_type('R', (char) 15)); + codes->insert(map::value_type('S', (char) 16)); + codes->insert(map::value_type('T', (char) 17)); + codes->insert(map::value_type('U', (char) 18)); + codes->insert(map::value_type('V', (char) 19)); + codes->insert(map::value_type('W', (char) 20)); + codes->insert(map::value_type('Y', (char) 21)); + + // Ambiguous amino acid codes: B, Z and J reuse the codes of D, E and L + codes->insert(map::value_type('B', codes->at('D'))); + codes->insert(map::value_type('Z', codes->at('E'))); + codes->insert(map::value_type('J', codes->at('L'))); +} + +}// End namespace \ No newline at end of file diff --git a/src/nonltr/ChromosomeOneDigitProtein.h b/src/nonltr/ChromosomeOneDigitProtein.h new file mode 100644 index 0000000..b5f78ee --- /dev/null +++ b/src/nonltr/ChromosomeOneDigitProtein.h @@ -0,0 +1,28 @@ +/* + * ChromosomeOneDigitProtein.h + * Created on: October 2, 2018 + * Author: Hani Z.
Girgis, PhD + */ + + #ifndef HROMOSOMEONEDIGITPROTEIN_H_ + #define HROMOSOMEONEDIGITPROTEIN_H_ + +#include "ChromosomeOneDigit.h" + +namespace nonltr{ + class ChromosomeOneDigitProtein: public ChromosomeOneDigit{ + + protected: + virtual void buildCodes(); + + public: + ChromosomeOneDigitProtein(); + ChromosomeOneDigitProtein(string); + ChromosomeOneDigitProtein(string, int, int); + ChromosomeOneDigitProtein(string&, string&); + ChromosomeOneDigitProtein(string&, string&, int); + virtual ~ChromosomeOneDigitProtein(); + }; +} + +#endif \ No newline at end of file diff --git a/src/nonltr/KmerHashTable.cpp b/src/nonltr/KmerHashTable.cpp index dc53505..56fd5cd 100644 --- a/src/nonltr/KmerHashTable.cpp +++ b/src/nonltr/KmerHashTable.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "../utility/Util.h" #include "../exception/InvalidInputException.h" @@ -222,6 +223,38 @@ void KmerHashTable::wholesaleIncrement(const char* sequence, }*/ } + +/** + * Increment the count of every k-mer of the segment, forward strand only. + * A count already at its numeric limit is not incremented; returns -1 if that happened, else 0. + * + * sequence: is a long sequence usually a long segment of a chromosome. + * firstKmerStart: is the start index of the first k-mer. + * lastKmerStart: is the start index of the last k-mer. + */ +template +int KmerHashTable::wholesaleIncrementNoOverflow(const char* sequence, + int firstKmerStart, int lastKmerStart) { + // Increment k-mer's in the forward strand + vector hashList = vector(); + hash(sequence, firstKmerStart, lastKmerStart, &hashList); + int ret = 0; + int size = hashList.size(); + for (int i = 0; i < size; i++) { + I keyHash = hashList.at(i); + if (keyHash >= maxTableSize) { + cerr << "array out of bounds" << endl; + throw ""; + } + if (values[keyHash] < std::numeric_limits::max()) { + values[keyHash]++; + } else { + ret = -1; + } + } + return ret; +} + /** * Increment the entry associated with the key by one.
*/ @@ -384,7 +417,7 @@ vector* KmerHashTable::getKeys() { template void KmerHashTable::printTable(string output) { vector keys; -// getKeys(keys); + //getKeys(keys); ofstream out(output.c_str()); diff --git a/src/nonltr/KmerHashTable.h b/src/nonltr/KmerHashTable.h index 7c38e23..cd072af 100644 --- a/src/nonltr/KmerHashTable.h +++ b/src/nonltr/KmerHashTable.h @@ -57,6 +57,7 @@ class KmerHashTable: public ITableView { void increment(const char*); void increment(const char*, int); void wholesaleIncrement(const char*, int, int); + int wholesaleIncrementNoOverflow(const char*, int, int); void addReverseComplement(); I countNonInitialEntries(); diff --git a/src/RepeatsDetector.cpp b/src/nonltr/RepeatsDetector.cpp similarity index 96% rename from src/RepeatsDetector.cpp rename to src/nonltr/RepeatsDetector.cpp index 443cf24..74f525d 100644 --- a/src/RepeatsDetector.cpp +++ b/src/nonltr/RepeatsDetector.cpp @@ -12,13 +12,13 @@ #include #include -#include "nonltr/Trainer.h" -#include "nonltr/KmerHashTable.h" -#include "nonltr/TableBuilder.h" -#include "nonltr/HMM.h" -#include "nonltr/Scanner.h" -#include "nonltr/ChromListMaker.h" -#include "utility/Util.h" +#include "../nonltr/Trainer.h" +#include "../nonltr/KmerHashTable.h" +#include "../nonltr/TableBuilder.h" +#include "../nonltr/HMM.h" +#include "../nonltr/Scanner.h" +#include "../nonltr/ChromListMaker.h" +#include "../utility/Util.h" using namespace std; using namespace nonltr; @@ -67,7 +67,7 @@ void drive(map * const param) { Util::deleteFile(param->at(MSK_PRM)); } } - + if (param->count(RPT_PRM) > 0) { if (param->count(GNM_PRM) > 0) { cout << "Deleting pre-existing files under " << param->at(RPT_PRM); @@ -78,7 +78,7 @@ void drive(map * const param) { Util::deleteFile(param->at(RPT_PRM)); } } - + if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) > 0) { cout << "Deleting pre-existing files under " << param->at(SCO_PRM); cout << endl; @@ -97,21 +97,21 @@ void drive(map * const param) { // Process the input int k = 
atoi(param->at(LEN_PRM).c_str()); - + if (param->count(GNM_PRM) > 0) { string genomeDir = param->at(GNM_PRM); int order = atoi(param->at(ORD_PRM).c_str()); double s = atoi(param->at(GAU_PRM).c_str()); double t = atoi(param->at(THR_PRM).c_str()); int minObs = atoi(param->at(MIN_PRM).c_str()); - + // Adjust the threshold when it is one because of the log base. if (((int) t) == 1) { t = 1.5; cout << "The base of the logarithmic function is adjusted." << endl; } - - + + // This part or the next Trainer * trainer; if (param->count(CND_PRM) > 0) { @@ -119,20 +119,20 @@ void drive(map * const param) { } else { trainer = new Trainer(genomeDir, order, k, s, t, minObs); } - - + + if (param->count(TBL_PRM)) { cout << "Printing the count of the kmer's to: "; cout << param->at(TBL_PRM) << endl; trainer->printTable(param->at(TBL_PRM)); } - + if (param->count(HMO_PRM) > 0) { cout << "Printing the HMM to: " << endl; cout << param->at(HMO_PRM) << endl; trainer->printHmm(param->at(HMO_PRM)); } - + // Stage 3: Scan cout << endl << endl; cout << "Stage 4: Scanning ..." 
<< endl; @@ -141,33 +141,33 @@ void drive(map * const param) { if (param->count(DIR_PRM) > 0) { Util::readChromList(param->at(DIR_PRM), fileList, string("fa")); } - + int chromCount = fileList->size(); for (int i = 0; i < chromCount; i++) { cout << "Scanning: " << fileList->at(i) << endl; - + // Output file name string path(fileList->at(i)); int slashLastIndex = path.find_last_of(Util::fileSeparator); int dotLastIndex = path.find_last_of("."); string nickName = path.substr(slashLastIndex + 1, dotLastIndex - slashLastIndex - 1); - + // Process each sequence with the ith file ChromListMaker * maker = new ChromListMaker(fileList->at(i)); - const vector * chromList = maker->makeChromOneDigitList(); + const vector * chromList = maker->makeChromOneDigitDnaList(); ChromListMaker * oMaker = new ChromListMaker(fileList->at(i)); const vector * oChromList; if (param->count(MSK_PRM) > 0) { oChromList = oMaker->makeChromList(); } - + for (int h = 0; h < chromList->size(); h++) { - ChromosomeOneDigit * chrom = dynamic_cast(chromList->at(h)); - + ChromosomeOneDigitDna * chrom = dynamic_cast(chromList->at(h)); + // Scan the forward strand Scanner * scanner = new Scanner(trainer->getHmm(), k, chrom,trainer->getTable()); - + // Scan the reverse complement chrom->makeRC(); Scanner * scannerRC = new Scanner(trainer->getHmm(), k, chrom, trainer->getTable()); @@ -175,8 +175,8 @@ void drive(map * const param) { scanner->mergeWithOtherRegions(scannerRC->getRegionList()); delete scannerRC; chrom->makeRC(); - - + + // Scan the reverse chrom->makeR(); Scanner * scannerR = new Scanner(trainer->getHmm(), k, chrom, trainer->getTable()); @@ -186,14 +186,14 @@ void drive(map * const param) { //@@ The chromosome now has the sequence of the reverse strand // The actual strand is calculated if the user requested the scores. - + // Print according to the user's requests bool canAppend = (h == 0) ? 
false : true; - + if (param->count(SCO_PRM) > 0) { // Calculate the forward strand from the reverse chrom->makeR(); - + string scoFile = param->at(SCO_PRM) + Util::fileSeparator + nickName + ".scr"; if (!canAppend) { cout << "Printing scores to: " << scoFile << endl; @@ -203,7 +203,7 @@ void drive(map * const param) { scorer->printScores(scoFile, canAppend); delete scorer; } - + if (param->count(RPT_PRM) > 0) { string rptFile = param->at(RPT_PRM) + Util::fileSeparator + nickName + ".rpt"; if (!canAppend) { @@ -211,7 +211,7 @@ void drive(map * const param) { } scanner->printIndex(rptFile, canAppend, atoi(param->at(FRM_PRM).c_str())); } - + if (param->count(MSK_PRM) > 0) { string mskFile = param->at(MSK_PRM) + Util::fileSeparator + nickName + ".msk"; if (!canAppend) { @@ -220,41 +220,41 @@ void drive(map * const param) { Chromosome * oChrom = oChromList->at(h); scanner->printMasked(mskFile, *oChrom, canAppend); } - + // Free memory delete scanner; } - + delete maker; delete oMaker; } - + // Free memory fileList->clear(); delete fileList; delete trainer; } else if (param->count(HMI_PRM) > 0) { HMM * hmm = new HMM(param->at(HMI_PRM)); - + string chromFile = param->at(SEQ_PRM); string scoresFile = param->at(SCI_PRM); - - ChromosomeOneDigit * chrom = new ChromosomeOneDigit(chromFile); + + ChromosomeOneDigitDna * chrom = new ChromosomeOneDigitDna(chromFile); Scanner * scanner = new Scanner(hmm, k, chrom, scoresFile); - + if (param->count(RPT_PRM) > 0) { string rptFile = param->at(RPT_PRM); cout << "Printing locations to: " << rptFile << endl; scanner->printIndex(rptFile, false, atoi(param->at(FRM_PRM).c_str())); } - + if (param->count(MSK_PRM) > 0) { string mskFile = param->at(MSK_PRM); cout << "Printing masked sequence to: " << mskFile << endl; Chromosome oChrom(chromFile); scanner->printMasked(mskFile, oChrom, false); } - + // Free memory delete scanner; delete chrom; @@ -266,7 +266,7 @@ int main(int argc, char * argv[]) { cout << endl << endl; cout << "This is Red 
(REpeat Detector) designed and developed by "; cout << "Hani Zakaria Girgis, PhD." << endl << endl; - + cout << "Version: 05/22/2015" << endl << endl; string message = string("Valid argument pairs:\n"); @@ -278,8 +278,8 @@ int main(int argc, char * argv[]) { message.append("\t\tFiles with \".fa\" extension in this directory are NOT used for completing the table.\n"); message.append("\t\tThese Files MUST have different names from those in the genome directory.\n"); message.append("\t\tThese Files are scanned for repeats.\n"); - - + + message.append("\t-len word length equals k defining the k-mer. The default is floor(log_4(genome size)).\n"); message.append("\t-ord order of the background Markov chain. The default is floor(k/2)-1.\n"); message.append("\t-gau half width of the mask. The default is based on the GC content.\n"); @@ -290,7 +290,7 @@ int main(int argc, char * argv[]) { message.append("\t-tbl file where the table of the adjusted counts is written, optional.\n"); message.append("\t-sco directory where scores are saved, optional.\n"); message.append("\t\tScore files have the \".scr\" extension.\n"); - + message.append("\t-cnd directory where candidate regions are saved, optional.\n"); message.append("\t\tCandidates files have the \".cnd\" extension.\n"); message.append("\t-rpt directory where repeats locations are saved, optional.\n"); @@ -300,7 +300,7 @@ int main(int argc, char * argv[]) { message.append("\t-frm the format of the output: 1 (chrName:start-end) or 2 (chrName\tstart\tend).\n"); message.append("\t\tThe output format are zero based and the end is exclusive.\n"); - + message.append("\t-hmo file where the HMM is saved, optional.\n\n"); message.append("Examples:\n"); @@ -342,11 +342,11 @@ int main(int argc, char * argv[]) { return 1; } } - - + + // Check if the user provided the essential arguments - - + + if (param->count(LEN_PRM) == 0) { if (param->count(GNM_PRM) > 0) { // Calculate the size of the genome @@ -365,9 +365,9 @@ int main(int argc, 
char * argv[]) { } fileList->clear(); delete fileList; - + double temp = log(genomeLength) / log(4.0); - + int k = floor(temp); cout << "The recommended k is " << k << "." << endl; if (k > 15) { @@ -382,17 +382,17 @@ int main(int argc, char * argv[]) { k = 12; } cout << endl; - + string kString = Util::int2string(k); param->insert(map::value_type(LEN_PRM, kString)); - + } else { cerr << "The word length is required." << endl; cerr << message << endl; return 1; } } - + if(param->count(FRM_PRM) == 0){ cout << "Using the default output format chrName:start-end" << endl; param->insert(map::value_type(FRM_PRM, Util::int2string(Scanner::FRMT_POS))); @@ -404,21 +404,21 @@ int main(int argc, char * argv[]) { return 1; } } - + if (param->count(GNM_PRM) > 0) { Util::checkFile(param->at(GNM_PRM)); - + if (param->count(ORD_PRM) == 0) { double k = atoi(param->at(LEN_PRM).c_str()); int o = floor(k / 2.0) - 1; - + cout << "Using the default background order: " << o << "."; cout << endl; - + string oString = Util::int2string(o); param->insert(map::value_type(ORD_PRM, oString)); } - + if (param->count(THR_PRM) == 0) { cout << "Using the default threshold: 2." << endl; param->insert(map::value_type(THR_PRM, string("2"))); @@ -430,7 +430,7 @@ int main(int argc, char * argv[]) { return 1; } } - + if (param->count(MIN_PRM) == 0) { cout << "Using the default minimum of the observed count of k-mers: 3." << endl; param->insert(map::value_type(MIN_PRM, string("3"))); @@ -442,10 +442,10 @@ int main(int argc, char * argv[]) { return 1; } } - + if (param->count(GAU_PRM) == 0) { cout << "Calculating GC content ..." 
<< endl; - + // 1: Count the gc content of the input genome long genomeLength = 0; long genomeGc = 0; @@ -463,7 +463,7 @@ int main(int argc, char * argv[]) { } fileList->clear(); delete fileList; - + // 2: Calculate the gc content of the input genome double gc = 100.00 * genomeGc / genomeLength; int w = 20; @@ -477,7 +477,7 @@ int main(int argc, char * argv[]) { } } else if (param->count(HMI_PRM) > 0) { Util::checkFile(param->at(HMI_PRM)); - + if (param->count(SEQ_PRM) == 0) { cerr << "The sequence file is required."; cerr << endl; @@ -486,7 +486,7 @@ int main(int argc, char * argv[]) { } else { Util::checkFile(param->at(SEQ_PRM)); } - + if (param->count(SCI_PRM) == 0) { cerr << "The scores file is required."; cerr << endl; @@ -495,14 +495,14 @@ int main(int argc, char * argv[]) { } else { Util::checkFile(param->at(SCI_PRM)); } - + } else { cerr << "A mode is required: training and scanning (-gnm) or "; cerr << "scanning only (-hmi)." << endl; cerr << message << endl; return 1; } - + // Check optional parameters if (param->count(TBL_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Printing the k-mer table is optional with -gnm only."; @@ -510,14 +510,14 @@ int main(int argc, char * argv[]) { cerr << message << endl; return 1; } - + if (param->count(HMO_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Printing the HMM is optional with -gnm only."; cerr << endl; cerr << message << endl; return 1; } - + if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Printing the scores is optional with -gnm only."; cerr << endl; @@ -526,7 +526,7 @@ int main(int argc, char * argv[]) { } else if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) > 0) { Util::checkFile(param->at(SCO_PRM)); } - + if (param->count(CND_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Printing candidate regions is optional with -gnm only."; @@ -536,8 +536,8 @@ int main(int argc, char * argv[]) { } else if (param->count(CND_PRM) > 0 && param->count(GNM_PRM) > 0) { 
Util::checkFile(param->at(CND_PRM)); } - - + + if (param->count(DIR_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Processing additional sequences is optional with -gnm only."; cerr << endl; @@ -546,15 +546,15 @@ int main(int argc, char * argv[]) { } else if (param->count(DIR_PRM) > 0 && param->count(GNM_PRM) > 0) { Util::checkFile(param->at(DIR_PRM)); } - + if (param->count(MSK_PRM) > 0 && param->count(GNM_PRM) > 0) { Util::checkFile(param->at(MSK_PRM)); } - + if (param->count(RPT_PRM) > 0 && param->count(GNM_PRM) > 0) { Util::checkFile(param->at(RPT_PRM)); } - + // Print out the parameters table typedef map myMap; myMap::iterator sIter = param->begin(); @@ -565,10 +565,10 @@ int main(int argc, char * argv[]) { sIter++; } cout << endl; - + // Start! drive(param); - + // Clear parameters when done. param->clear(); delete param; @@ -577,7 +577,7 @@ int main(int argc, char * argv[]) { cerr << endl; cerr << message << endl; } - + //return EXIT_SUCCESS; return 0; } diff --git a/src/nonltr/TableBuilder.cpp b/src/nonltr/TableBuilder.cpp index 32733a9..d038aab 100644 --- a/src/nonltr/TableBuilder.cpp +++ b/src/nonltr/TableBuilder.cpp @@ -31,7 +31,7 @@ void TableBuilder::buildTable() { for (int i = 0; i < fileList->size(); i++) { cout << "Counting k-mers in " << fileList->at(i) << " ..." 
<< endl; ChromListMaker * maker = new ChromListMaker(fileList->at(i)); - const vector * chromList = maker->makeChromOneDigitList(); + const vector * chromList = maker->makeChromOneDigitDnaList(); for (int h = 0; h < chromList->size(); h++) { ChromosomeOneDigit * chrom = diff --git a/src/nonltr/Trainer.cpp b/src/nonltr/Trainer.cpp index 3e8865f..1a86a8f 100644 --- a/src/nonltr/Trainer.cpp +++ b/src/nonltr/Trainer.cpp @@ -106,7 +106,7 @@ void Trainer::stage2() { cout << "Calculating the percentage in: " << fileList->at(i) << " ..."; cout << endl; ChromListMaker * maker = new ChromListMaker(fileList->at(i)); - const vector * chromList = maker->makeChromOneDigitList(); + const vector * chromList = maker->makeChromOneDigitDnaList(); for (int h = 0; h < chromList->size(); h++) { ChromosomeOneDigit * chrom = @@ -190,7 +190,7 @@ void Trainer::stage3() { // Read sequences in the file ChromListMaker * maker = new ChromListMaker(fileList->at(i)); - const vector * chromList = maker->makeChromOneDigitList(); + const vector * chromList = maker->makeChromOneDigitDnaList(); for (int h = 0; h < chromList->size(); h++) { ChromosomeOneDigit * chrom = dynamic_cast(chromList->at(h)); @@ -216,7 +216,7 @@ void Trainer::stage3() { } trainingRegionList = detector->getRegionList(); - + } if (isCON && isConRepAvailable) { @@ -225,7 +225,7 @@ void Trainer::stage3() { locList->mergeWithAnotherList(detector->getRegionList()); } trainingRegionList = locList->getList(); - + } // The candidate regions are already copied to the location list @@ -236,7 +236,7 @@ void Trainer::stage3() { // Train the HMM if(isCND || (isCON && isConRepAvailable)){ - + scorer->takeLog(t); scoreList = scorer->getScores(); hmm->train(scoreList, chrom->getSegment(), trainingRegionList); diff --git a/src/predict/BestFirstSelector.cpp b/src/predict/BestFirstSelector.cpp new file mode 100644 index 0000000..c28d39b --- /dev/null +++ b/src/predict/BestFirstSelector.cpp @@ -0,0 +1,258 @@ +// -*- C++ -*- +/* + * 
BestFirstSelector.cpp + * + * Author: Benjamin T James + */ + +#include "BestFirstSelector.h" +#include "../clutil/Progress.h" +#include +#include + +template +pair*,matrix::GLM> BestFirstSelector::train_regression(Feature* feat, const vector > &training,const vector > &testing) +{ + matrix::GLM glm; + return {NULL,glm}; +} + +using FeatPair = std::pair; + +struct Compare { + bool operator()(const std::pair,double> &a, const std::pair,double> &b) { + return a.second < b.second; + } +}; + +using pqueue = std::priority_queue,double>, std::vector,double> >, Compare>; + +vector > children_of(set feat, + const vector& all_feats, + const set >& closed_list, + + const set > &open_list) +{ + vector > out; + for (auto fp : all_feats) { + set temp = feat; + auto pos = std::find(temp.begin(), temp.end(), fp); + if (pos == temp.end()) { + temp.insert(fp); + } else { + temp.erase(pos); + } + auto pos_bad = std::find(closed_list.begin(), closed_list.end(), temp); + if (!temp.empty() && pos_bad == closed_list.end()) { + auto pos_good = std::find(open_list.begin(), open_list.end(), temp); + if (pos_good == open_list.end()) { + out.push_back(temp); + } + } + } + return out; +} +template +std::string feat_name(Feature* feat) +{ + std::ostringstream oss; + auto feat_names = feat->feat_names(); + for (int i = 0; i < feat_names.size(); i++) { + oss << feat_names[i]; + if (i < feat_names.size() - 1) { + oss << " + "; + } + } + return oss.str(); +} +template +std::string feature_name(const set& feat_list, int k) +{ + Feature feat(k); + for (auto fpair : feat_list) { + feat.add_feature(fpair.first, fpair.second); + } + return feat_name(&feat); +} + +template +Feature* load_feat(Feature* old_feat, const set& feat_list, const vector > &training) +{ + + Feature* feat = new Feature(*old_feat); + feat->set_save(true); + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + set single_feats; + for (FeatPair fp : feat_list) { + for 
(uint64_t i = 1; i <= fp.first; i *= 2) { + if (i & fp.first) { + single_feats.insert(i); + } + } + feat->add_feature(fp.first, fp.second); + } + for (uint64_t i : single_feats) { + auto minmax = old_feat->get_normal(i); + // #pragma omp critical + // { + // cout << "Feature " << Feature::log2(i) << " min: " << minmax.first << " max: " << minmax.second << endl; + // } + feat->set_normal(i, minmax.first, minmax.second); + } + +// feat->normalize(training); + feat->finalize(); + + return feat; +} + +template +void calculate_table(Feature* feat, vector possible_feats, const vector > &training,const vector > &testing) +{ + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + for (FeatPair fp : possible_feats) { + feat->add_feature(fp.first, fp.second); + } + feat->set_save(true); + feat->normalize(training); + feat->finalize(); + for (auto pr : testing) { + feat->compute(*pr.first, *pr.second); + } +} +template +pair feature_accuracy(const set& feat_set, Feature* old_feat, const vector > &training,const vector > &testing, double id) +{ + Feature* feat = NULL; + feat = load_feat(old_feat, feat_set, training); +// cout << "Considering " << name << " "; + auto pr = FeatureSelector::class_train(training, *feat, id); + auto class_ac = FeatureSelector::class_test(testing, *feat, pr.second, id); + double class_accuracy = get<0>(class_ac); +// cout << "Accuracy: " << class_accuracy << endl; + std::string name = feat_name(feat); + delete feat; + return make_pair(name, class_accuracy); +} + +pair feat_list_sizes(const set >& flist) +{ + int minimum = flist.empty() ? 0 : flist.begin()->size(); // empty set: begin() == end(), must not be dereferenced; yields (0, 0) + int maximum = 0; + for (auto item : flist) { + if (item.size() < minimum) { + minimum = item.size(); + } + if (item.size() > maximum) { + maximum = item.size(); + } + } + return make_pair(minimum, maximum); +} +template +void evaluate(const vector > &item_list, set > &open_map, pqueue &open_heap, Feature* feat, const vector > &training,const vector >
&testing, double id, int max_num_feat) +{ + std::ostringstream oss; +// of size " << item_list[0].size(); + + +// Progress prog(item_list.size(), oss.str()); + auto minmax = feat_list_sizes(open_map); + oss << "Evaluating features " << minmax.second << "/" << max_num_feat; + Progress prog(item_list.size(), oss.str()); + #pragma omp parallel for + for (int i = 0; i < item_list.size(); i++) { + const set& item = item_list[i]; + auto feat_acc = feature_accuracy(item, feat, training, testing, id); + double acc = feat_acc.second; + string name = feat_acc.first; + #pragma omp critical + { + prog++; +// cout << name << ": " << acc << endl; + open_map.insert(item); + open_heap.push(std::make_pair(item, acc)); + } + } + prog.end(); +} + +template +std::pair*,matrix::GLM> BestFirstSelector::train_class(Feature* feat, const vector > &training,const vector > &testing, double id) +{ + set feat_set, best_feat_set; + set > closed_list, open_list; + pqueue open_heap; + + int last_best_changed = 0; + double best_acc = -100; + const double eps = 0; + + cout << "Calculating all features" << endl; + calculate_table(feat, possible_feats, training, testing); + // prime the open_map + vector > children = children_of(feat_set, possible_feats, closed_list, open_list); + evaluate(children, open_list, open_heap, feat, training, testing, id, max_num_feat); + for (int iteration = 0; !open_list.empty(); iteration++) { + + auto minmax = feat_list_sizes(open_list); + + // stopping criteria: if we have already met the maximum number of features + // or if no features changed in the last 3 iterations of having a minimum number of features + if (minmax.second > max_num_feat || (iteration - last_best_changed >= 3 && minmax.second > min_num_feat)) { + break; + } + //cout << "Features: " << minmax.first << " to " << minmax.second << endl; + + // Peek at the maximum-accuracy feature + auto ptr = open_heap.top(); + feat_set = ptr.first; + double acc = ptr.second; + + // Remove the best item from the open 
list/heap and add to the closed list + open_heap.pop(); + open_list.erase(feat_set); + closed_list.insert(feat_set); + + if (acc - eps > best_acc && feat_set.size() >= min_num_feat && feat_set.size() <= max_num_feat) { + //cout << "New Best feature: " << feature_name(feat_set, feat->get_k()) << endl; + best_feat_set = feat_set; + best_acc = acc; + last_best_changed = iteration; + } + + vector > children = children_of(feat_set, possible_feats, closed_list, open_list); + evaluate(children, open_list, open_heap, feat, training, testing, id, max_num_feat); + }// while (iteration++ - last_best_changed < 2); + + Feature* feat_c = load_feat(feat, best_feat_set, training); + feat_c->set_save(false); + auto pr = FeatureSelector::class_train(training, *feat_c, id); + matrix::GLM c_glm = pr.second; + + auto train_results = FeatureSelector::class_test(training, *feat_c, c_glm, id);//, "train"); + cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl; + auto test_results = FeatureSelector::class_test(testing, *feat_c, c_glm, id);//, "test"); + double class_acc = get<0>(test_results); + cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl; + + cout << "Features: "<< endl; + for (auto line : feat_c->feat_names()) { + cout << "\t" << line << endl; + } + return std::make_pair(feat_c, c_glm); +} + + +template class BestFirstSelector; +template class BestFirstSelector; +template class BestFirstSelector; +template class BestFirstSelector; +template class BestFirstSelector; +template class BestFirstSelector; diff --git a/src/predict/BestFirstSelector.h b/src/predict/BestFirstSelector.h new file mode 100644 index 0000000..b969b09 --- /dev/null +++ b/src/predict/BestFirstSelector.h @@ -0,0 +1,25 @@ +// -*- C++ -*- +/* + * BestFirstSelector.h + * + * Author: Benjamin T James + */ + +#ifndef BEST_FIRST_SELECTOR_H +#define BEST_FIRST_SELECTOR_H +#include 
"FeatureSelector.h" +#include +template +class BestFirstSelector : public FeatureSelector { +public: + BestFirstSelector(vector > possible_feats_, int min_n_feat, int max_n_feat) : possible_feats(possible_feats_), min_num_feat(min_n_feat), max_num_feat(max_n_feat) {} + ~BestFirstSelector() {} + + pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing); + pair*,matrix::GLM> train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id); + +private: + int max_num_feat, min_num_feat; + vector > possible_feats; +}; +#endif diff --git a/src/cluster/src/Feature.cpp b/src/predict/Feature.cpp similarity index 81% rename from src/cluster/src/Feature.cpp rename to src/predict/Feature.cpp index 67baf50..74147f5 100644 --- a/src/cluster/src/Feature.cpp +++ b/src/predict/Feature.cpp @@ -9,14 +9,13 @@ * exist because I was lazy and couldn't get * anonymous functions to work with the hashing */ -#include "Feature.h" -#include "DivergencePoint.h" -#include -#include + +#include +#include #include -#include -#include "../../utility/GlobAlignE.h" +#include "Feature.h" +using namespace std; template Feature::Feature(const Feature& feat_) : k(feat_.get_k()) @@ -28,6 +27,7 @@ Feature::Feature(const Feature& feat_) : k(feat_.get_k()) combos = feat_.get_combos(); lookup = feat_.get_lookup(); is_finalized = feat_.get_finalized(); + ltable = feat_.get_ltable(); do_save = false; auto freverse = [](int idx, int k) { int sum = 0; @@ -102,7 +102,7 @@ Feature Feature::operator=(const Feature& feat_) template void Feature::add_feature(uint64_t f_flags, Combo combo) { -// cout << "Adding combo " << f_flags << endl; + // cout << "Adding combo " << f_flags << endl; if (combo != Combo::xy && combo != Combo::x2y && combo != Combo::xy2 && combo != Combo::x2y2) { throw "invalid combo"; } @@ -138,6 +138,13 @@ void Feature::normalize_cache(vector &cache) const { for (size_t i = 0; i < lookup.size(); i++) { double val = (cache[i] - 
mins[i]) / (maxs[i] - mins[i]); + + // Hani Z. Girgis added this test + if(isnan(val)){ + cerr << "Got NAN from max " << maxs[i] << " min " << mins[i] << endl; + throw std::exception(); + } + if (is_sims[i]) { cache[i] = val; } else { @@ -172,6 +179,39 @@ void Feature::set_normal(uint64_t single_flag, double min_, double max_) is_finalized.at(idx) = true; } +template +pair Feature::get_normal(uint64_t single_flag) const +{ + int idx = index_of(single_flag); + return make_pair(mins.at(idx), maxs.at(idx)); +} + +/* +template +vector Feature::get_raw(const vector*,Point*> > &vec, int index) const +{ + std::vector results(vec.size(), 0); + auto func = raw_funcs[index]; + + #pragma omp parallel for + for (size_t i = 0; i < vec.size(); i++) { + results[i] = func(*vec[i].first, *vec[i].second); + } + + double vmin, vmax; + auto mm = std::minmax_element(results.begin(), results.end()); + vmin = *(mm.first); + vmax = *(mm.second); + for (auto &v : results) { + v = (v - vmin) / (vmax - vmin); + if (! is_sims[index]) { + v = 1 - v; + } + } + return results; +} +*/ + template void Feature::normalize(const vector > &pairs) { @@ -203,6 +243,27 @@ void Feature::normalize(const vector > &pairs) mins[i] = small; maxs[i] = big; + + // Hani Z. Girgis added this tests + if(abs(maxs[i] - mins[i]) <= 0.000000001){ + cerr << "Error of feature: " << feat_names().at(i) << ". "; + cerr << "The maximum distance cannot be zero."; + cerr << endl; + throw std::exception(); + } + + if(isinf(maxs[i])){ + cerr << "Error of feature: " << feat_names().at(i) << ". "; + cerr << "Maximum is " << maxs[i] << endl; + throw std::exception(); + } + + if(isinf(mins[i])){ + cerr << "Error of feature: " << feat_names().at(i) << ". "; + cerr << "Minimum is " << mins[i] << endl; + throw std::exception(); + } + } }; @@ -548,7 +609,8 @@ bool Feature::feat_is_sim(uint64_t single_flag) const is_sim = false; break; case FEAT_SPEARMAN: - is_sim = true; + is_sim = false; // Hani Z. 
Girgis modified the boolean + //is_sim = true; break; case FEAT_JACCARD: is_sim = true; @@ -710,6 +772,7 @@ double Feature::intersection(Point &a, Point &b) for (auto i = 0; i < N; i++) { dist += 2 * std::min(p.points[i], q.points[i]); } + return (double)dist / (double)mag; } @@ -744,7 +807,7 @@ double Feature::pearson(Point &a, Point &b) nq += dq * dq; dot += dp * dq; } - return dot / sqrt(std::max(np * nq, 0.5)); + return dot / sqrt(np * nq); } template @@ -874,6 +937,11 @@ double Feature::c_n2rrc(Point& a, Point& b) { template double Feature::n2rrc(Point& a, Point& b) const { + if(!Util::isDna){ + cerr << "n2rrc cannot be calculated on protein sequences." << endl; + throw std::exception(); + } + const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); @@ -921,7 +989,7 @@ double Feature::jensen_shannon(Point &a, Point &b) const uint64_t mq = q.getPseudoMagnitude(); double sum = 0; const auto N = p.points.size(); - #pragma omp simd reduction(+:sum) + #pragma omp simd reduction(+:sum) for (auto i = 0; i < N; i++) { double pp = (double)p.points[i] / mp; double pq = (double)q.points[i] / mq; @@ -955,22 +1023,27 @@ double Feature::c_rre_k_r(Point& a, Point& b) { } } +// This statistics uses conditional probability +// Modified by Hani Z. 
Girgis on Oct 7 2018 to enable processing protein sequences template double Feature::rre_k_r(Point& a, Point& b) { const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); + const auto A = Util::getAlphabetSize(); + double op = 0, oq = 0; - const double l4 = log(4); + const double l4 = log(A); uint64_t sum4_p = 0, sum4_q = 0; + for (auto i = 0; i < N; i++) { sum4_p += p.points[i]; sum4_q += q.points[i]; - if (i % 4 == 3) { + if (i % A == (A-1)) { double inner_sum_p = 0; double inner_sum_q = 0; - for (auto j = i - 3; j <= i; j++) { + for (auto j = i - (A-1); j <= i; j++) { double conditional_p = (double)p.points[j] / sum4_p; double conditional_q = (double)q.points[j] / sum4_q; double avg = 0.5 * (conditional_p + conditional_q); @@ -985,7 +1058,8 @@ double Feature::rre_k_r(Point& a, Point& b) sum4_q = 0; } } - double val = 0.5 * (op + oq); + + double val = 0.5 * (op + oq); return val; } @@ -1165,7 +1239,24 @@ double Feature::jefferey_divergence(Point& a, Point& b) for (auto i = 0; i < N; i++) { double pp = (double)p.points[i] / mp; double pq = (double)q.points[i] / mq; + // if (q.points[i] == 0) { + // cout << "Error for sequence " << q.get_header() << endl; + // for (int j = 0; j < q.points.size(); j++) { + // cout << q.points.at(j) << " "; + // } + // cout << endl; + // exit(1); + // } + // if (p.points[i] == 0) { + // cout << "Error for sequence " << p.get_header() << endl; + // for (int j = 0; j < p.points.size(); j++) { + // cout << (int)p.points.at(j) << " "; + // } + // cout << endl; + // exit(1); + // } double diff = pp - pq; + // cout << "pp: " << pp << " pq: " << pq << " pp/pq: " << pp / pq << endl; sum += diff * log(pp / pq); } return sum; @@ -1219,21 +1310,26 @@ double Feature::c_kl_conditional(Point& a, Point& b) { } } +// Modified by Hani Z Girgis on Oct 7 2018. 
template double Feature::kl_conditional(Point& a, Point& b) { const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); - uint64_t sum4_p = 0, sum4_q = 0; // Sum for every 4 nucleotides + uint64_t sum4_p = 0, sum4_q = 0; // Sum for every 4 nucleotides or 22 a.a. double outer_sum_p = 0, outer_sum_q = 0; // Prior K-mer sum + const auto N = p.points.size(); + const auto A = Util::getAlphabetSize(); + for (auto i = 0; i < N; i++) { sum4_p += p.points[i]; sum4_q += q.points[i]; - if (i % 4 == 3) { //finished counting word, now compute probabilities + + if (i % A == A-1) { //finished counting word, now compute probabilities double inner_sum_p = 0; // Sum of p(X|Y) * log(p(X|Y) / q(X|Y)) double inner_sum_q = 0; // Sum of q(X|Y) * log(q(X|Y) / p(X|Y)) - for (auto j = i - 3; j <= i; j++) { + for (auto j = i - (A-1); j <= i; j++) { double conditional_p = (double)p.points[j] / sum4_p; double conditional_q = (double)q.points[j] / sum4_q; double lg = log(conditional_p / conditional_q); @@ -1273,20 +1369,26 @@ double Feature::markov(Point& a, Point& b) const DivergencePoint& q = dynamic_cast&>(a); const DivergencePoint& p = dynamic_cast&>(b); double total = 0; // Prior K-mer sum + + // Hani Z. 
Girgis modified this code on Oct 2 2018 + // to adapt this feature to proteins const auto N = p.points.size(); - for (auto i = 0; i < N; i += 4) { + const auto A = Util::getAlphabetSize(); + + for (auto i = 0; i < N; i += A) { uint64_t psum = 0, qsum = 0; - for (auto j = 0; j < 4; j++) { + for (auto j = 0; j < A; j++) { psum += p.points[i+j]; qsum += q.points[i+j]; } double lpsum = log(psum); double lqsum = log(qsum); - for (auto j = 0; j < 4; j++) { + for (auto j = 0; j < A; j++) { total += (q.points[i+j]-1) * (log(p.points[i+j]) - lpsum); total += (p.points[i+j]-1) * (log(q.points[i+j]) - lqsum); } - } + } + return total / 2; } @@ -1319,7 +1421,7 @@ double Feature::d2z(Point& a, Point& b) double pz = (p.points[i] - ap) / sp; double qz = (q.points[i] - aq) / sq; sum += pz * qz; - } + } return sum; } @@ -1415,16 +1517,74 @@ double Feature::emd(Point& a, Point& b) return (double)dist; } -template -std::vector tiedrank(const Point& a) -{ +// Commented by Hani Z. Girgis +// template +// std::vector tiedrank(const Point& a) +// { +// const DivergencePoint& p = dynamic_cast&>(a); +// const auto N = p.points.size(); +// vector ip(N, 0); +// std::iota(std::begin(ip), std::end(ip), 0); +// std::sort(std::begin(ip), std::end(ip), [&](size_t x, size_t y) { +// return p.points[x] < p.points[y]; +// }); + +// for(auto elm : ip){ +// cerr << elm << endl; +// } +// exit(9); +// return ip; +// } + +// Added by Hani Z. 
Girgis +template +std::vector tiedrank(const Point& a){ + // Initialize multimap const DivergencePoint& p = dynamic_cast&>(a); - const auto N = p.points.size(); - vector ip(N, 0); - std::iota(std::begin(ip), std::end(ip), 0); - std::sort(std::begin(ip), std::end(ip), [&](size_t x, size_t y) { - return p.points[x] < p.points[y]; - }); + unsigned int n = p.points.size(); + + std::multimap mmap; + for(unsigned i = 0; i < n; i++){ + mmap.insert(pair( p.points[i] , i)); + } + + // Set ranks without ties + int lastRank = 0; + // std::multimap::iterator + for (auto it=mmap.begin(); it!=mmap.end(); ++it){ + (*it).second = ++lastRank; + } + + for (auto it=mmap.begin(); it!=mmap.end(); it=mmap.upper_bound((*it).first)){ + auto ret = mmap.equal_range((*it).first); + + // Calculate the average rank + double rankTotal = 0; + double count = 0; + for (auto it1=ret.first; it1 != ret.second; ++it1){ + count++; + rankTotal += (*it1).second; + } + + // Assign the average rank + double meanRank = rankTotal / count; + for (auto it1=ret.first; it1 != ret.second; ++it1){ + (*it1).second = meanRank; + // cout << (*it).first << " => " << (*it1).second << endl; + } + } + + std::vector r(n, 0); + for(unsigned int i = 0; i < n; i++){ + r[i] = mmap.find(p.points[i])->second; + } + + // For testing + // for(unsigned int i = 0; i < n; i++){ + // cout << r[i] << endl; + // } + + return r; } template @@ -1442,6 +1602,7 @@ double Feature::c_spearman(Point& a, Point& b) { } } +/* template double Feature::spearman(Point& a, Point& b) { @@ -1455,9 +1616,7 @@ double Feature::spearman(Point& a, Point& b) std::sort(std::begin(ip), std::end(ip), [&](size_t x, size_t y) { return p.points[x] < p.points[y]; }); - std::sort(std::begin(iq), std::end(iq), [&](size_t x, size_t y) { - return q.points[x] < q.points[y]; - }); + double expected = (N+1) / 2.0; double cov = 0; double sp = 0; @@ -1466,8 +1625,41 @@ double Feature::spearman(Point& a, Point& b) cov += (ip[i] - expected) * (iq[i] - expected); sp += (ip[i] 
- expected) * (ip[i] - expected); sq += (iq[i] - expected) * (iq[i] - expected); - } - return (N * cov) / (sp * sq); + } + + cout << "N: " << N << endl; + cout << "Cov: " << cov << endl; + cout << "Sp: " << sp << endl; + cout << "Sq: " << sq << endl; + + double results = (N * cov) / (sp * sq); + + return log(results); +} +*/ + + + +template +double Feature::spearman(Point& a, Point& b) +{ + vector ip = tiedrank(a); + vector iq = tiedrank(b); + const auto N = iq.size(); + + double expected = (N+1) / 2.0; + double cov = 0; + double sp = 0; + double sq = 0; + for (auto i = 0; i < N; i++) { + cov += (ip[i] - expected) * (iq[i] - expected); + sp += (ip[i] - expected) * (ip[i] - expected); + sq += (iq[i] - expected) * (iq[i] - expected); + } + + double result = 1 - ( cov / ( sqrt(sp) * sqrt(sq) )); + // cerr << result << endl; + return result; } template @@ -1515,13 +1707,25 @@ double Feature::c_d2s(Point& a, Point& b) { } } +// Modified by Hani Z. Girgis on Oct 07 2018 to enable comparing protein sequences +// Note: This feature cannot be used if k is 1. template double Feature::d2s(Point& a, Point& b) { const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); - const int k = (int)(log(N) / log(4)); + const auto A = Util::getAlphabetSize(); + + // Commented out by Hani Z Girgis and replaced by the line next to it. 
+ // const int k = (int)(log(N) / log(4)); + int k = a.getK(); + if(k==1){ + cerr << "D2s is skipped because it cannot be applied when k is 1."; + cerr << endl; + throw std::exception(); + } + const auto p1 = p.get_1mers(); const auto q1 = q.get_1mers(); const double pmag = p.getPseudoMagnitude(); @@ -1529,23 +1733,129 @@ double Feature::d2s(Point& a, Point& b) double sum = 0; for (size_t i = 0; i < N; i++) { double p1i = 1; - double q1i = 1; - size_t idx = i; + double q1i = 1; + size_t idx = i; for (int j = 0; j < k; j++) { - int i1 = idx % 4; - idx /= 4; + int i1 = idx % A; + idx /= A; p1i *= (double)p1[i1] / pmag; q1i *= (double)q1[i1] / qmag; } - double hp = p.points[i] - pmag * p1i; - double hq = q.points[i] - qmag * q1i; - if (hp != 0 && hq != 0) { - sum += hp * hq / hypot(hp, hq); + + // Post conditions the probabilities + if(p1i > 1 || p1i < 0){ + cerr << "p1i is too big or too small." << endl; + throw std::exception(); + } + if(q1i > 1 || q1i < 0){ + cerr << "pq1i is too big or too small." << endl; + throw std::exception(); + } + + //double hp = p.points[i] - pmag * p1i; + //double hq = q.points[i] - qmag * q1i; + double hp = p.points[i] - (p.getRealMagnitude() * p1i + 1); + double hq = q.points[i] - (q.getRealMagnitude() * q1i + 1); + double denom = hypot(hp, hq); + if (denom != 0 ) { + sum += (hp * hq) / denom; } } return sum; } +template +double Feature::c_d2_star(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_D2_star)); + if (ltable.find(tup) == ltable.end()) { + double val = d2_star(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +// Modified by Hani Z. Girgis on Oct 07 2018 to enable comparing protein sequences +// This method is rewriten based on the d2s code. +// Note: This feature cannot be used if k is 1. 
+template +double Feature::d2_star(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + const auto A = Util::getAlphabetSize(); + + // Commented out by Hani Z Girgis and replaced by the line next to it. + // const int k = (int)(log(N) / log(4)); + int k = a.getK(); + if(k==1){ + cerr << "D2_star cannot be applied when k is 1."; + cerr << endl; + throw std::exception(); + } + + const auto p1 = p.get_1mers(); + const auto q1 = q.get_1mers(); + const double pmag = p.getPseudoMagnitude(); + const double qmag = q.getPseudoMagnitude(); + const double pq_len = sqrt(p.getRealMagnitude() * q.getRealMagnitude()); + + double sum = 0; + for (size_t i = 0; i < N; i++) { + double p1i = 1; + double q1i = 1; + double pq1i = 1; + size_t idx = i; + for (int j = 0; j < k; j++) { + int i1 = idx % A; + idx /= A; + p1i *= (double) p1.at(i1) / pmag; + q1i *= (double) q1.at(i1) / qmag; + pq1i *= ((double) p1.at(i1) + q1.at(i1)) / (pmag + qmag); + } + + // Post conditions the probabilities + if(p1i > 1 || p1i < 0){ + cerr << "p1i is too big or too small." << endl; + throw std::exception(); + } + if(q1i > 1 || q1i < 0){ + cerr << "pq1i is too big or too small." << endl; + throw std::exception(); + } + if(pq1i > 1 || pq1i < 0){ + cerr << "pq1i is too big or too small." << endl; + throw std::exception(); + } + + double hp = p.points[i] - (p.getRealMagnitude() * p1i + 1); + double hq = q.points[i] - (q.getRealMagnitude() * q1i + 1); + double e = (p.getRealMagnitude() + q.getRealMagnitude()) * pq1i + 1; + + // Post conditions on the expected value + if(e > p.getRealMagnitude() + q.getRealMagnitude()){ + cerr << "E is too big." << endl; + throw std::exception(); + } + if(e < 0){ + cerr << "E is too small." 
<< endl; + throw std::exception(); + } + + double denom = e * pq_len; + if (denom > 0) { + sum += hp * hq / denom; + } + } + + return sum; +} + template double Feature::c_afd(Point& a, Point& b) { @@ -1561,37 +1871,54 @@ double Feature::c_afd(Point& a, Point& b) { } } +// Modified by Hani Z. Girgis to enable processing protein sequences on Oct 9 2018. +// Must be used when k = 2; otherwise, an exception is thrown. template double Feature::afd(Point& a, Point& b) { const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); - const int k = (int)(log(N) / log(4)); + const auto A = Util::getAlphabetSize(); + + const int k = a.getK(); + if(k != 2){ + cerr << "AFD cannot be calculated for k other than 2: Received: " << k << endl; + throw std::exception(); + } + const auto p1 = p.get_1mers(); const auto q1 = q.get_1mers(); const auto pmag = p.getPseudoMagnitude(); const auto qmag = q.getPseudoMagnitude(); + double sum = 0; - const auto nMinusOne = N / 4; - const auto nMinusTwo = nMinusOne / 4; + const auto nMinusOne = N / A; + const auto nMinusTwo = nMinusOne / A; int first_i = 0; for (auto i = 0; i < N; i += nMinusTwo) { -// 16 iterations total, iterating through all 2-mers + // 16 iterations total, iterating through all 2-mers uint64_t psum = 0, qsum = 0; for (auto j = i; j < i + nMinusTwo; j++) { - psum += p.points[j]; - qsum += q.points[j]; + psum += p.points.at(j); + qsum += q.points.at(j); } - double x = (double)psum / p1[first_i / 4]; - double y = (double)qsum / q1[first_i / 4]; + double x = (double)psum / p1.at(first_i / A); + double y = (double)qsum / q1.at(first_i / A); first_i++; + double diff = abs(x - y); + double unsquared = (diff * pow(1+diff, -14)); + // Hani Z. 
Girgis modified this line + // double unsquared = (diff * pow(1+diff, -2)); - - double diff = x - y; - double unsquared = (diff * pow(1+diff, -14)); sum += unsquared * unsquared; + + if(isinf(sum)){ + cerr << x << " " << y << " " << diff << " " << unsquared << endl; + throw std::exception(); + } } + return sum; } @@ -1685,57 +2012,62 @@ double Feature::kulczynski1(Point &a, Point &b) return sum; } -template -double Feature::c_d2_star(Point& a, Point& b) { - auto aid = a.get_id(); - auto bid = b.get_id(); - auto tup = std::tuple(aid, bid, Feature::log2(FEAT_D2_star)); - if (ltable.find(tup) == ltable.end()) { - double val = d2_star(a, b); - ltable.insert({tup, val}); - return val; - } else { - return ltable.at(tup); - } -} -template -double Feature::d2_star(Point& a, Point& b) -{ - const DivergencePoint& p = dynamic_cast&>(a); - const DivergencePoint& q = dynamic_cast&>(b); - const auto N = p.points.size(); - const int k = (int)(log(N) / log(4)); - const auto p1 = p.get_1mers(); - const auto q1 = q.get_1mers(); +// // Modified by Hani Z. Girgis on Oct 7 2018 to enable processing protine sequence. +// // Failed——needs understanding of the implementation. 
+// template +// double Feature::d2_star(Point& a, Point& b) +// { +// const DivergencePoint& p = dynamic_cast&>(a); +// const DivergencePoint& q = dynamic_cast&>(b); +// const auto N = p.points.size(); + +// // const int k = (int)(log(N) / log(4)); +// int k = a.getK(); +// if(k==1){ +// cerr << "D2s is skipped because it cannot be applied when k is 1."; +// cerr << endl; +// throw std::exception(); +// } +// const int Alpha = Util::getAlphabetSize(); + +// const auto p1 = p.get_1mers(); +// const auto q1 = q.get_1mers(); + +// const auto pmag = p.getPseudoMagnitude(); +// const auto qmag = q.getPseudoMagnitude(); +// double sum = 0; + +// vector tilde(Alpha, 0); +// for (int i = 0; i < Alpha; i++) { +// tilde[i] = (double)(p1[i] + q1[i]) / (pmag + qmag); +// cerr << "tilde[i]: " << tilde[i] << endl; +// } +// const double L = sqrt(pmag * qmag); +// for (auto i = 0; i < N; i++) { +// double p1i = 1; +// double q1i = 1; +// double tilde_i = 1; +// auto idx = i; +// for (int j = 0; j < k; j++) { +// auto i1 = idx % Alpha; +// idx /= Alpha; +// p1i *= (double)p1[i1] / pmag; +// q1i *= (double)q1[i1] / qmag; +// tilde_i *= tilde[i1]; +// } +// double hp = p.points[i] - pmag * p1i; +// double hq = q.points[i] - qmag * q1i; +// sum += hp * hq / (L * tilde_i); +// } + +// cerr << "L: " << L << endl; + +// return sum; +// } + - const auto pmag = p.getPseudoMagnitude(); - const auto qmag = q.getPseudoMagnitude(); - double sum = 0; - vector tilde(4, 0); - for (int i = 0; i < 4; i++) { - tilde[i] = (double)(p1[i] + q1[i]) / (pmag + qmag); - } - const double L = sqrt(pmag * qmag); - for (auto i = 0; i < N; i++) { - double p1i = 1; - double q1i = 1; - double tilde_i = 1; - auto idx = i; - for (int j = 0; j < k; j++) { - auto i1 = idx % 4; - idx /= 4; - p1i *= (double)p1[i1] / pmag; - q1i *= (double)q1[i1] / qmag; - tilde_i *= tilde[i1]; - } - double hp = p.points[i] - pmag * p1i; - double hq = q.points[i] - qmag * q1i; - sum += hp * hq / (L * tilde_i); - } - return sum; -} 
template double Feature::c_n2r(Point& a, Point& b) { @@ -1794,6 +2126,11 @@ double Feature::c_n2rc(Point& a, Point& b) { template double Feature::n2rc(Point& a, Point& b) const { + if(!Util::isDna){ + cerr << "n2rc cannot be calculated on protein sequences." << endl; + throw std::exception(); + } + const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); @@ -1815,6 +2152,14 @@ double Feature::n2rc(Point& a, Point& b) const return total; } +// template +// void Feature::safe_insert(std::tuple k, double v){ +// # pragma omp critical +// { +// ltable.insert({k, v}); +// } +// } + template class Feature; template class Feature; template class Feature; diff --git a/src/cluster/src/Feature.h b/src/predict/Feature.h similarity index 90% rename from src/cluster/src/Feature.h rename to src/predict/Feature.h index ba7f73e..ed18f17 100644 --- a/src/cluster/src/Feature.h +++ b/src/predict/Feature.h @@ -8,13 +8,25 @@ * shared indivual features can be shared through hashing if sequence * id's are set. */ -#ifndef FEATURES_H -#define FEATURES_H +#ifndef FEATURE_H +#define FEATURE_H + +// #include "SingleFeature.h" -#include "SingleFeature.h" #include #include #include +#include +#include +#include +#include +#include +#include + +#include "../clutil/DivergencePoint.h" +#include "../utility/GlobAlignE.h" + +using namespace std; #define FEAT_ALIGN (1UL << 0) #define FEAT_HELLINGER (1UL << 1) @@ -98,16 +110,19 @@ class Feature { Feature operator=(const Feature& feat_); Feature(const int k_) : k(k_) { flags = 0; + + // Modified by Hani Z. 
Girgis on Oct 9 2018 to enable processing protein auto freverse = [](int idx, int k) { int sum = 0; + const auto A = Util::getAlphabetSize(); for (int i = 0; i < k; i++) { - int rem = idx % 4; - idx /= 4; - sum = 4 * sum + rem; - + int rem = idx % A; + idx /= A; + sum = A * sum + rem; } return sum; }; + auto freverse_complement = [](int idx, int k) { std::vector v; for (int i = 0; i < k; i++) { @@ -121,13 +136,19 @@ class Feature { return sum; }; - uint64_t k4 = 1; + uint64_t k4_22 = 1; for (int i = 0; i < k; i++) { - k4 *= 4; + k4_22 *= Util::getAlphabetSize(); } - for (int i = 0; i < k4; i++) { + + for (int i = 0; i < k4_22; i++) { reverse.push_back(freverse(i, k)); - reverse_complement.push_back(freverse_complement(i, k)); + } + + if(Util::isDna){ + for (int i = 0; i < k4_22; i++) { + reverse_complement.push_back(freverse_complement(i, k)); + } } } void add_feature(uint64_t f_flags, Combo combo=Combo::xy); @@ -135,7 +156,7 @@ class Feature { vector feat_names(); static std::string feat_name(uint64_t single); void finalize(); - + // std::vector get_raw(const vector*,Point*> >&, int index) const; void remove_feature() { // Tear down features SPECIFIC to last pairing // auto indices_to_rm = combos.back().second; // combos.pop_back(); @@ -171,11 +192,16 @@ class Feature { } void normalize(const vector > &pairs); void set_normal(uint64_t single_flag, double min, double max); + pair get_normal(uint64_t single_flag) const; + vector compute(Point& p, Point& q) { vector cache = compute_all_raw(p, q); normalize_cache(cache); return cache; }; + + // This should be called on the singles, which can be calculated + // using the compute method double operator()(int col, const vector& cache) const { auto pr = combos.at(col); Combo combo = pr.first; @@ -306,6 +332,7 @@ class Feature { std::vector get_mins() const { return mins; }; std::vector get_maxs() const { return maxs; }; std::vector get_lookup() const { return lookup; }; + int get_k() const { return k; }; private: vector 
compute_all_raw(Point& p, Point& q); @@ -333,11 +360,8 @@ class Feature { std::vector get_sims() const { return is_sims; }; std::vector get_finalized() const { return is_finalized; }; + int k; - - - - int k; int get_k() const { return k; }; uint64_t flags; bool do_save; std::vector, double> atable; std::map, double> ltable; + const std::map, double>& get_ltable() const { return ltable; } + // Added by Hani Z. Girgis + // std::vector tiedrank(const Point& a); // std::map, double> * get_table() const { return ltable; } }; @@ -377,4 +404,7 @@ class Feature { // vector > features; // std::function)> combo; // }; + +//#include "Feature.cpp" + #endif diff --git a/src/predict/FeatureSelector.cpp b/src/predict/FeatureSelector.cpp new file mode 100644 index 0000000..01455a7 --- /dev/null +++ b/src/predict/FeatureSelector.cpp @@ -0,0 +1,110 @@ +// -*- C++ -*- +/* + * FeatureSelector.cpp + * + * Author: Benjamin T James + */ + +#include "FeatureSelector.h" +template +std::pair FeatureSelector::generate_feat_mat(const vector > &data, Feature& feat, double cutoff) +{ + bool classify = (cutoff > 0); + int nrows = data.size(); + int ncols = feat.size()+1; + matrix::Matrix feat_mat(nrows, ncols); + matrix::Matrix labels(nrows, 1); +// #pragma omp parallel for + for (int row = 0; row < data.size(); row++) { + auto kv = data.at(row); + vector cache; + // #pragma omp critical + // { + cache = feat.compute(*kv.first, *kv.second); + // } + feat_mat.set(row, 0, 1); + if (classify) { + labels.set(row, 0, kv.val >= cutoff ? 
1 : -1); + } else { + labels.set(row, 0, kv.val); + // labels.set(row, 0, (kv.val - smin) / (smax - smin)); + } + for (int col = 1; col < ncols; col++) { + double val = feat(col-1, cache); + feat_mat.set(row, col, val); + } + } + return std::make_pair(feat_mat, labels); +} + + +template +std::pair FeatureSelector::regression_train(const vector > &data, Feature& feat) +{ + auto pr = generate_feat_mat(data, feat, -1); + matrix::GLM glm; + glm.train(pr.first, pr.second); + auto result1 = pr.first * glm.get_weights(); + auto diff1 = result1 - pr.second; + double sum = 0; + for (int i = 0; i < diff1.getNumRow(); i++) { + sum += fabs(diff1.get(i, 0)); + } + sum /= diff1.getNumRow(); + return {sum, glm}; +} + +template +std::pair FeatureSelector::class_train(const vector > &data, Feature& feat, double cutoff) +{ + auto pr = generate_feat_mat(data, feat, cutoff); + matrix::GLM glm; + glm.train(pr.first, pr.second); + matrix::Matrix p = glm.predict(pr.first); + for (int row = 0; row < p.getNumRow(); row++) { + if (p.get(row, 0) == 0) { + p.set(row, 0, -1); + } + } + auto tup = glm.accuracy(pr.second, p); + double acc = get<0>(tup); + double sens = get<1>(tup); + double spec = get<2>(tup); + return {acc, glm}; +} + +template +double FeatureSelector::regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm) +{ + auto pr = generate_feat_mat(data, feat, -1); + auto result1 = pr.first * glm.get_weights(); + auto diff1 = result1 - pr.second; + double sum = 0; + for (int i = 0; i < diff1.getNumRow(); i++) { + sum += fabs(diff1.get(i, 0)); + } + sum /= diff1.getNumRow(); + return sum; +} + +template +tuple FeatureSelector::class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff) +{ + auto pr = generate_feat_mat(data, feat, cutoff); + matrix::Matrix p = glm.predict(pr.first); + for (int row = 0; row < p.getNumRow(); row++) { + if (p.get(row, 0) == 0) { + p.set(row, 0, -1); + } + } + auto tup = glm.accuracy(pr.second, p); + return tup; 
+ +} + +template class FeatureSelector; +template class FeatureSelector; +template class FeatureSelector; +template class FeatureSelector; +template class FeatureSelector; +template class FeatureSelector; diff --git a/src/predict/FeatureSelector.h b/src/predict/FeatureSelector.h new file mode 100644 index 0000000..1d96de4 --- /dev/null +++ b/src/predict/FeatureSelector.h @@ -0,0 +1,27 @@ +// -*- C++ -*- +/* + * FeatureSelector.h + * + * Author: Benjamin T James + */ + +#ifndef FEATURE_SELECTOR_H +#define FEATURE_SELECTOR_H + +#include "GLM.h" +#include "Feature.h" + +template +class FeatureSelector { +public: + virtual ~FeatureSelector() {}; + static std::pair generate_feat_mat(const vector > &data, Feature& feat, double cutoff); + static std::pair class_train(const vector > &data, Feature& feat, double cutoff); + static std::pair regression_train(const vector > &data, Feature& feat); + static double regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm); + static tuple class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff); + + virtual pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing) = 0; + virtual pair*,matrix::GLM> train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id) = 0; +}; +#endif diff --git a/src/cluster/src/GLM.cpp b/src/predict/GLM.cpp similarity index 89% rename from src/cluster/src/GLM.cpp rename to src/predict/GLM.cpp index f5ef4ba..d4f37d1 100644 --- a/src/cluster/src/GLM.cpp +++ b/src/predict/GLM.cpp @@ -22,13 +22,18 @@ void GLM::train(Matrix& features, Matrix& labels){ weights = weights.pseudoInverse() * features.transpose() * labels; } + +double GLM::logistic(double x) +{ + return 1.0 / (1 + exp(-x)); +} Matrix GLM::predict(Matrix& features) const { Matrix labels; labels = features * weights; double log; for(int i = 0; i < labels.getNumRow(); i++){ - log = round(1/(1 + exp(-(labels.get(i,0))))); - 
labels.set(i,0, log); + //log = round(1/(1 + exp(-(labels.get(i,0)))) + 0.1); + labels.set(i,0, round(logistic(labels.get(i, 0)))); } return labels; } diff --git a/src/cluster/src/GLM.h b/src/predict/GLM.h similarity index 91% rename from src/cluster/src/GLM.h rename to src/predict/GLM.h index d9e150b..868dc84 100644 --- a/src/cluster/src/GLM.h +++ b/src/predict/GLM.h @@ -22,6 +22,8 @@ class GLM { void load(Matrix weights_) { weights = weights_; } void train(matrix::Matrix& features, matrix::Matrix& labels); Matrix predict(matrix::Matrix& features) const; + static double logistic(double x); + static double linear(double x); std::tuple accuracy(matrix::Matrix& oLabels, matrix::Matrix& pLabels) const; const Matrix& get_weights() const { return weights; }; }; diff --git a/src/predict/GreedySelector.cpp b/src/predict/GreedySelector.cpp new file mode 100644 index 0000000..7ec73df --- /dev/null +++ b/src/predict/GreedySelector.cpp @@ -0,0 +1,154 @@ +/* -*- C++ -*- */ +/* + * GreedySelector.cpp + * + * Author: Benjamin T James + */ +#include "GreedySelector.h" +#include "../clutil/Progress.h" + +template +pair*,matrix::GLM> GreedySelector::train_regression(Feature* feat, const vector > &training,const vector > &testing) +{ + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + vector used_list; + double abs_best_regr = 1000000; +// Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); + for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { + double best_regr_err = abs_best_regr; + uintmax_t best_idx = -1, cur_idx = 1; + auto best_regr_feat = possible_feats.front(); + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { + continue; + } + auto rfeat = possible_feats[i]; + feat->add_feature(rfeat.first, rfeat.second); + feat->normalize(training); + feat->finalize(); + auto pr = 
FeatureSelector::regression_train(training, *feat); + auto name = feat->feat_names().back(); + double regr_mse = FeatureSelector::regression_test(testing, *feat, pr.second); + feat->remove_feature(); + // prog++; + //cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " err: " << regr_mse << endl; + if (regr_mse < best_regr_err) { + best_regr_err = regr_mse; + best_regr_feat = rfeat; + best_idx = i; + } + } + if (best_regr_err < abs_best_regr) { + feat->add_feature(best_regr_feat.first, best_regr_feat.second); + feat->normalize(training); + feat->finalize(); + abs_best_regr = best_regr_err; + used_list.push_back(best_idx); + //possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_regr_feat), possible_feats.end()); + } + } +// prog.end(); + + Feature* feat_r = new Feature(*feat); + feat_r->set_save(false); + auto pr = FeatureSelector::regression_train(training, *feat_r); + matrix::GLM r_glm = pr.second; + double tr_regr_mse = FeatureSelector::regression_test(testing, *feat_r, r_glm); // "training" + cout << "Training Mean Error: " << pr.first << endl; + double regr_mse = FeatureSelector::regression_test(testing, *feat_r, r_glm);//, "testing"); + cout << "Testing Mean Error: " << regr_mse << endl; + cout << "Features: "<< endl; + for (auto line : feat_r->feat_names()) { + cout << "\t" << line << endl; + } + auto w = r_glm.get_weights(); + for (int r = 0; r < w.getNumRow(); r++) { + cout << "weight: "; + for (int c = 0; c < w.getNumCol(); c++) { + cout << w.get(r, c) << " "; + } + cout << endl; + } + +} + +template +std::pair*,matrix::GLM> GreedySelector::train_class(Feature* feat, const vector > &training,const vector > &testing, double id) +{ + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + vector used_list; + double abs_best_acc = 0; +// cout << "possible feats at one step: " << 
possible_feats.size() << endl; + Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); + + std::ostringstream oss; + for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { + double best_class_acc = abs_best_acc; + uintmax_t best_idx = -1, cur_idx = 1; + auto best_class_feat = possible_feats.front(); + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { + continue; + } + auto rfeat = possible_feats[i]; + feat->add_feature(rfeat.first, rfeat.second); + feat->normalize(training); + feat->finalize(); + auto name = feat->feat_names().back(); + auto pr = FeatureSelector::class_train(training, *feat, id); + auto class_ac = FeatureSelector::class_test(testing, *feat, pr.second, id); + double class_accuracy = get<0>(class_ac);//sqrt(get<1>(class_ac) * get<2>(class_ac)); + feat->remove_feature(); + prog++; +// cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " acc: " << get<0>(class_ac) << " sens: " << get<1>(class_ac) << " spec: " << get<2>(class_ac) << endl; + if (class_accuracy > best_class_acc) { + best_class_acc = class_accuracy; + best_class_feat = rfeat; + best_idx = i; + } + } + /* accept the feature if either 1. we don't have enough features + * or 2. 
it improves accuracy by over 0.5% + */ + if (best_class_acc > abs_best_acc || num_feat <= min_num_feat) { + feat->add_feature(best_class_feat.first, best_class_feat.second); + feat->normalize(training); + feat->finalize(); + abs_best_acc = best_class_acc; + used_list.push_back(best_idx); + oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl; + oss << "Accuracy: " << best_class_acc << endl; + possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end()); + } + } + prog.end(); + cout << oss.str(); + Feature* feat_c = new Feature(*feat); + feat_c->set_save(false); + auto pr = FeatureSelector::class_train(training, *feat_c, id); + matrix::GLM c_glm = pr.second; + auto train_results = FeatureSelector::class_test(training, *feat_c, c_glm, id);//, "train"); + cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl; + auto test_results = FeatureSelector::class_test(testing, *feat_c, c_glm, id);//, "test"); + double class_acc = get<0>(test_results); + cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl; + + cout << "Features: "<< endl; + for (auto line : feat_c->feat_names()) { + cout << "\t" << line << endl; + } + return std::make_pair(feat_c, c_glm); +} + +template class GreedySelector; +template class GreedySelector; +template class GreedySelector; +template class GreedySelector; +template class GreedySelector; +template class GreedySelector; diff --git a/src/predict/GreedySelector.h b/src/predict/GreedySelector.h new file mode 100644 index 0000000..5d4bc2f --- /dev/null +++ b/src/predict/GreedySelector.h @@ -0,0 +1,23 @@ +/* -*- C++ -*- */ +/* + * GreedySelector.h + * + * Author: Benjamin T James + */ + +#ifndef GREEDY_SELECTOR_H +#define GREEDY_SELECTOR_H +#include "FeatureSelector.h" + +template +class GreedySelector : public FeatureSelector { 
+public: + GreedySelector(vector > possible_feats_, int min_n_feat, int max_n_feat) : possible_feats(possible_feats_), min_num_feat(min_n_feat), max_num_feat(max_n_feat) {} + ~GreedySelector() {} + pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing); + pair*,matrix::GLM> train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id); +private: + int max_num_feat, min_num_feat; + vector > possible_feats; +}; +#endif diff --git a/src/cluster/src/HandleSeq.cpp b/src/predict/HandleSeq.cpp similarity index 89% rename from src/cluster/src/HandleSeq.cpp rename to src/predict/HandleSeq.cpp index 041c22a..f9e7f2f 100644 --- a/src/cluster/src/HandleSeq.cpp +++ b/src/predict/HandleSeq.cpp @@ -1,5 +1,6 @@ /** * Author: Alex Baumgartner + * Modified by Benjamin T James * The Bioinformatics Toolsmith Laboratory, the University of Tulsa * 5/15/2018 * @@ -11,11 +12,12 @@ #include "HandleSeq.h" #include // d -HandleSeq::HandleSeq(int m) { +HandleSeq::HandleSeq(int m, std::random_device::result_type rnd) { mode = m & HandleSeq::BOTH; enableTrans = m & HandleSeq::TRANSLOCATION; enableRev = m & HandleSeq::REVERSION; + random = new LCG(rnd); // disable = (m & HandleSeq::ATYPICAL) > 0 ? 
0 : 1; } @@ -71,7 +73,7 @@ pair, vector> HandleSeq::parseFile(string fileName) { } } -pair HandleSeq::mutate(string sequence, int muteRate) { +pair HandleSeq::mutate(string sequence, int muteRate, int split) { percMute = muteRate; if (muteRate == 0) { return std::make_pair(1, sequence); @@ -99,12 +101,13 @@ pair HandleSeq::mutate(string sequence, int muteRate) { } //Otherwise, assing a random percentage to both else { - percMulti = rand() % percMute; + percMulti = split; +// percMulti = random.randMod(percMute); percSing = percMute - percMulti; } //Define a new multiple mutation MultiMute multi(percAs, percCs, percGs, percTs, - percMulti, enableTrans, enableRev); + percMulti, enableTrans, enableRev, random->nextRandSeed()); //Run the multiple mutations, //get back its vector of what is valid to mutate and what isn't vector mutes = multi.genMulti(seq); @@ -112,9 +115,12 @@ pair HandleSeq::mutate(string sequence, int muteRate) { for (bool b : mutes) { cnt += b ? 1 : 0; } - + if (mutes.size() != seq->length()) { + cerr << "mutation size is not matching the multi-sequence" << endl; + throw 100; + } SingMute sing(percAs, percCs, percGs, percTs, - percSing, seq, mutes); + percSing, seq, mutes, random->nextRandSeed()); float alignmentLength = multi.getAlignmentLength() + sing.getAlignmentLength() + length; // cout << "alignLength: " << alignmentLength << endl; float IBP = length - multi.getIBP() - sing.getIBP(); @@ -130,7 +136,7 @@ pair HandleSeq::mutate(string sequence, int muteRate) { return make_pair(alignment, outseq); } -vector HandleSeq::countNucl(string sequence) { +vector HandleSeq::countNucl(const string& sequence) { int a = 0; int c = 0; int g = 0; diff --git a/src/cluster/src/HandleSeq.h b/src/predict/HandleSeq.h similarity index 83% rename from src/cluster/src/HandleSeq.h rename to src/predict/HandleSeq.h index 95a7718..f75ac0f 100644 --- a/src/cluster/src/HandleSeq.h +++ b/src/predict/HandleSeq.h @@ -1,5 +1,6 @@ /** * Author: Alex Baumgartner + * Modified by 
Benjamin T James * The Bioinformatics Toolsmith Laboratory, the University of Tulsa * 5/15/2018 * @@ -14,6 +15,7 @@ #include #include #include +#include "LCG.h" #include "MultiMute.h" #include "SingMute.h" @@ -39,7 +41,8 @@ class HandleSeq { int: the mode of the program (Single only = 1, nonsingle only = 2, both = 3) */ - HandleSeq(int); + HandleSeq(int, std::random_device::result_type seed); + ~HandleSeq() { if (random != NULL) { delete random; }} /* returns a vector of all sequences in a file inputted @@ -55,11 +58,15 @@ class HandleSeq { Mutates a sequence based on parameters inputted in constructor, and returns the mutated sequence */ - pair mutate(string, int); + pair mutate(string, int, int); + + uint32_t getSeed() const { return seed; } private: + uint32_t seed; int mode; int percMute; bool enableTrans, enableRev; + LCG *random = NULL; /* Counts the nucleotides in a file, and returns a vector corresponding to their values {A, C, G, T} @@ -70,7 +77,7 @@ class HandleSeq { @return: std::vector: vector containing ints of each nucleotide count */ - vector countNucl(string); + vector countNucl(const string&); }; diff --git a/src/cluster/src/Matrix.cpp b/src/predict/Matrix.cpp similarity index 97% rename from src/cluster/src/Matrix.cpp rename to src/predict/Matrix.cpp index 997d1c7..f1055c7 100644 --- a/src/cluster/src/Matrix.cpp +++ b/src/predict/Matrix.cpp @@ -20,6 +20,13 @@ using namespace std; namespace matrix { +Matrix::Matrix(vector vec) : numRow(1), numCol(vec.size()) { + m.at(0) = vector(vec.size()); + for (int i = 0; i < vec.size(); i++) { + set(0, i, vec[i]); + } +} + Matrix::Matrix(int r, int c) : numRow(r), numCol(c) { m.resize(r); @@ -142,7 +149,6 @@ Matrix Matrix::gaussJordanInverse() { } } else {//If it cannot perform a type 1 row swap with a non zero pivot value, the Inverse does not exist. 
cout << "Inverse does not exist\n"; - throw 0; m = temp.m; return temp; } @@ -183,13 +189,11 @@ Matrix Matrix::gaussJordanInverse() { for (int j = 0; j < numCol; j++) { if (i == j && get(i, j) != 1) { cout << "Inverse does not exist\n"; - throw 0; m = temp.m; return temp; } if (i != j && get(i, j) != 0) { cout << "Inverse does not exist\n"; - throw 0; m = temp.m; return temp; } diff --git a/src/cluster/src/Matrix.h b/src/predict/Matrix.h similarity index 90% rename from src/cluster/src/Matrix.h rename to src/predict/Matrix.h index 46a73a6..6aaffa1 100644 --- a/src/cluster/src/Matrix.h +++ b/src/predict/Matrix.h @@ -3,6 +3,7 @@ * * Created on: May 10, 2017 * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa + * Modified by Benjamin T James */ @@ -23,7 +24,7 @@ class Matrix public: - + Matrix(std::vector m); Matrix(int r, int c); Matrix(); ~Matrix(); @@ -47,6 +48,7 @@ class Matrix void rowToVector(int, std::vector&); void colToVector(int, std::vector&); int getNumRow() const; + int getNumCol() const { return numCol; }; }; } #endif /* MATRIX_H_ */ diff --git a/src/cluster/src/MultiMute.cpp b/src/predict/MultiMute.cpp similarity index 90% rename from src/cluster/src/MultiMute.cpp rename to src/predict/MultiMute.cpp index 73ee242..5acef58 100644 --- a/src/cluster/src/MultiMute.cpp +++ b/src/predict/MultiMute.cpp @@ -1,5 +1,6 @@ /** * Author: Alex Baumgartner + * Modified by Benjamin T James * The Bioinformatics Toolsmith Laboratory, the University of Tulsa * 5/15/2018 * @@ -11,7 +12,7 @@ #include #include "Random.h" -MultiMute::MultiMute(int a, int c, int g, int t, int alloc, bool enableTrans, bool enableRev) +MultiMute::MultiMute(int a, int c, int g, int t, int alloc, bool enableTrans, bool enableRev, std::random_device::result_type seed) : rng(seed) { percAs = a; percCs = c; @@ -38,22 +39,26 @@ MultiMute::MultiMute(int a, int c, int g, int t, int alloc, bool enableTrans, bo maxDel = 0; } else if (enableTrans) { if (alloc > 1) { 
- maxTrans = rand() % alloc; + maxTrans = rng.randMod(alloc); +// maxTrans = rand() % alloc; alloc -= maxTrans; } } else if (enableRev) { if (alloc > 1) { - maxReverse = rand() % alloc; + maxReverse = rng.randMod(alloc); +// maxReverse = rand() % alloc; alloc -= maxReverse; } } if (alloc > 1) { - maxDel = (rand() % alloc); + maxDel = rng.randMod(alloc); +// maxDel = (rand() % alloc); alloc -= maxDel; } if (alloc > 0) { - maxDup = rand() % alloc; + maxDup = rng.randMod(alloc); +// maxDup = rand() % alloc; alloc -= maxDup; } else { maxDup = 0; @@ -167,14 +172,15 @@ vector MultiMute::genMulti(string * sequence) void MultiMute::reverse(vector * toAddTo) { //Keep forming strings until the allocation of reverse is used up - int size; + int64_t size; //cout << "maxReverse: " << maxReverse << endl; while (maxReverse > 0) { //Automatically make it 2 to avoid modulus error - if (maxReverse == 2) { - size = 2; + if (maxReverse <= 2) { + size = maxReverse; } else { - size = (rand() % (maxReverse - 2)) + 2; + size = rng.randMod(maxReverse - 2) + 2; +// size = (rand() % (maxReverse - 2)) + 2; //Add 1 to size if the remaining reverse allocation would be 1 if (maxReverse - size == 1) { size++; @@ -190,12 +196,12 @@ void MultiMute::reverse(vector * toAddTo) void MultiMute::translocate(vector * toAddTo) { - int size; + int size = 0; //Keep forming strings until the allocation of Translocate is used up while (maxTrans > 0) { //Automatically make it 2 to avoid modulus error - if (maxTrans == 2) { - size = 2; + if (maxTrans <= 2) { + size = maxTrans; } else { size = rng.randMod(std::min(max_block_size, maxTrans - 2)) + 2; //Add 1 to size if the remaining reverse allocation would be 1 @@ -214,12 +220,13 @@ void MultiMute::translocate(vector * toAddTo) void MultiMute::insert(vector * toAddTo) { - int size; + long size = 0; + const int initial_maxInsert = maxInsert; //Keep forming strings until the allocation of insert is used up while (maxInsert > 0) { //Automatically make it 2 to avoid 
modulus error - if (maxInsert == 2) { - size = 2; + if (maxInsert <= 2) { + size = maxInsert; } else { // size = (rand() % (maxInsert - 2)) + 2; size = rng.randMod(std::min(max_block_size, maxInsert - 2)) + 2; @@ -231,6 +238,10 @@ void MultiMute::insert(vector * toAddTo) // cout << "maxInsert=" << maxInsert << " insert " << size << endl; //Add an I for where to insert, and add a generated string to the insetions vector toAddTo->push_back("I"); + if (size < 0) { + cerr << "insert is " << size << endl; + throw std::exception(); + } insertions->push_back(genInsert(size)); maxInsert -= size; } @@ -238,12 +249,12 @@ void MultiMute::insert(vector * toAddTo) void MultiMute::deleteNucl(vector * toAddTo) { - int size; + int size = 0; //Keep forming strings until the allocation of deletion is used up while (maxDel > 0) { //Automatically make it 2 to avoid modulus error - if (maxDel == 2) { - size = 2; + if (maxDel <= 2) { + size = maxDel; } else { size = rng.randMod(std::min(max_block_size, maxDel - 2)) + 2; //size = (rand() % (maxDel - 2)) + 2; @@ -262,12 +273,12 @@ void MultiMute::deleteNucl(vector * toAddTo) void MultiMute::duplicate(vector * toAddTo) { - int size; + int size = 0; //Keep forming strings until the allocation of duplicate is used up while (maxDup > 0) { //Automatically make it 2 to avoid modulus error - if (maxDup == 2) { - size = 2; + if (maxDup <= 2) { + size = maxDup; } else { size = rng.randMod(std::min(max_block_size, maxDup - 2)) + 2; // size = (rand() % (maxDup - 2)) + 2; @@ -302,7 +313,8 @@ string MultiMute::genInsert(int size) int value; //Keep adding characters based on the original distribution of nucleotides for (int i = 0; i < size; i++) { - value = rand() % (percAs + percCs + percGs + percTs); + value = rng.randMod(percAs + percCs + percGs + percTs); +// value = rand() % (percAs + percCs + percGs + percTs); if (value < percAs) { toInsert.push_back('A'); } else if (value < percAs + percCs) { @@ -340,7 +352,11 @@ vector 
MultiMute::formatString(int maxSize, vector * mutationsChars) //If it is an I, get the next insertion string and append it to the back of the mutaton string, as long as the insertion vector still has stuffing else if (mutationsChars->at(j) == 'I') { if (insertions->size() > 0) { - temp.append(insertions->back()); + std::string ins = insertions->back(); + temp.append(ins); + for (char c : ins) { + validCharacters.push_back(false); + } insertions->pop_back(); } //Increment only the char vector @@ -358,9 +374,12 @@ vector MultiMute::formatString(int maxSize, vector * mutationsChars) } //I and J are not incremented because they are incremented in the loop temp.append(temp2); - } + // } else if (mutationChars->at(j) == 'X') { + // i++; + // j++; + // } //Otherwise, skip over the nuleotide - else { + } else { i++; j++; } @@ -449,7 +468,8 @@ void MultiMute::checkForAllPalindromes(vector * toParseFrom) { } //Insert enough I's randomly for the amount of transversals that replaced reversals for (int i = 0; i < insertionChanges; i++) { - int index = rand() % toParseFrom->size(); + int index = rng.randMod(toParseFrom->size()); +// int index = rand() % toParseFrom->size(); toParseFrom->insert(toParseFrom->begin() + index, "I"); } } diff --git a/src/cluster/src/MultiMute.h b/src/predict/MultiMute.h similarity index 95% rename from src/cluster/src/MultiMute.h rename to src/predict/MultiMute.h index 8d27d6e..1e5dc4b 100644 --- a/src/cluster/src/MultiMute.h +++ b/src/predict/MultiMute.h @@ -1,5 +1,6 @@ /** * Author: Alex Baumgartner + * Modified by Benjamin T James * The Bioinformatics Toolsmith Laboratory, the University of Tulsa * 5/15/2018 * @@ -8,7 +9,7 @@ */ #ifndef MULTIMUTE_H -#define MULTIMUTE_H +#define MULTIMUTE_H #include #include @@ -16,7 +17,7 @@ #include #include #include "Random.h" - +#include "LCG.h" using namespace std; class MultiMute { @@ -33,7 +34,7 @@ class MultiMute { int: The total allocation for non-single mutations int: bool to exclude Translocate and 
reverse, 1 for disable, any other umber for include */ - MultiMute(int, int, int, int, int, bool, bool); + MultiMute(int, int, int, int, int, bool, bool, std::random_device::result_type); /* Takes in a string pointer, and mutates it based on the allocation given to the constructor. @@ -64,7 +65,7 @@ class MultiMute { int64_t alignmentLength; int64_t IBP; int64_t total_alloc; - Random rng; + LCG rng; int64_t max_block_size; std::vector * insertions; diff --git a/src/predict/Predictor.cpp b/src/predict/Predictor.cpp new file mode 100644 index 0000000..79d99d6 --- /dev/null +++ b/src/predict/Predictor.cpp @@ -0,0 +1,992 @@ +/* -*- C++ -*- + * + * Predictor.cpp + * + * Author: Benjamin T James + * + * Predictor implementation class + * train(vector<>...) is entry point, generates "semi-synthetic" sequences + * train() actually trains applicable GLM's. + * close() and similarity() are callable once trained + */ +#include "Predictor.h" +#include "../clutil/LCG.h" +#include "../clutil/Loader.h" +#include "Matrix.h" +#include "HandleSeq.h" +#include "../clutil/Progress.h" +#include "../clutil/Random.h" +#include "../clutil/Clock.h" +#include "../clutil/Datatype.h" +#include +#include +#include "FeatureSelector.h" +#include "BestFirstSelector.h" +#include "GreedySelector.h" + +template +void Predictor::save(std::string file, std::string datatype) +{ + std::ofstream out(file); + out << "k: " << k << endl; + out << "mode: " << (unsigned int)mode << endl; + out << "max_features: " << max_num_feat << endl; + out << "ID: " << id << endl; + out << "Datatype: " << datatype << endl; + out << "feature_set: " << feats64 << endl; + if (mode & PRED_MODE_CLASS) { + write_to(out, feat_c, c_glm); + } + if (mode & PRED_MODE_REGR) { + write_to(out, feat_r, r_glm); + } + +} + +template +Predictor::Predictor(const std::string filename) +{ + std::ifstream in(filename); + std::string buf; + unsigned mode_ = 0; + in >> buf >> k; + //cout << buf << k << endl; + in >> buf >> mode_; + mode = mode_; 
+// cout << buf << mode << endl; + in >> buf >> max_num_feat; +// cout << buf << max_num_feat << endl; + in >> buf >> id; +// cout << buf << id << endl; + in >> buf >> datatype; +// cout << buf << datatype << endl; + in >> buf >> feats64; +// cout << buf << feats64 << endl; + + is_trained = true; + is_training = false; + if (mode & PRED_MODE_CLASS) { + auto pr = read_from(in, k); + c_glm = pr.first; + feat_c = pr.second; + } + if (mode & PRED_MODE_REGR) { + auto pr = read_from(in, k); + r_glm = pr.first; + feat_r = pr.second; + } + Datatype::set(datatype); +} + +template +void Predictor::write_to(std::ofstream &out, Feature* feat, matrix::GLM glm) +{ + auto combos = feat->get_combos(); + auto lookup = feat->get_lookup(); + auto mins = feat->get_mins(); + auto maxs = feat->get_maxs(); + out << std::endl << "n_combos: " << combos.size() << std::endl; + out << std::setprecision(std::numeric_limits::digits10) << glm.get_weights().get(0, 0) << endl; + for (int j = 0; j < combos.size(); j++) { + auto cmb = combos[j]; + unsigned int val = 0; + uint64_t flags = 0; + for (auto i : cmb.second) { + flags |= lookup[i]; + } + switch (cmb.first) { + case Combo::xy: + val = 0; + break; + case Combo::xy2: + val = 1; + break; + case Combo::x2y: + val = 2; + break; + case Combo::x2y2: + val = 3; + break; + } + out << val << " "; + out << flags << " "; + out << std::setprecision(std::numeric_limits::digits10) << glm.get_weights().get(j+1, 0) << std::endl; + } + out << std::endl << "n_singles: " << lookup.size() << std::endl; + for (int j = 0; j < lookup.size(); j++) { + out << lookup[j] << " "; + out << std::setprecision(std::numeric_limits::digits10) << mins[j] << " "; + out << std::setprecision(std::numeric_limits::digits10) << maxs[j] << std::endl; + } +} + + +template +pair*> Predictor::read_from(std::ifstream& in, int k_) +{ + matrix::GLM glm; + int c_num_raw_feat, c_num_combos; + Feature *feat = new Feature(k_); + std::string buf; + in >> buf >> c_num_combos; +// cout << buf << 
"\"" << c_num_combos << "\"" << endl; + matrix::Matrix weights(c_num_combos+1, 1); + double d_; + in >> d_; + weights.set(0, 0, d_); + for (int i = 0; i < c_num_combos; i++) { + int cmb; + in >> cmb; + // cout << (int)cmb << endl; + uint64_t flags; + in >> flags; +// cout << flags << endl; + double d; + in >> d; +// cout << "[" << 0 << "," << i << "] " << d << endl; + weights.set(i+1, 0, d);//push_back(d); + Combo cmb_ = Combo::xy; + switch (cmb) { + case 0: + cmb_ = Combo::xy; + break; + case 1: + cmb_ = Combo::xy2; + break; + case 2: + cmb_ = Combo::x2y; + break; + case 3: + cmb_ = Combo::x2y2; + break; + default: + cerr << "error reading weights file" << endl; + break; + } + feat->add_feature(flags, cmb_); + } + + in >> buf >> c_num_raw_feat; +// cout << buf << "\"" << c_num_raw_feat << "\"" << endl; + for (int i = 0; i < c_num_raw_feat; i++) { + uint64_t single_flag; + double min_, max_; + in >> single_flag; +// cout << single_flag << endl; + in >> min_; +// cout << min_ << endl; + in >> max_; +// cout << max_ << endl; + feat->set_normal(single_flag, min_, max_); + } + feat->finalize(); + glm.load(weights); + return {glm, feat}; +} + +void identities_for_gen(double id_begin, double id_end, int num_seq, LCG& rnd, vector &to_ret) +{ + double inc = (id_end - id_begin) / num_seq; + for (size_t i = 0; i < num_seq; i++) { + double iter_id = id_begin + inc * (i + 0.5); + double actual_id = rnd.rand_between(iter_id, inc, id_begin, id_end); + int mut = round(100 - actual_id); + mut = (mut == 0) ? 
1 : mut; + to_ret.push_back(mut); + } +} + +template +void Predictor::add_feats(std::vector >& vec, uint64_t feat_flags) +{ + for (uint64_t i = 1; i <= feat_flags; i *= 2) { + if ((i & feat_flags) == 0) { + continue; + } + for (uint64_t j = 1; j <= i; j *= 2) { + if ((j & feat_flags) == 0) { + continue; + } + vec.emplace_back(i | j, Combo::xy); + vec.emplace_back(i | j, Combo::x2y2); + if (i != j) { + vec.emplace_back(i | j, Combo::x2y); + vec.emplace_back(i | j, Combo::xy2); + } + } + } +} +template +void Predictor::check() +{ + // if (!is_trained && training.size() >= threshold && !is_training) { + // omp_set_lock(&lock); + // is_training = true; + // train(); + // is_training = false; + // omp_unset_lock(&lock); + // } +} +template +double Predictor::similarity(Point* a, Point* b) +{ + if (!is_trained) { +// double d = Selector::align(a, b); + cerr << "alignment: we don't do that here" << endl; + throw "Bad"; + // return d; + // if (!is_training) { + // omp_set_lock(&lock); + // if (training.size() < testing.size() && training.size() < threshold) { + // training.push_back(pra(a, b, d)); + // } else if (training.size() >= testing.size() && testing.size() < threshold) { + // testing.push_back(pra(a, b, d)); + // } + // omp_unset_lock(&lock); + // } + return 0; + + } else { + return predict(a, b); + } +} + +template +bool Predictor::close(Point *a, Point *b) +{ + if (!is_trained) { +// double d = Selector::align(a, b); + cerr << "alignment shouldn't be used here" << endl; + throw "bad"; + // if (!is_training) { + // omp_set_lock(&lock); + // if (training.size() < testing.size() && training.size() < threshold) { + // training.push_back(pra(a, b, d)); + // } else if (training.size() >= testing.size() && testing.size() < threshold) { + // testing.push_back(pra(a, b, d)); + // } + // omp_unset_lock(&lock); + // } +// return d > id; + return false; + } + bool val = p_close(a, b); + if ((mode & PRED_MODE_REGR) && val) { + // val = p_predict(a, b) > id; + // if (!val) { + 
// cout << "FIXED" << endl; + // } + } + return val; +} + +template +double Predictor::p_predict(Point* a, Point* b) +{ + auto cache = feat_r->compute(*a, *b); + auto weights = r_glm.get_weights(); + double sum = weights.get(0, 0); + for (int col = 0; col < feat_r->size(); col++) { + double val = (*feat_r)(col, cache); + sum += weights.get(col+1, 0) * val; + } +// sum = scale_min + (scale_max - scale_min) * sum; + if (sum < 0) { + sum = 0; + } else if (sum > 1) { + sum = 1; + } + return sum; +} +template +double Predictor::predict(Point* a, Point* b) +{ + return p_predict(a, b); +} + +double _bias = 0; +//double _bias = 0; + +template +void Predictor::set_bias(double b) +{ + _bias = b; +} +template +double Predictor::classify_sum(double sum) +{ +// cout << "Bias is " << _bias << endl; + return matrix::GLM::logistic(sum) + _bias; +} + +template +bool Predictor::p_close(Point* a, Point* b) +{ + auto weights = c_glm.get_weights(); + double sum = weights.get(0, 0); + auto cache = feat_c->compute(*a, *b); + for (int col = 1; col < weights.getNumRow(); col++) { + double d = (*feat_c)(col-1, cache); + sum += weights.get(col, 0) * d; + } + return round(classify_sum(sum)) > 0; +} + + +template +std::pair generate_feat_mat(const vector > &data, Feature& feat, double cutoff, bool do_print=false)//bool classify, double cutoff, double smin, double smax) +{ + bool classify = (cutoff > 0); + int nrows = data.size(); + int ncols = feat.size()+1; + matrix::Matrix feat_mat(nrows, ncols); + matrix::Matrix labels(nrows, 1); + #pragma omp parallel for + for (int row = 0; row < data.size(); row++) { + auto kv = data.at(row); + vector cache; + // #pragma omp critical + // { + cache = feat.compute(*kv.first, *kv.second); + // } + feat_mat.set(row, 0, 1); + if (classify) { + labels.set(row, 0, kv.val >= cutoff ? 
1 : -1); + } else { + labels.set(row, 0, kv.val); + // labels.set(row, 0, (kv.val - smin) / (smax - smin)); + } + for (int col = 1; col < ncols; col++) { + double val = feat(col-1, cache); + feat_mat.set(row, col, val); + } + } + if (do_print) { + for (int row = 0; row < data.size(); row++) { + cout << "FM " << labels.get(row, 0) << " "; + for (int col = 0; col < ncols; col++) { + auto val = feat_mat.get(row, col); + cout << val << " "; + } + cout << endl; + } + cout << endl; + } + return std::make_pair(feat_mat, labels); +} + +std::string bin2acgt(const std::string& input) +{ + std::string out = ""; + for (char c : input) { + switch (c) { + case 0: + out += 'A'; + break; + case 1: + out += 'C'; + break; + case 2: + out += 'G'; + break; + case 3: + out += 'T'; + break; + default: + out += "ERR"; + } + } + return out; +} + +std::string uniqheader(std::string hdr) +{ + std::string out = ""; + bool reached_space = false; + for (char c : hdr) { + if (c == ' ') { + break; + } + out += c; + } + auto ptr = hdr.find("_mut"); + if (ptr != std::string::npos) { + return out + hdr.substr(ptr); + } else { + return out; + } + +} + +template +size_t remove_uniform(std::vector > &vec, size_t trim_size, std::vector > &out_vec) +{ + size_t N = vec.size(); + double inc = (double)N / trim_size; + if (inc <= 1) { + inc = 1; + } + size_t output_size = 0; + double i_keep = 0; + for (size_t i = 0; i < N; i++) { + if (i == round(i_keep)) { + output_size++; + out_vec.push_back(vec[i]); + i_keep += inc; + } else { + delete vec[i].second; + } + } + return output_size; +} + +template +size_t remove_uniform_old(std::vector > &vec, size_t trim_size, std::vector > &out_vec) +{ + size_t N = vec.size(); + size_t inc = N - trim_size; + if (inc <= 0) { // no removal so make sure it is never equal + inc = N; + } + size_t i_rm = N % inc; // shift off to remove ending points instead of first point + size_t output_size = 0; + for (size_t i = 0; i < N; i++) { + if (i == i_rm) { + /* dont do anything but 
set the next bad index */ + i_rm += inc; + delete vec[i].second; + } else { + output_size++; + out_vec.push_back(vec[i]); + } + } + return output_size; +} + +template +void remove_boundary(std::vector > &vec, size_t trim_size, std::vector > &out_vec, bool left_rm = false) +{ + size_t N = vec.size(); + size_t to_rm = N - trim_size; + for (size_t i = 0; i < N; i++) { + if ((!left_rm || i >= to_rm) && (left_rm || i < trim_size)) { + out_vec.push_back(vec[i].deep_clone()); + } else { + cout << "Removing point " << vec[i].val << endl; + } + delete vec[i].first; + delete vec[i].second; + } +} + + +template +void remove_random(std::vector > &vec, size_t trim_size, std::vector > &out_vec, Random& random) +{ + std::shuffle(vec.begin(), vec.end(), random.gen()); + for (size_t i = 0; i < vec.size(); i++) { + if (i < trim_size) { + out_vec.push_back(vec[i].deep_clone()); + } + delete vec[i].first; + delete vec[i].second; + } +} +template +size_t split_thd_data(std::vector > >& vec, double id, std::vector >& pos, std::vector >& neg) +{ + for (int i = 0; i < vec.size(); i++) { + for (auto pr : vec[i]) { + if (pr.val > id) { + uint64_t len = pr.first->get_length(); + uint64_t min_len = len * id; + uint64_t max_len = len / id; + uint64_t second_len = pr.second->get_length(); + if (second_len >= min_len && second_len <= max_len) { + pos.push_back(pr); + } else { + cout << "Bad generated point " << len << " " << second_len << endl; + } + } else { + neg.push_back(pr); + } + } + vec[i].clear(); + } + return min(pos.size(), neg.size()); +} +template +void Predictor::train(const vector *> &points, uintmax_t &_id, size_t total_num_samples, size_t num_templates) +{ + if (is_trained) { return; } + + // for (auto p : points) { + // cout << "H: " << p->get_header() << endl; + // } + cout << "params: total_samples: " << total_num_samples << " num_templates: " << num_templates << endl; + num_templates = min(num_templates, points.size()); + vector*> f_points_tr, f_points_test; + size_t 
total_size = points.size();// + queries.size(); + for (int i = 0; i < num_templates; i++) { + int i1 = floor((double)i * total_size / (2 * num_templates)); + int i2 = floor((i + 1) * (double)total_size / (2 * num_templates)); + f_points_tr.push_back(points.at(i1)); + f_points_test.push_back(points.at(i2)); + } + cout << "# of templates: " << num_templates << " train: " << f_points_tr.size() << " test: " << f_points_test.size() << endl; + const double pts_per_mut = (double)total_num_samples / num_templates; + // size_t q_sample = min(num_sample / 10, queries.size()); + // while (10 * f_points_tr.size() <= 11 * num_sample) { + // for (int i = 0; i < q_sample; i++) { + // int i1 = floor((double)i * queries.size() / (2 * q_sample)); + // int i2 = floor((i + 1) * (double)queries.size() / (2 * q_sample)); + // f_points_tr.push_back(queries.at(i1)); + // f_points_test.push_back(queries.at(i2)); + // } + // } + training.clear(); + testing.clear(); + if (mode & PRED_MODE_CLASS) { + vector train_seeds, test_seeds; + for (size_t i = 0; i < f_points_tr.size(); i++) { + train_seeds.push_back(random.nextRandSeed()); + } + for (size_t i = 0; i < f_points_test.size(); i++) { + test_seeds.push_back(random.nextRandSeed()); + } + std::vector > pos_buf, neg_buf; + std::vector > > thd_data(f_points_tr.size()); + cout << "mutating sequences" << endl; + int n_mut = 15; + int n_pos = 10; + int n_neg = 10; + if (1) { + auto p = f_points_tr[0]; + vector mut_rates; + std::random_device::result_type seed = random.nextRandSeed(); + LCG rnd(seed); + identities_for_gen(100 * id, 100, n_mut, rnd, mut_rates); + identities_for_gen(min_id, 100 * id, 2 * n_mut, rnd, mut_rates); + std::vector out_mut(3 * n_mut); + std::string bin_seq = p->get_data_str(); + std::string seq; + for (auto c : bin_seq) { + switch (c) { + case 0: + seq += 'A'; + break; + case 1: + seq += 'C'; + break; + case 2: + seq += 'G'; + break; + case 3: + seq += 'T'; + break; + case 'N': + seq += 'C'; + break; + default: + cout << 
"Invalid character " << c << endl; + cout << "from sequence " << bin_seq << endl; + throw 3; + } + } + #pragma omp parallel for + for (int i = 0; i < mut_rates.size(); i++) { + int mut_rate = mut_rates[i]; + HandleSeq hs(mut_type, seed); + LCG lcg(seed); + int spt = lcg.randMod(mut_rate); + auto newseq = hs.mutate(seq, mut_rate, spt); + out_mut[i] = newseq.first; + } + double P = 0; + double N = 0; + for (double val : out_mut) { + if (val > id) { + P++; + } else { + N++; + } + } + cout << "pts_per_mut: " << pts_per_mut << " / " << " P: " << P << " N: " << N << endl; + + // Avoid singular solution + P = std::max(1.0, P); + N = std::max(1.0, N); + /* Equation for solving number of pos and neg + 2 * P + N = pts_per_mut + P * n_mut_pos + N * n_mut_neg = n_mut_pos + n_mut_neg + + solved: + */ + + double nd_pos = pts_per_mut / (1 + 4*P/N); + double nd_neg = pts_per_mut / (1 + N/(P*4)); + n_pos = ceil(nd_pos); + n_neg = ceil(nd_neg); + cout << "found: " << (int)P << ", " << (int)N << " -> " << nd_pos << ", " << nd_neg << " -> " << n_pos << ", " << n_neg << endl; + cout << "final +: " << n_pos << " -: " << n_neg << endl; + // n_pos = max(n_pos, n_neg); + // n_neg = max(n_pos, n_neg); + } + + Progress prog1(f_points_tr.size(), "Generating training"); +#pragma omp parallel for + for (size_t i = 0; i < f_points_tr.size(); i++) { + auto p = f_points_tr[i]; + mutate_seqs(p, n_pos, thd_data[i], 100 * id, 100, _id, train_seeds[i]); + mutate_seqs(p, n_neg, thd_data[i], min_id, 100 * id, _id, train_seeds[i]); + #pragma omp critical + prog1++; + } + prog1.end(); + size_t buf_size = split_thd_data(thd_data, id, pos_buf, neg_buf); + cout << "training +: " << pos_buf.size() << endl; + cout << "training -: " << neg_buf.size() << endl; + auto pra_cmp = [&](const pra &a, const pra &b) { + // int fc = a.first->get_header().compare(b.first->get_header()); + // int sc = a.second->get_header().compare(b.second->get_header()); +// return fc < 0 || (fc == 0 && sc < 0); + return fabs(a.val - id) 
< fabs(b.val - id); + }; + std::sort(pos_buf.begin(), pos_buf.end(), pra_cmp); + std::sort(neg_buf.begin(), neg_buf.end(), pra_cmp); + + size_t num_pos = buf_size; + size_t num_neg = 2 * buf_size; + // remove_random(pos_buf, num_pos, training, random); + // remove_random(neg_buf, num_neg, training, random); + num_pos = remove_uniform(pos_buf, num_pos, training); + num_neg = remove_uniform(neg_buf, num_neg, training); + // remove_boundary(pos_buf, num_pos, training); + // remove_boundary(neg_buf, num_neg, training); + cout << "Training final #: +: " << num_pos << " -: " << num_neg << endl; + + + + pos_buf.clear(); + neg_buf.clear(); + thd_data.resize(f_points_test.size()); + Progress prog2(f_points_test.size(), "Generating testing"); + #pragma omp parallel for + for (size_t i = 0; i < f_points_test.size(); i++) { + auto p = f_points_test[i]; + mutate_seqs(p, n_pos, thd_data[i], 100 * id, 100, _id, test_seeds[i]); + mutate_seqs(p, n_neg, thd_data[i], min_id, 100 * id, _id, test_seeds[i]); +#pragma omp critical + prog2++; + } + prog2.end(); + buf_size = split_thd_data(thd_data, id, pos_buf, neg_buf); + cout << "testing +: " << pos_buf.size() << endl; + cout << "testing -: " << neg_buf.size() << endl; + std::sort(pos_buf.begin(), pos_buf.end(), pra_cmp); + std::sort(neg_buf.begin(), neg_buf.end(), pra_cmp); + + // std::shuffle(pos_buf.begin(), pos_buf.end(), random.gen()); + // std::shuffle(neg_buf.begin(), neg_buf.end(), random.gen()); + num_pos = buf_size; + num_neg = 2 * buf_size; + num_pos = remove_uniform(pos_buf, num_pos, testing); + num_neg = remove_uniform(neg_buf, num_neg, testing); + // remove_boundary(pos_buf, num_pos, testing); + // remove_boundary(neg_buf, num_neg, testing); + // remove_random(pos_buf, num_pos, testing, random); + // remove_random(neg_buf, num_neg, testing, random); + cout << "Testing final #: +: " << num_pos << " -: " << num_neg << endl; + Clock::stamp("data_generation"); + } else { + for (auto p : f_points_tr) { + mutate_seqs(p, 5, 
training, training, min_id, 100, _id, random.nextRandSeed()); + } + for (auto p : f_points_test) { + mutate_seqs(p, 5, testing, testing, min_id, 100, _id, random.nextRandSeed()); + } + } + train(); +} + + +template +void Predictor::filter(std::vector > &vec, std::string prefix) +{ + std::vector > > bins; + std::vector limits; + size_t num_bins = 10; + size_t smallest_bin_size = vec.size(); + for (size_t i = 0; i < num_bins; i++) { + limits.push_back(id + i * (1 - id) / num_bins); + bins.push_back(std::vector >()); + } + limits.push_back(1); + for (auto p : vec) { + for (size_t i = 1; i < limits.size(); i++) { + if (p.val <= limits[i] && p.val > limits[i-1]) { + bins[i-1].push_back(p); + break; + } + } + } + size_t bin_size = 0; + for (auto &v : bins) { + bin_size += v.size(); + // smallest_bin_size = std::min(smallest_bin_size, v.size()); + std::shuffle(v.begin(), v.end(), random.gen()); + } + smallest_bin_size = bin_size / bins.size(); + vec.clear(); + + for (auto &v : bins) { + for (size_t i = 0; i < std::min(v.size(), smallest_bin_size); i++) { + vec.push_back(v[i]); + if (prefix != "") { + cout << prefix << " bin " << i - 1 << " " << v[i].val << endl; + } + } + } + cout << "new vector size: " << vec.size() << " divided into " << bins.size() << " equal parts" << endl; +} + + +template +void Predictor::mutate_seqs(Point* p, size_t num_seq, vector > &thd_buf, double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed) +{ + LCG newRand(seed); + HandleSeq h(mut_type, newRand.nextRandSeed()); + + std::string bin_seq = p->get_data_str(); + std::string seq; + for (auto c : bin_seq) { + switch (c) { + case 0: + seq += 'A'; + break; + case 1: + seq += 'C'; + break; + case 2: + seq += 'G'; + break; + case 3: + seq += 'T'; + break; + case 'N': + seq += 'C'; + break; + default: + cout << "Invalid character " << c << endl; + cout << "from sequence " << bin_seq << endl; + throw 3; + } + } + + double inc = (id_end - id_begin) / num_seq; + for (size_t 
i = 0; i < num_seq; i++) { + double iter_id = id_begin + inc * (i + 0.5); + double actual_id = newRand.rand_between(iter_id, inc, id_begin, id_end); +// double actual_id = rand_between(iter_id, inc, id_begin, id_end); + int mut = round(100 - actual_id); + mut = (mut == 0) ? 1 : mut; + int spt = newRand.randMod(mut); + auto newseq = h.mutate(seq, mut, spt); + std::string chrom; + std::ostringstream oss; + oss << p->get_header() << "_mut" << mut << "_" << spt << "_" << i; + std::string header = oss.str(); + Point* new_pt = Loader::get_point(header, newseq.second, _id, k, false); + pra pr; + //pr.first = p->clone(); + pr.first = p; +// pr.first->set_data_str(""); +// pr.first->set_data_str(bin_seq); + pr.second = new_pt; + pr.second->set_data_str(""); +// pr.second->set_data_str(newseq.second); + pr.val = newseq.first; + thd_buf.push_back(pr); + } +} +template +void Predictor::mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed) +{ + + LCG newRand(seed); + HandleSeq h(mut_type, newRand.nextRandSeed()); + + std::string bin_seq = p->get_data_str(); + std::string seq; + for (auto c : bin_seq) { + switch (c) { + case 0: + seq += 'A'; + break; + case 1: + seq += 'C'; + break; + case 2: + seq += 'G'; + break; + case 3: + seq += 'T'; + break; + case 'N': + seq += 'C'; + break; + default: + cout << "Invalid character " << c << endl; + cout << "from sequence " << bin_seq << endl; + throw 3; + } + } + + double inc = (id_end - id_begin) / num_seq; + for (size_t i = 0; i < num_seq; i++) { + double iter_id = id_begin + inc * (i + 0.5); + double actual_id = newRand.rand_between(iter_id, inc, id_begin, id_end); +// double actual_id = rand_between(iter_id, inc, id_begin, id_end); + int mut = round(100 - actual_id); + mut = (mut == 0) ? 
1 : mut; + int spt = newRand.randMod(mut); + auto newseq = h.mutate(seq, mut, spt); + std::string chrom; + std::ostringstream oss; + oss << p->get_header() << "_mut" << mut << "_" << spt << "_" << i; + std::string header = oss.str(); + Point* new_pt = Loader::get_point(header, newseq.second, _id, k); + pra pr; + pr.first = p->clone(); + pr.first->set_data_str(bin_seq); + pr.second = new_pt; + pr.second->set_data_str(newseq.second); + pr.val = newseq.first; +#pragma omp critical + { + if (pr.val > id) { + pos_buf.push_back(pr); + } else { + neg_buf.push_back(pr); + } + } + } +} +template +void Predictor::train() +{ + Feature feat(k); + feat.set_save(true); + + uint64_t max_feat = 0; + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (possible_feats.at(i).first > max_feat) { + max_feat |= possible_feats.at(i).first; + } + } + for (uint64_t i = 1; i <= max_feat; i *= 2) { + if (i & max_feat) { + feat.add_feature(i, Combo::xy); + } + } + feat.normalize(training); + feat.normalize(testing); + feat.finalize(); + + + + // cout << "Class Training:" << endl; + // for (auto p : training) { + // cout << p.val << " "; + // } + // cout << "Class Testing:" << endl; + // for (auto p : testing) { + // cout << p.val << " "; + // } + if (mode & PRED_MODE_CLASS) { + train_class(&feat); + if (mode & PRED_MODE_REGR) { + // vector*> f_points_tr, f_points_test; + // for (int i = 0; i < 10; i++) { + // f_points_tr.push_back(training[rand()%training.size()].first); + // f_points_test.push_back(training[rand()%training.size()].first); + // } + // training.clear(); + // testing.clear(); + // for (auto p : f_points_tr) { + // mutate_seqs(p, 50, training, 100 * id, 100); + // mutate_seqs(p, 50, training, 60, 100 * id); + // } + // for (auto p : f_points_test) { + // mutate_seqs(p, 50, testing, 100 * id, 100); + // mutate_seqs(p, 50, testing, 60, 100 * id); + // } + // filter(); + auto func = [&](pra pr) { + return pr.val <= id; + }; + training.erase(std::remove_if(training.begin(), 
training.end(), func), training.end()); + testing.erase(std::remove_if(testing.begin(), testing.end(), func), testing.end()); + filter(training);//, "training"); + filter(testing);//, "testing"); + + } + } + if (mode & PRED_MODE_REGR) { + train_regr(&feat); + } + cout << "Training size: " << training.size() << endl; + cout << "Testing size: " << testing.size() << endl; + for (auto p : training) { +// delete p.first; + delete p.second; + } + for (auto p : testing) { +// delete p.first; + delete p.second; + } + cout << endl; + feat.set_save(false); + training.clear(); + testing.clear(); + possible_feats.clear(); + is_trained = true; + // save("weights.txt"); + // exit(100); + Clock::stamp("GLM"); +} + +template +void Predictor::train_class(Feature* feat) +{ + // std::vector > bf_feats; + // for (int i = 0; bf_feats.size() < 2; i++) { + // if (possible_feats[i].second == Combo::xy) { + // bf_feats.push_back(possible_feats[i]); + // } + // } + // bf_feats.push_back(std::make_pair(FEAT_INTERSECTION, Combo::xy)); + // bf_feats.push_back(std::make_pair(FEAT_NORMALIZED_VECTORS, Combo::xy)); + FeatureSelector *fs = new BestFirstSelector(possible_feats, min_num_feat, max_num_feat); +// FeatureSelector *fs = new GreedySelector(possible_feats, min_num_feat, max_num_feat); + auto pr = fs->train_class(feat, training, testing, id); + delete fs; + feat_c = pr.first; + c_glm = pr.second; +} +template +void Predictor::train_regr(Feature* feat) +{ + FeatureSelector *fs = new GreedySelector(possible_feats, min_num_feat, max_num_feat); + auto pr = fs->train_regression(feat, training, testing); + delete fs; + feat_r = pr.first; + r_glm = pr.second; +} + +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; diff --git a/src/cluster/src/Predictor.h b/src/predict/Predictor.h similarity index 79% rename from src/cluster/src/Predictor.h rename to src/predict/Predictor.h index 
bf35036..cda6b08 100644 --- a/src/cluster/src/Predictor.h +++ b/src/predict/Predictor.h @@ -15,6 +15,7 @@ #include "Point.h" #include "Feature.h" #include +#include "Random.h" #include #define PRED_MODE_CLASS 1 #define PRED_MODE_REGR 2 @@ -26,7 +27,7 @@ template class Predictor { public: - Predictor(int k_, double id_, uint8_t mode_, uint64_t feats, int mut_type_, int min_num_feat_=3, int max_num_feat_=5, double min_id_=0.35) : k(k_), id(id_), is_trained(false), is_training(false), mode(mode_), max_num_feat(max_num_feat_), mut_type(mut_type_), min_num_feat(min_num_feat_), min_id(min_id_ * 100) { + Predictor(int k_, double id_, uint8_t mode_, uint64_t feats, int mut_type_, int min_num_feat_=3, int max_num_feat_=5, double min_id_=0.35) : k(k_), id(id_), is_trained(false), is_training(false), mode(mode_), max_num_feat(max_num_feat_), mut_type(mut_type_), min_num_feat(min_num_feat_), min_id(min_id_ * 100), feats64(feats) { add_feats(possible_feats, feats); feat_c = NULL; feat_r = NULL; @@ -45,13 +46,20 @@ class Predictor { training.clear(); testing.clear(); } - void train(const std::vector* >& vec, const std::vector* >& vecq, uintmax_t& _id, size_t num_sample); + static double classify_sum(double sum); + static void set_bias(double bias); + void train(const std::vector* >& vec, uintmax_t& _id, size_t num_sample, size_t n_templates); double similarity(Point* a, Point* b); bool close(Point* a, Point* b); - void save(std::string file); + void save(std::string file, std::string datatype); void check(); uint8_t get_mode() const { return mode; } pair*, matrix::GLM> get_class() { return std::make_pair(new Feature(*feat_c), c_glm); } + void mutate_seqs(Point* p, size_t num_seq, vector > &,vector > & , double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed); + void mutate_seqs(Point* p, size_t num_seq,vector > &,double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed); + std::string get_datatype() const { return 
datatype; } + int get_k() const { return k; } + double get_id() const { return id; } private: static void add_feats(std::vector >& vec, uint64_t flags); static pair*> read_from(std::ifstream &in, int k_); @@ -64,7 +72,7 @@ class Predictor { double predict(Point* a, Point* b); bool p_close(Point* a, Point* b); double p_predict(Point* a, Point* b); - void mutate_seqs(Point* p, size_t num_seq, vector > &,vector > & , double id_begin, double id_end, uintmax_t& _id); + Feature *feat_c, *feat_r; matrix::GLM c_glm, r_glm; vector > training, testing; @@ -74,5 +82,10 @@ class Predictor { double id, min_id; vector > possible_feats; omp_lock_t lock; + Random random; + uint64_t feats64; + std::string datatype; + double scale_min = 1000; + double scale_max = -1000; }; #endif diff --git a/src/predict/SingMute.cpp b/src/predict/SingMute.cpp new file mode 100644 index 0000000..3772e08 --- /dev/null +++ b/src/predict/SingMute.cpp @@ -0,0 +1,162 @@ +/* -*- C++ -*- */ +/* + * SingMute.cpp + * + * Original Author: Alexander Baumgartner + * Modified by Benjamin T James + */ +#include "SingMute.h" +#include +#include +#include +#include + +#ifdef MUTDEBUG +static const std::string INSERT_BEGIN = "["; +static const std::string INSERT_END = "]"; +static const std::string SWITCH_BEGIN = "("; +static const std::string SWITCH_END = ")"; +static const std::string DEL = "-"; +#else +static const std::string INSERT_BEGIN = ""; +static const std::string INSERT_END = ""; +static const std::string SWITCH_BEGIN = ""; +static const std::string SWITCH_END = ""; +static const std::string DEL = ""; +#endif + + +char SingMute::randNucl() +{ + char character; + int value = rng.randMod(percAs + percCs + percGs + percTs); +// int value = 40436 % (percAs + percCs + percGs + percTs); + if (value < percAs) { + character = 'A'; + } else if (value < percAs + percCs) { + character = 'C'; + } else if (value < percAs + percCs + percGs) { + character = 'G'; + } else { + character = 'T'; + } + return character; +} 
+void SingMute::init(const std::vector &valid) +{ + maxInsert = 0; + maxDel = 0; + maxSwitch = 0; + if (num_mut == 0) { + out_seq = std::string(*seq); + IBP = 0; + alignmentLength = 0; + return; + } else if (num_mut == 1) { + maxInsert = 1; + maxDel = 0; + maxSwitch = 0; + } else { + maxSwitch = rng.randMod(num_mut); + num_mut -= maxSwitch; + + if (maxSwitch % 2 == 1 && num_mut >= 1) { + maxSwitch++; + num_mut--; + } else if (num_mut == 0) { + maxSwitch--; + num_mut++; + } + if (num_mut > 1) { + maxInsert = rng.randMod(num_mut); + num_mut -= maxInsert; + } else { + maxInsert = num_mut; + num_mut -= maxInsert; + } + maxDel = num_mut; + } + size_t seq_len = seq->length(); + + maxDel *= seq_len / 100.0; + maxInsert *= seq_len / 100.0; + maxSwitch *= seq_len / 100.0; + alignmentLength = maxInsert; + IBP = maxDel + maxSwitch; + + + std::vector command_str(seq_len, 'S'); + long idx = 0; + long nons_len = maxInsert + maxDel + maxSwitch; + for (long i = 0; i < maxInsert; i++) { + command_str[idx++] = 'I'; + } + for (long i = 0; i < maxDel; i++) { + command_str[idx++] = 'D'; + } + for (long i = 0; i < maxSwitch; i++) { + command_str[idx++] = 'W'; + } + //std::shuffle(command_str.begin(), command_str.end(), rng.gen()); + std::shuffle(command_str.begin(), command_str.end(), std::minstd_rand0(rng.nextRandSeed())); + std::vector valid_indices; + long repl = command_str.size() - 1; + for (long i = 0; i < command_str.size(); i++) { + if (command_str[i] != 'S' && !valid[i]) { + if (!valid_indices.empty()) { + repl = valid_indices.back(); + valid_indices.pop_back(); + } else { + for (; repl > 0; repl--) { + if (valid[repl]) { + break; + } + } + } + std::swap(command_str[i], command_str[repl]); + } else if (command_str[i] == 'S' + && valid[i] + && valid_indices.size() < nons_len) { + + valid_indices.push_back(i); + } + } + // std::set s_ins, s_del, s_switch; + // generate_unique_set(command_str.size(), s_ins, maxInsert, s_del, s_switch, valid); + // 
generate_unique_set(command_str.size(), s_del, maxDel, s_ins, s_switch, valid); + // generate_unique_set(command_str.size(), s_switch, maxSwitch, s_ins, s_del, valid); + // for (auto idx : s_ins) { + // command_str[idx] = 'I'; + // } + // for (auto idx : s_del) { + // command_str[idx] = 'D'; + // } + // for (auto idx : s_switch) { + // command_str[idx] = 'W'; + // } + out_seq = ""; + out_seq.reserve(maxInsert + seq_len - maxDel + 1); + + for (long i = 0; i < seq_len; i++) { + auto cmd = command_str.at(i); + switch (cmd) { + case 'I': { + out_seq += INSERT_BEGIN + randNucl() + INSERT_END; + out_seq += seq->at(i); + break; + } + case 'S': { + out_seq += seq->at(i); + break; + } + case 'D': { + out_seq += DEL; + break; + } + case 'W': { + out_seq += SWITCH_BEGIN + randNucl() + SWITCH_END; + break; + } + } + } +} diff --git a/src/cluster/src/SingMute.h b/src/predict/SingMute.h similarity index 72% rename from src/cluster/src/SingMute.h rename to src/predict/SingMute.h index c659afd..bb97d06 100644 --- a/src/cluster/src/SingMute.h +++ b/src/predict/SingMute.h @@ -1,3 +1,10 @@ +/* -*- C++ -*- */ +/* + * SingMute.h + * + * Original Author: Alexander Baumgartner + * Modified by Benjamin T James + */ #ifndef SINGMUTE_H #define SINGMUTE_H @@ -5,6 +12,7 @@ #include #include #include "Random.h" +#include "LCG.h" class SingMute { public: @@ -19,8 +27,8 @@ class SingMute { int: percentage of T's int: The total allocation for non-single mutations */ - SingMute(int pa, int pc, int pg, int pt, uintmax_t tt, const std::string* s, const std::vector &valid_) : percAs(pa), - percCs(pc), percGs(pg), percTs(pt), num_mut(tt), seq(s) { + SingMute(int pa, int pc, int pg, int pt, uintmax_t tt, const std::string* s, const std::vector &valid_, std::random_device::result_type seed) : percAs(pa), + percCs(pc), percGs(pg), percTs(pt), num_mut(tt), seq(s), rng(seed) { init(valid_); } long getAlignmentLength() { return alignmentLength; } @@ -43,6 +51,6 @@ class SingMute { const std::string * seq; 
std::string out_seq; char randNucl(); - Random rng; + LCG rng; }; #endif diff --git a/src/utility/AffineId.cpp b/src/utility/AffineId.cpp deleted file mode 100644 index 484a5bd..0000000 --- a/src/utility/AffineId.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * AffineId.cpp - * - * Created on: Dec 6, 2012 - * Modified on: Nov 6, 2017 - * Author: Hani Zakaria Girgis, PhD - */ - -// ToDo: -// 1. Add pre-conditions after testing -#include "AffineId.h" - -#include "Util.h" -#include "../exception/InvalidInputException.h" - -#include -#include -using namespace std; -//using namespace exception; - -namespace utility { - -AffineId::AffineId(const char * seq1In, int start1In, int end1In, - const char * seq2In, int start2In, int end2In) { - - // The shorter of the two sequences is seq2 - seq1 = seq1In; - start1 = start1In; - end1 = end1In; - - seq2 = seq2In; - start2 = start2In; - end2 = end2In; - - if (end1 - start1 < end2 - start2) { - seq1 = seq2In; - start1 = start2In; - end1 = end2In; - - seq2 = seq1In; - start2 = start1In; - end2 = end1In; - } - - /* if (start1 < 0 || end1 < 0 || start1 > end1) { - string msg("Invalid Input. Start1 is "); - msg.append(Util::int2string(start1)); - msg.append(". End 1 is "); - msg.append(Util::int2string(end1)); - msg.append("."); - //throw InvalidInputException(msg); - - cerr << msg << endl; - throw exception(); - } - - if (start2 < 0 || end2 < 0 || start2 > end2) { - string msg("Invalid Input. Start2 is "); - msg.append(Util::int2string(start2)); - msg.append(". 
End2 is "); - msg.append(Util::int2string(end2)); - msg.append("."); - //throw InvalidInputException(msg); - - cerr << msg << endl; - throw exception(); - }*/ - - // Validate input - // cout << start1 << " " << end1 << endl; - // cout << start2 << " " << end2 << endl; - - len1 = end1 - start1 + 2; - len2 = end2 - start2 + 2; - - align(); -} - -AffineId::~AffineId() { -} - -void AffineId::align() { - // Initialize needed arrays - auto m = new int[len2][2](); // Middle level array - auto u = new int[len2][2](); // Upper level array - auto mId = new int[len2][2](); // Array storing number of matches in the middle array - auto uId = new int[len2][2](); // Array storing number of matches in the upper array - auto mPath = new int[len2][2](); // Array storing number of steps in the middle array - auto uPath = new int[len2][2](); // Array storing number of steps in the upper array - - // Apply the DP - // The i index is only used to get a character from the first sequence - // It is not used for filling the DP matrix - for (int i = 1; i < len1; i++) { - char base1 = seq1[start1 + i - 1]; - int lower = 0; - int lowerId = 0; - int lowerPath = 0; - - // j is the row. 
There are only two columns 0 and 1 - for (int j = 1; j < len2; j++) { - // Update the lower value - int extLower = lower + EXT; - int openLower = m[j - 1][0] + OPEN; - if (extLower > openLower) { - lower = extLower; - lowerPath++; - } else { - lower = openLower; - lowerId = mId[j - 1][0]; - lowerPath = mPath[j - 1][0] + 1; - } - - // Fill the array of the upper level - int extUpper = u[j][0] + EXT; - int openUpper = m[j][0] + OPEN; - if (extUpper > openUpper) { - u[j][1] = extUpper; - uId[j][1] = uId[j][0]; - uPath[j][1] = uPath[j][0] + 1; - } else { - u[j][1] = openUpper; - uId[j][1] = mId[j][0]; - uPath[j][1] = mPath[j][0] + 1; - } - - // Fill the array of the middle level - int matchOrMis; - if (base1 == seq2[start2 + j - 1]) { - matchOrMis = m[j - 1][0] + MATCH; - } else { - matchOrMis = m[j - 1][0] + MIS; - } - - int lowerOrUpper; - if (lower > u[j][1]) { - lowerOrUpper = lower; - } else { - lowerOrUpper = u[j][1]; - } - - if (matchOrMis > lowerOrUpper) { - m[j][1] = matchOrMis; - mPath[j][1] = mPath[j - 1][0] + 1; - if (base1 == seq2[start2 + j - 1]) { - mId[j][1] = mId[j - 1][0] + 1; - } else { - mId[j][1] = mId[j - 1][0]; - } - } else { - m[j][1] = lowerOrUpper; - if (lower > u[j][1]) { - mId[j][1] = lowerId; - mPath[j][1] = lowerPath; - } else { - mId[j][1] = uId[j][1]; - mPath[j][1] = uPath[j][1]; - } - } - } - - // // Test - // for (int h = 0; h < len2; h++) { - // cout << m[h][0] << "\t" << m[h][1] << "----" << mId[h][0] << "\t" - // << mId[h][1] << endl; - // } - // cout << "---------------------------------------------------" << endl; - // // End of test - - // Copy the second column to the first one - if (i != len1 - 1) { - for (int h = 0; h < len2; h++) { - m[h][0] = m[h][1]; - u[h][0] = u[h][1]; - mId[h][0] = mId[h][1]; - uId[h][0] = uId[h][1]; - mPath[h][0] = mPath[h][1]; - uPath[h][0] = uPath[h][1]; - } - } - } - - lenCS = mId[len2 - 1][1]; - lenPath = mPath[len2 - 1][1]; - //cout << "Alignment length = " << lenPath << endl; - delete[] u; - 
delete[] m; - delete[] mId; - delete[] uId; - delete[] mPath; - delete[] uPath; -} - -double AffineId::getAlign() { - double amt = lenCS; - return amt / (double)lenPath; -} - -} -/* namespace utility */ - -// // Testing code -// int main() { -// string s1("GATCTCAG"); -// string s2("GACAG"); - -// utility::AffineId id(s1.c_str(), 0, s1.length() - 1, s2.c_str(), 0, -// s2.length() - 1); -// cout << "Length = " << id.getLenCS() << endl; - -// return 0; -// } diff --git a/src/utility/AffineId.h b/src/utility/AffineId.h deleted file mode 100644 index 61173e7..0000000 --- a/src/utility/AffineId.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * AffineId.h - * - * Created on: Dec 6, 2012 - * Modified on: Nov 6, 2017 - * Author: Hani Zakaria Girgis, PhD - */ - -#ifndef AFFINEID_H_ -#define AFFINEID_H_ - -namespace utility { - -class AffineId { -private: - const char * seq1; - int start1; - int end1; - const char * seq2; - int start2; - int end2; - - int len1; - int len2; - //int lenTotal; - int lenCS; - int lenPath; - int * m; // Middle level - //int * l; // Lower level - int * u; // Upper level - - // const int MATCH = 4; // Score of a match - // const int MIS = -4; // Score of a mismatch - // const int OPEN = -2; // Score of a gap opening - // const int EXT = -1; // Score of a gap extension - - const int MATCH = 1; - const int MIS = -1; - const int OPEN = -2; - const int EXT = -1; - void align(); - -public: - AffineId(const char *, int, int, const char *, int, int); - virtual ~AffineId(); - double getAlign(); -}; - -} /* namespace utility */ -#endif /* AFFINEID_H_ */ diff --git a/src/utility/Util.cpp b/src/utility/Util.cpp index 4a6d4c1..c778d02 100644 --- a/src/utility/Util.cpp +++ b/src/utility/Util.cpp @@ -18,7 +18,13 @@ Util::~Util() { string Util::fileSeparator("/"); -//string * Util::emptyString = new string(""); +string * Util::emptyString = new string(""); + +bool Util::isDna = true; + +const int Util::getAlphabetSize(){ + return Util::isDna? 
4 : 22; +} void Util::readFasta(string seqFile, vector * infoList, vector * seqList, bool canCheckFormat) { diff --git a/src/utility/Util.h b/src/utility/Util.h index a9ed695..b63277a 100644 --- a/src/utility/Util.h +++ b/src/utility/Util.h @@ -33,6 +33,7 @@ class Util { public: static string * emptyString; static string fileSeparator; + static bool isDna; static void readFasta(string, vector *, vector *, bool); static void readFasta(string, vector *, vector *); static void readCoordinates(string, vector *); @@ -53,6 +54,9 @@ class Util { static int sumTotalLength(const vector *); + // Added on Oct 6 2018 + static const int getAlphabetSize(); + /** * Delete the objects pointed to by pointers in a vector. * It does not delete the vector itself.