From 61e031e66c06d5b8c320b4bc473508623df10ead Mon Sep 17 00:00:00 2001 From: Benjamin James Date: Mon, 22 Oct 2018 14:27:11 -0500 Subject: [PATCH] initial commit --- Makefile | 19 + README | 74 + src/Makefile | 175 ++ src/RepeatsDetector.cpp | 583 ++++++ src/cluster/Makefile | 31 + src/cluster/src/Center.h | 42 + src/cluster/src/ClusterFactory.cpp | 1024 +++++++++ src/cluster/src/ClusterFactory.h | 82 + src/cluster/src/DivergencePoint.cpp | 284 +++ src/cluster/src/DivergencePoint.h | 89 + src/cluster/src/Feature.cpp | 1823 +++++++++++++++++ src/cluster/src/Feature.h | 380 ++++ src/cluster/src/GLM.cpp | 66 + src/cluster/src/GLM.h | 31 + src/cluster/src/HandleSeq.cpp | 155 ++ src/cluster/src/HandleSeq.h | 77 + src/cluster/src/Histogram.cpp | 195 ++ src/cluster/src/Histogram.h | 80 + src/cluster/src/Loader.cpp | 111 + src/cluster/src/Loader.h | 73 + src/cluster/src/LogTable.cpp | 41 + src/cluster/src/LogTable.h | 20 + src/cluster/src/Mat.h | 73 + src/cluster/src/Matrix.cpp | 360 ++++ src/cluster/src/Matrix.h | 52 + src/cluster/src/MultiMute.cpp | 455 ++++ src/cluster/src/MultiMute.h | 142 ++ src/cluster/src/NearestNeighbor.h | 52 + src/cluster/src/Point.h | 83 + src/cluster/src/Predictor.cpp | 837 ++++++++ src/cluster/src/Predictor.h | 78 + src/cluster/src/Progress.cpp | 65 + src/cluster/src/Progress.h | 29 + src/cluster/src/Random.h | 22 + src/cluster/src/Runner.cpp | 397 ++++ src/cluster/src/Runner.h | 44 + src/cluster/src/SingMute.cpp | 116 ++ src/cluster/src/SingMute.h | 48 + src/cluster/src/SingleFeature.cpp | 50 + src/cluster/src/SingleFeature.h | 26 + src/cluster/src/SingleFileLoader.cpp | 84 + src/cluster/src/SingleFileLoader.h | 29 + src/cluster/src/SingleMute.cpp | 221 ++ src/cluster/src/SingleMute.h | 89 + src/cluster/src/Trainer.cpp | 930 +++++++++ src/cluster/src/Trainer.h | 67 + src/cluster/src/bvec.cpp | 332 +++ src/cluster/src/bvec.h | 69 + src/cluster/src/bvec_iterator.cpp | 28 + src/cluster/src/bvec_iterator.h | 84 + src/cluster/src/main.cpp | 12 + 
src/cluster/src/needleman_wunsch.cpp | 153 ++ src/cluster/src/needleman_wunsch.h | 43 + src/exception/FileDoesNotExistException.cpp | 25 + src/exception/FileDoesNotExistException.h | 23 + src/exception/InvalidInputException.cpp | 24 + src/exception/InvalidInputException.h | 23 + src/exception/InvalidOperationException.cpp | 19 + src/exception/InvalidOperationException.h | 26 + .../InvalidOrderOfOperationsException.cpp | 24 + .../InvalidOrderOfOperationsException.h | 23 + src/exception/InvalidScoreException.cpp | 24 + src/exception/InvalidScoreException.h | 23 + src/exception/InvalidStateException.cpp | 25 + src/exception/InvalidStateException.h | 23 + src/nonltr/ChromDetector.cpp | 41 + src/nonltr/ChromDetector.h | 29 + src/nonltr/ChromDetectorMaxima.cpp | 94 + src/nonltr/ChromDetectorMaxima.h | 47 + src/nonltr/ChromListMaker.cpp | 123 ++ src/nonltr/ChromListMaker.h | 38 + src/nonltr/Chromosome.cpp | 308 +++ src/nonltr/Chromosome.h | 78 + src/nonltr/ChromosomeOneDigit.cpp | 246 +++ src/nonltr/ChromosomeOneDigit.h | 43 + src/nonltr/ChromosomeRandom.cpp | 363 ++++ src/nonltr/ChromosomeRandom.h | 51 + src/nonltr/DetectorMaxima.cpp | 518 +++++ src/nonltr/DetectorMaxima.h | 77 + src/nonltr/EnrichmentMarkovView.cpp | 217 ++ src/nonltr/EnrichmentMarkovView.h | 69 + src/nonltr/HMM.cpp | 630 ++++++ src/nonltr/HMM.h | 103 + src/nonltr/IChromosome.h | 28 + src/nonltr/ITableView.h | 34 + src/nonltr/KmerHashTable.cpp | 445 ++++ src/nonltr/KmerHashTable.h | 83 + src/nonltr/LocationList.cpp | 153 ++ src/nonltr/LocationList.h | 53 + src/nonltr/LocationListCollection.cpp | 101 + src/nonltr/LocationListCollection.h | 41 + src/nonltr/Scanner.cpp | 379 ++++ src/nonltr/Scanner.h | 71 + src/nonltr/Scorer.cpp | 143 ++ src/nonltr/Scorer.h | 54 + src/nonltr/TableBuilder.cpp | 121 ++ src/nonltr/TableBuilder.h | 68 + src/nonltr/Trainer.cpp | 278 +++ src/nonltr/Trainer.h | 80 + src/utility/AffineId.cpp | 212 ++ src/utility/AffineId.h | 50 + src/utility/EmptyLocation.cpp | 53 + 
src/utility/EmptyLocation.h | 35 + src/utility/GlobAlignE.cpp | 317 +++ src/utility/GlobAlignE.h | 58 + src/utility/ILocation.h | 29 + src/utility/LCSLen.cpp | 103 + src/utility/LCSLen.h | 37 + src/utility/Location.cpp | 74 + src/utility/Location.h | 41 + src/utility/Util.cpp | 347 ++++ src/utility/Util.h | 79 + 112 files changed, 17449 insertions(+) create mode 100644 Makefile create mode 100644 README create mode 100644 src/Makefile create mode 100644 src/RepeatsDetector.cpp create mode 100644 src/cluster/Makefile create mode 100644 src/cluster/src/Center.h create mode 100644 src/cluster/src/ClusterFactory.cpp create mode 100644 src/cluster/src/ClusterFactory.h create mode 100644 src/cluster/src/DivergencePoint.cpp create mode 100644 src/cluster/src/DivergencePoint.h create mode 100644 src/cluster/src/Feature.cpp create mode 100644 src/cluster/src/Feature.h create mode 100644 src/cluster/src/GLM.cpp create mode 100644 src/cluster/src/GLM.h create mode 100644 src/cluster/src/HandleSeq.cpp create mode 100644 src/cluster/src/HandleSeq.h create mode 100644 src/cluster/src/Histogram.cpp create mode 100644 src/cluster/src/Histogram.h create mode 100644 src/cluster/src/Loader.cpp create mode 100644 src/cluster/src/Loader.h create mode 100644 src/cluster/src/LogTable.cpp create mode 100644 src/cluster/src/LogTable.h create mode 100644 src/cluster/src/Mat.h create mode 100644 src/cluster/src/Matrix.cpp create mode 100644 src/cluster/src/Matrix.h create mode 100644 src/cluster/src/MultiMute.cpp create mode 100644 src/cluster/src/MultiMute.h create mode 100644 src/cluster/src/NearestNeighbor.h create mode 100644 src/cluster/src/Point.h create mode 100644 src/cluster/src/Predictor.cpp create mode 100644 src/cluster/src/Predictor.h create mode 100644 src/cluster/src/Progress.cpp create mode 100644 src/cluster/src/Progress.h create mode 100644 src/cluster/src/Random.h create mode 100644 src/cluster/src/Runner.cpp create mode 100644 src/cluster/src/Runner.h create mode 100644 
src/cluster/src/SingMute.cpp create mode 100644 src/cluster/src/SingMute.h create mode 100644 src/cluster/src/SingleFeature.cpp create mode 100644 src/cluster/src/SingleFeature.h create mode 100644 src/cluster/src/SingleFileLoader.cpp create mode 100644 src/cluster/src/SingleFileLoader.h create mode 100644 src/cluster/src/SingleMute.cpp create mode 100644 src/cluster/src/SingleMute.h create mode 100644 src/cluster/src/Trainer.cpp create mode 100644 src/cluster/src/Trainer.h create mode 100644 src/cluster/src/bvec.cpp create mode 100644 src/cluster/src/bvec.h create mode 100644 src/cluster/src/bvec_iterator.cpp create mode 100644 src/cluster/src/bvec_iterator.h create mode 100644 src/cluster/src/main.cpp create mode 100644 src/cluster/src/needleman_wunsch.cpp create mode 100644 src/cluster/src/needleman_wunsch.h create mode 100644 src/exception/FileDoesNotExistException.cpp create mode 100644 src/exception/FileDoesNotExistException.h create mode 100644 src/exception/InvalidInputException.cpp create mode 100644 src/exception/InvalidInputException.h create mode 100644 src/exception/InvalidOperationException.cpp create mode 100644 src/exception/InvalidOperationException.h create mode 100644 src/exception/InvalidOrderOfOperationsException.cpp create mode 100644 src/exception/InvalidOrderOfOperationsException.h create mode 100644 src/exception/InvalidScoreException.cpp create mode 100644 src/exception/InvalidScoreException.h create mode 100644 src/exception/InvalidStateException.cpp create mode 100644 src/exception/InvalidStateException.h create mode 100644 src/nonltr/ChromDetector.cpp create mode 100644 src/nonltr/ChromDetector.h create mode 100644 src/nonltr/ChromDetectorMaxima.cpp create mode 100644 src/nonltr/ChromDetectorMaxima.h create mode 100644 src/nonltr/ChromListMaker.cpp create mode 100644 src/nonltr/ChromListMaker.h create mode 100644 src/nonltr/Chromosome.cpp create mode 100644 src/nonltr/Chromosome.h create mode 100644 src/nonltr/ChromosomeOneDigit.cpp 
create mode 100644 src/nonltr/ChromosomeOneDigit.h create mode 100644 src/nonltr/ChromosomeRandom.cpp create mode 100644 src/nonltr/ChromosomeRandom.h create mode 100644 src/nonltr/DetectorMaxima.cpp create mode 100644 src/nonltr/DetectorMaxima.h create mode 100644 src/nonltr/EnrichmentMarkovView.cpp create mode 100644 src/nonltr/EnrichmentMarkovView.h create mode 100644 src/nonltr/HMM.cpp create mode 100644 src/nonltr/HMM.h create mode 100644 src/nonltr/IChromosome.h create mode 100644 src/nonltr/ITableView.h create mode 100644 src/nonltr/KmerHashTable.cpp create mode 100644 src/nonltr/KmerHashTable.h create mode 100644 src/nonltr/LocationList.cpp create mode 100644 src/nonltr/LocationList.h create mode 100644 src/nonltr/LocationListCollection.cpp create mode 100644 src/nonltr/LocationListCollection.h create mode 100644 src/nonltr/Scanner.cpp create mode 100644 src/nonltr/Scanner.h create mode 100644 src/nonltr/Scorer.cpp create mode 100644 src/nonltr/Scorer.h create mode 100644 src/nonltr/TableBuilder.cpp create mode 100644 src/nonltr/TableBuilder.h create mode 100644 src/nonltr/Trainer.cpp create mode 100644 src/nonltr/Trainer.h create mode 100644 src/utility/AffineId.cpp create mode 100644 src/utility/AffineId.h create mode 100644 src/utility/EmptyLocation.cpp create mode 100644 src/utility/EmptyLocation.h create mode 100644 src/utility/GlobAlignE.cpp create mode 100644 src/utility/GlobAlignE.h create mode 100644 src/utility/ILocation.h create mode 100644 src/utility/LCSLen.cpp create mode 100644 src/utility/LCSLen.h create mode 100644 src/utility/Location.cpp create mode 100644 src/utility/Location.h create mode 100644 src/utility/Util.cpp create mode 100644 src/utility/Util.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2e611c1 --- /dev/null +++ b/Makefile @@ -0,0 +1,19 @@ +all: bin/Red.o bin/meshclust2 + +bin/Red.o: + mkdir -p bin + mkdir -p bin/exception + mkdir -p bin/nonltr + mkdir -p bin/utility + $(MAKE) -C src +bin/meshclust2: 
bin/Red.o + $(MAKE) -C src/cluster + cp src/cluster/meshclust2 bin + +clean: + $(MAKE) clean -C src + $(MAKE) clean -C src/cluster + $(RM) -r bin + +rebuild: clean all +.PHONY: all clean diff --git a/README b/README new file mode 100644 index 0000000..7b3e7cd --- /dev/null +++ b/README @@ -0,0 +1,74 @@ +MeShClust2 +Release version + +Requirements: g++ 4.9.1 or later, requires Homebrew on Mac OS X + +Compilation using g++ (homebrew) and GNU Make on Mac OS X +CXX=g++-7 make + +see: https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite + + +Linux/Unix compilation: +make + +Usage: bin/meshclust2 --id 0.x [OPTIONS] *.fasta + +--id The most important parameter, --id, controls the identity cutoff of the sequences. + Needs to be between 0 and 1. + If it is not specified, an identity of 0.9 is used. + +--kmer decides the size of the kmers. It is by default automatically decided by average sequence length, + but if provided, MeShClust can speed up a little by not having to find the largest sequence length. + Increasing kmer size can increase accuracy, but increases memory consumption. + +--mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} + changes the mutation generation algorithm. By default, "single" is used, utilizing only + single point mutations. On low identity data sets, "both", which includes single mutations + and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations, + disallowing single point mutations. Other options include "all", which includes single, + block, and nontypical mutations translocation and reversion. + +--feat determines the combinations of features to be used. By default, "fast" allows 9 fast combinations + to be selected from. "slow" adds 2 slower features which include logarithm based features, + and "extraslow" includes 33 total features used in a previous study. 
+ +--min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs + will be used. Recall that features include pairwise combinations of the "feat" option. + +--max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appear quickly, + so a very large maximum is not advised. + +--sample selects the total number of sequences used for both training and testing. + 300 is the default value. Each sequence generates 10 synthetic mutants. + That is, --sample 300 provides 3000 training pairs and 3000 testing pairs. + +--min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. Shouldn't need + to be set normally, as lower identities take much longer, especially with single mutations only. + +--threads sets the number of threads to be used. By default OpenMP uses the number of available cores + on your machine, but this parameter overrides that. + +--output specifies the output file, in CD-HIT's CLSTR format, described below: + A '>Cluster ' followed by an increasing index designates a cluster. + Otherwise, the sequence is printed out. + A '*' at the end of a sequence designates the center of the cluster. + An example of a small data set: + + >Cluster 0 + 0 993nt, >seq128 template_6... * + >Cluster 1 + 0 1043nt, >seq235 template_10... + 1 1000nt, >seq216 template_10... * + 2 1015nt, >seq237 template_10... + + +--delta decides how many clusters are looked around in the final clustering stage. + Increasing it creates more accuracy, but takes more time. Default value is 5. + +--iterations specifies how many iterations in the final stage of merging are done until convergence. + Default value is 15. + + + +If the argument is not listed here, it is interpreted as an input (FASTA format) file. 
diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..3013ed0 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,175 @@ +# CXX = /usr/bin/c++ +CXX ?= g++ + +CXXFLAGS = -O3 -g -fmessage-length=0 -Wall -march=native -std=c++11 + +# +# Objects +# + +ORed = ../bin/Red.o + +# Exception +OInvalidInputException = ../bin/exception/InvalidInputException.o +OInvalidStateException = ../bin/exception/InvalidStateException.o +OFileDoesNotExistException = ../bin/exception/FileDoesNotExistException.o +OInvalidOrderOfOperationsException = ../bin/exception/InvalidOrderOfOperationsException.o +OInvalidScoreException = ../bin/exception/InvalidScoreException.o +OInvalidOperationException = ../bin/exception/InvalidOperationException.o + +# Utility +OUtil = ../bin/utility/Util.o +OLocation = ../bin/utility/Location.o +OEmptyLocation = ../bin/utility/EmptyLocation.o +OLCSLen = ../bin/utility/LCSLen.o +OAffineId = ../bin/utility/AffineId.o +OGlobAlignE = ../bin/utility/GlobAlignE.o + +# Non TR +OChromosome = ../bin/nonltr/Chromosome.o +OChromosomeOneDigit = ../bin/nonltr/ChromosomeOneDigit.o +OChromosomeRandom = ../bin/nonltr/ChromosomeRandom.o +OChromListMaker = ../bin/nonltr/ChromListMaker.o +OTableBuilder = ../bin/nonltr/TableBuilder.o +OScorer = ../bin/nonltr/Scorer.o +ODetectorMaxima = ../bin/nonltr/DetectorMaxima.o +OChromDetectorMaxima = ../bin/nonltr/ChromDetectorMaxima.o +OHMM = ../bin/nonltr/HMM.o +OScanner = ../bin/nonltr/Scanner.o +OTrainer = ../bin/nonltr/Trainer.o +OLocationList = ../bin/nonltr/LocationList.o +OLocationListCollection = ../bin/nonltr/LocationListCollection.o + +OBJS = $(ORed) $(OInvalidInputException) $(OInvalidStateException) $(OFileDoesNotExistException) $(OInvalidOrderOfOperationsException) $(OInvalidOperationException) $(OInvalidScoreException) $(OUtil) $(OLocation) $(OEmptyLocation) $(OChromosome) $(OChromosomeOneDigit) $(OChromosomeRandom) $(OChromListMaker) $(OTableBuilder) $(OScorer) $(ODetectorMaxima) $(OChromDetector) 
$(OChromDetectorMaxima) $(OHMM) $(OScanner) $(OTrainer) $(OLocationList) $(OLocationListCollection) $(OLCSLen) $(OAffineId) $(OGlobAlignE) + +# +# Target +# + +TRed = ../bin/Red + +# +# Make RepeatsDetector +# + +$(TRed): $(OBJS) + $(CXX) -o $(TRed) $(OBJS) + +# +# RepeatsDetector +# + +$(ORed): RepeatsDetector.cpp nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/TableBuilder.h nonltr/HMM.h nonltr/Scanner.h nonltr/Trainer.h utility/Util.h + $(CXX) $(CXXFLAGS) -c RepeatsDetector.cpp -o $(ORed) + +# +# Exception +# +$(OInvalidInputException): exception/InvalidInputException.cpp exception/InvalidInputException.h + $(CXX) $(CXXFLAGS) -c exception/InvalidInputException.cpp -o $(OInvalidInputException) + +$(OInvalidStateException): exception/InvalidStateException.cpp exception/InvalidStateException.h + $(CXX) $(CXXFLAGS) -c exception/InvalidStateException.cpp -o $(OInvalidStateException) + +$(OFileDoesNotExistException): exception/FileDoesNotExistException.cpp exception/FileDoesNotExistException.h + $(CXX) $(CXXFLAGS) -c exception/FileDoesNotExistException.cpp -o $(OFileDoesNotExistException) + +$(OInvalidOrderOfOperationsException): exception/InvalidOrderOfOperationsException.cpp exception/InvalidOrderOfOperationsException.h + $(CXX) $(CXXFLAGS) -c exception/InvalidOrderOfOperationsException.cpp -o $(OInvalidOrderOfOperationsException) + +$(OInvalidScoreException): exception/InvalidScoreException.cpp exception/InvalidScoreException.h + $(CXX) $(CXXFLAGS) -c exception/InvalidScoreException.cpp -o $(OInvalidScoreException) + +$(OInvalidOperationException): exception/InvalidOperationException.cpp exception/InvalidOperationException.h + $(CXX) $(CXXFLAGS) -c exception/InvalidOperationException.cpp -o $(OInvalidOperationException) + +# +# Utility +# + +$(OUtil): utility/Util.cpp utility/Util.h utility/Location.h exception/FileDoesNotExistException.h + $(CXX) $(CXXFLAGS) -c utility/Util.cpp -o $(OUtil) + +$(OLocation): utility/Location.cpp utility/Location.h 
utility/ILocation.h exception/InvalidInputException.h utility/Util.h + $(CXX) $(CXXFLAGS) -c utility/Location.cpp -o $(OLocation) + +$(OEmptyLocation): utility/EmptyLocation.cpp utility/EmptyLocation.h utility/ILocation.h exception/InvalidOperationException.h + $(CXX) $(CXXFLAGS) -c utility/EmptyLocation.cpp -o $(OEmptyLocation) + +$(OLCSLen): utility/LCSLen.cpp utility/LCSLen.h + $(CXX) $(CXXFLAGS) -c utility/LCSLen.cpp -o $(OLCSLen) + +$(OAffineId): utility/AffineId.cpp utility/AffineId.h + $(CXX) $(CXXFLAGS) -c utility/AffineId.cpp -o $(OAffineId) + +$(OGlobAlignE): utility/GlobAlignE.cpp utility/GlobAlignE.h + $(CXX) $(CXXFLAGS) -c utility/GlobAlignE.cpp -o $(OGlobAlignE) +# +# Non LTR +# + +$(OChromosome): nonltr/Chromosome.cpp nonltr/Chromosome.h nonltr/IChromosome.h utility/Util.h exception/InvalidInputException.h exception/InvalidOperationException.h + $(CXX) $(CXXFLAGS) -c nonltr/Chromosome.cpp -o $(OChromosome) + +$(OChromosomeOneDigit): nonltr/ChromosomeOneDigit.cpp nonltr/ChromosomeOneDigit.h nonltr/Chromosome.h exception/InvalidInputException.h + $(CXX) $(CXXFLAGS) -c nonltr/ChromosomeOneDigit.cpp -o $(OChromosomeOneDigit) + +$(OChromosomeRandom): nonltr/ChromosomeRandom.cpp nonltr/ChromosomeRandom.h nonltr/IChromosome.h exception/InvalidInputException.h exception/InvalidStateException.h utility/Util.h + $(CXX) $(CXXFLAGS) -c nonltr/ChromosomeRandom.cpp -o $(OChromosomeRandom) + +$(OTableBuilder): nonltr/TableBuilder.cpp nonltr/TableBuilder.h utility/Util.h nonltr/ChromosomeOneDigit.h nonltr/ITableView.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/EnrichmentMarkovView.h nonltr/EnrichmentMarkovView.cpp exception/InvalidStateException.h nonltr/ChromListMaker.h nonltr/IChromosome.h + $(CXX) $(CXXFLAGS) -c nonltr/TableBuilder.cpp -o $(OTableBuilder) + +$(OScorer): nonltr/Scorer.cpp nonltr/Scorer.h nonltr/ChromosomeOneDigit.h utility/Util.h exception/InvalidStateException.h + $(CXX) $(CXXFLAGS) -c nonltr/Scorer.cpp -o $(OScorer) + 
+$(ODetectorMaxima): nonltr/DetectorMaxima.cpp nonltr/DetectorMaxima.h utility/ILocation.h exception/InvalidStateException.h + $(CXX) $(CXXFLAGS) -c nonltr/DetectorMaxima.cpp -o $(ODetectorMaxima) + +$(OChromDetectorMaxima): nonltr/ChromDetectorMaxima.cpp nonltr/ChromDetectorMaxima.h nonltr/DetectorMaxima.h nonltr/ChromosomeOneDigit.h utility/Util.h utility/ILocation.h utility/Location.h + $(CXX) $(CXXFLAGS) -c nonltr/ChromDetectorMaxima.cpp -o $(OChromDetectorMaxima) + +$(OHMM): nonltr/HMM.cpp nonltr/HMM.h utility/ILocation.h exception/InvalidStateException.h exception/InvalidInputException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h + $(CXX) $(CXXFLAGS) -c nonltr/HMM.cpp -o $(OHMM) + +$(OScanner): nonltr/Scanner.cpp nonltr/Scanner.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h nonltr/HMM.h nonltr/ITableView.h nonltr/Scorer.h utility/Util.h utility/ILocation.h exception/InvalidInputException.h exception/InvalidStateException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h + $(CXX) $(CXXFLAGS) -c nonltr/Scanner.cpp -o $(OScanner) + +$(OTrainer): nonltr/Trainer.cpp nonltr/Trainer.h nonltr/TableBuilder.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/HMM.h nonltr/ChromDetectorMaxima.h nonltr/Scorer.h nonltr/ChromListMaker.h utility/Util.h nonltr/LocationListCollection.h + $(CXX) $(CXXFLAGS) -c nonltr/Trainer.cpp -o $(OTrainer) + +$(OChromListMaker): nonltr/ChromListMaker.cpp nonltr/ChromListMaker.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h utility/Util.h + $(CXX) $(CXXFLAGS) -c nonltr/ChromListMaker.cpp -o $(OChromListMaker) + +$(OCluster): nonltr/Cluster.cpp nonltr/Cluster.h utility/Util.h exception/InvalidStateException.h exception/InvalidInputException.h + $(CXX) $(CXXFLAGS) -c nonltr/Cluster.cpp -o $(OCluster) + +$(OLocationList): nonltr/LocationList.cpp nonltr/LocationList.h utility/ILocation.h utility/Location.h exception/InvalidStateException.h + $(CXX) $(CXXFLAGS) -c 
nonltr/LocationList.cpp -o $(OLocationList) + +$(OLocationListCollection): nonltr/LocationListCollection.cpp nonltr/LocationListCollection.h utility/Location.h exception/InvalidStateException.h + $(CXX) $(CXXFLAGS) -c nonltr/LocationListCollection.cpp -o $(OLocationListCollection) + + +# +# Make binary directories +# + +red: $(TRed) + +# +# Make Red +# + +bin: + mkdir ../bin + mkdir ../bin/exception + mkdir ../bin/utility + mkdir ../bin/nonltr + +# +# Make clean +# + +clean: + rm -f ../bin/*.o ../bin/exception/*.o ../bin/ms/*.o ../bin/nonltr/*.o ../bin/test/*.o ../bin/utility/*.o ../bin/tr/*.o *.o $(TRed) diff --git a/src/RepeatsDetector.cpp b/src/RepeatsDetector.cpp new file mode 100644 index 0000000..443cf24 --- /dev/null +++ b/src/RepeatsDetector.cpp @@ -0,0 +1,583 @@ +//============================================================================ +// Name : RepeatsDetector.cpp +// Author : Hani Zakaria Girgis, PhD +// Version : +// Description : Red (RepeatsDetector) +//============================================================================ +#include +#include +#include +#include +#include +#include +#include + +#include "nonltr/Trainer.h" +#include "nonltr/KmerHashTable.h" +#include "nonltr/TableBuilder.h" +#include "nonltr/HMM.h" +#include "nonltr/Scanner.h" +#include "nonltr/ChromListMaker.h" +#include "utility/Util.h" + +using namespace std; +using namespace nonltr; +using namespace utility; +using namespace exception; + +/** + * Parameters + */ +// Required parameters +const static string LEN_PRM = string("-len"); // k - length of the motif. + +// Train and Scan the whole genome +const static string GNM_PRM = string("-gnm"); // Train and scan. +const static string ORD_PRM = string("-ord"); // order of background markov chain. +const static string GAU_PRM = string("-gau"); // Half width of the Gaussian mask. 
+const static string THR_PRM = string("-thr"); // The threshold part of the definition of non-repeats +const static string MIN_PRM = string("-min"); // The minimum number of observations + +// Scan using pre-calculated scores and a trained HMM +const static string HMI_PRM = string("-hmi"); // File including the trained model +const static string SEQ_PRM = string("-seq"); // File including the sequence +const static string SCI_PRM = string("-sci"); // File including the scores of the sequence + +// Output options with -gnm only +const static string TBL_PRM = string("-tbl"); // Write the k-mer to the provided file +const static string SCO_PRM = string("-sco"); // Write the scores to the provided directory +const static string HMO_PRM = string("-hmo"); // The Markov model is written to this file. +const static string CND_PRM = string("-cnd"); // Write candidate region to a directory + +// Output options with -gnm and -hmm +const static string MSK_PRM = string("-msk"); // Write masked sequence(s) to file or directory +const static string RPT_PRM = string("-rpt"); // Write coordinates to file or directory +const static string DIR_PRM = string("-dir"); // Read additional sequences(.fa) or scores (.sc) under directory +const static string FRM_PRM = string("-frm"); // Format of the output +void drive(map * const param) { + // Delete old output files + if (param->count(MSK_PRM) > 0) { + if (param->count(GNM_PRM) > 0) { + cout << "Deleting pre-existing files under " << param->at(MSK_PRM); + cout << endl; + Util::deleteFilesUnderDirectory(param->at(MSK_PRM)); + } else if (param->count(HMI_PRM) > 0) { + cout << "Deleting pre-existing " << param->at(MSK_PRM) << endl; + Util::deleteFile(param->at(MSK_PRM)); + } + } + + if (param->count(RPT_PRM) > 0) { + if (param->count(GNM_PRM) > 0) { + cout << "Deleting pre-existing files under " << param->at(RPT_PRM); + cout << endl; + Util::deleteFilesUnderDirectory(param->at(RPT_PRM)); + } else if (param->count(HMI_PRM) > 0) { + cout << "Deleting pre-existing 
" << param->at(RPT_PRM) << endl; + Util::deleteFile(param->at(RPT_PRM)); + } + } + + if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) > 0) { + cout << "Deleting pre-existing files under " << param->at(SCO_PRM); + cout << endl; + Util::deleteFilesUnderDirectory(param->at(SCO_PRM)); + } + + if (param->count(HMO_PRM) > 0 && param->count(GNM_PRM) > 0) { + cout << "Deleting pre-existing " << param->at(HMO_PRM) << endl; + Util::deleteFile(param->at(HMO_PRM)); + } + + if (param->count(TBL_PRM) > 0 && param->count(GNM_PRM) > 0) { + cout << "Deleting pre-existing " << param->at(TBL_PRM) << endl; + Util::deleteFile(param->at(TBL_PRM)); + } + + // Process the input + int k = atoi(param->at(LEN_PRM).c_str()); + + if (param->count(GNM_PRM) > 0) { + string genomeDir = param->at(GNM_PRM); + int order = atoi(param->at(ORD_PRM).c_str()); + double s = atoi(param->at(GAU_PRM).c_str()); + double t = atoi(param->at(THR_PRM).c_str()); + int minObs = atoi(param->at(MIN_PRM).c_str()); + + // Adjust the threshold when it is one because of the log base. + if (((int) t) == 1) { + t = 1.5; + cout << "The base of the logarithmic function is adjusted." << endl; + } + + + // This part or the next + Trainer * trainer; + if (param->count(CND_PRM) > 0) { + trainer = new Trainer(genomeDir, order, k, s, t, param->at(CND_PRM), minObs); + } else { + trainer = new Trainer(genomeDir, order, k, s, t, minObs); + } + + + if (param->count(TBL_PRM)) { + cout << "Printing the count of the kmer's to: "; + cout << param->at(TBL_PRM) << endl; + trainer->printTable(param->at(TBL_PRM)); + } + + if (param->count(HMO_PRM) > 0) { + cout << "Printing the HMM to: " << endl; + cout << param->at(HMO_PRM) << endl; + trainer->printHmm(param->at(HMO_PRM)); + } + + // Stage 3: Scan + cout << endl << endl; + cout << "Stage 4: Scanning ..." 
<< endl; + vector * fileList = new vector(); + Util::readChromList(genomeDir, fileList, string("fa")); + if (param->count(DIR_PRM) > 0) { + Util::readChromList(param->at(DIR_PRM), fileList, string("fa")); + } + + int chromCount = fileList->size(); + for (int i = 0; i < chromCount; i++) { + cout << "Scanning: " << fileList->at(i) << endl; + + // Output file name + string path(fileList->at(i)); + int slashLastIndex = path.find_last_of(Util::fileSeparator); + int dotLastIndex = path.find_last_of("."); + string nickName = path.substr(slashLastIndex + 1, dotLastIndex - slashLastIndex - 1); + + // Process each sequence with the ith file + ChromListMaker * maker = new ChromListMaker(fileList->at(i)); + const vector * chromList = maker->makeChromOneDigitList(); + + ChromListMaker * oMaker = new ChromListMaker(fileList->at(i)); + const vector * oChromList; + if (param->count(MSK_PRM) > 0) { + oChromList = oMaker->makeChromList(); + } + + for (int h = 0; h < chromList->size(); h++) { + ChromosomeOneDigit * chrom = dynamic_cast(chromList->at(h)); + + // Scan the forward strand + Scanner * scanner = new Scanner(trainer->getHmm(), k, chrom,trainer->getTable()); + + // Scan the reverse complement + chrom->makeRC(); + Scanner * scannerRC = new Scanner(trainer->getHmm(), k, chrom, trainer->getTable()); + scannerRC->makeForwardCoordinates(); + scanner->mergeWithOtherRegions(scannerRC->getRegionList()); + delete scannerRC; + chrom->makeRC(); + + + // Scan the reverse + chrom->makeR(); + Scanner * scannerR = new Scanner(trainer->getHmm(), k, chrom, trainer->getTable()); + scannerR->makeForwardCoordinates(); + scanner->mergeWithOtherRegions(scannerR->getRegionList()); + delete scannerR; + + //@@ The chromosome now has the sequence of the reverse strand + // The actual strand is calculated if the user requested the scores. + + // Print according to the user's requests + bool canAppend = (h == 0) ? 
false : true; + + if (param->count(SCO_PRM) > 0) { + // Calculate the forward strand from the reverse + chrom->makeR(); + + string scoFile = param->at(SCO_PRM) + Util::fileSeparator + nickName + ".scr"; + if (!canAppend) { + cout << "Printing scores to: " << scoFile << endl; + } + // Make sure to print the original E-values not their logarithm + Scorer * scorer = new Scorer(chrom, trainer->getTable()); + scorer->printScores(scoFile, canAppend); + delete scorer; + } + + if (param->count(RPT_PRM) > 0) { + string rptFile = param->at(RPT_PRM) + Util::fileSeparator + nickName + ".rpt"; + if (!canAppend) { + cout << "Printing locations to: " << rptFile << endl; + } + scanner->printIndex(rptFile, canAppend, atoi(param->at(FRM_PRM).c_str())); + } + + if (param->count(MSK_PRM) > 0) { + string mskFile = param->at(MSK_PRM) + Util::fileSeparator + nickName + ".msk"; + if (!canAppend) { + cout << "Printing masked sequence to: " << mskFile << endl; + } + Chromosome * oChrom = oChromList->at(h); + scanner->printMasked(mskFile, *oChrom, canAppend); + } + + // Free memory + delete scanner; + } + + delete maker; + delete oMaker; + } + + // Free memory + fileList->clear(); + delete fileList; + delete trainer; + } else if (param->count(HMI_PRM) > 0) { + HMM * hmm = new HMM(param->at(HMI_PRM)); + + string chromFile = param->at(SEQ_PRM); + string scoresFile = param->at(SCI_PRM); + + ChromosomeOneDigit * chrom = new ChromosomeOneDigit(chromFile); + Scanner * scanner = new Scanner(hmm, k, chrom, scoresFile); + + if (param->count(RPT_PRM) > 0) { + string rptFile = param->at(RPT_PRM); + cout << "Printing locations to: " << rptFile << endl; + scanner->printIndex(rptFile, false, atoi(param->at(FRM_PRM).c_str())); + } + + if (param->count(MSK_PRM) > 0) { + string mskFile = param->at(MSK_PRM); + cout << "Printing masked sequence to: " << mskFile << endl; + Chromosome oChrom(chromFile); + scanner->printMasked(mskFile, oChrom, false); + } + + // Free memory + delete scanner; + delete chrom; + 
delete hmm; + } +} + +int main(int argc, char * argv[]) { + cout << endl << endl; + cout << "This is Red (REpeat Detector) designed and developed by "; + cout << "Hani Zakaria Girgis, PhD." << endl << endl; + + cout << "Version: 05/22/2015" << endl << endl; + + string message = string("Valid argument pairs:\n"); + + message.append("\t-gnm input genome directory, required.\n"); + message.append("\t\tFiles with \".fa\" extension in this directory are used for completing the table of the adjusted counts.\n"); + message.append("\t\tThese Files are scanned for repeats.\n"); + message.append("\t-dir directory including additional input sequences, optional.\n"); + message.append("\t\tFiles with \".fa\" extension in this directory are NOT used for completing the table.\n"); + message.append("\t\tThese Files MUST have different names from those in the genome directory.\n"); + message.append("\t\tThese Files are scanned for repeats.\n"); + + + message.append("\t-len word length equals k defining the k-mer. The default is floor(log_4(genome size)).\n"); + message.append("\t-ord order of the background Markov chain. The default is floor(k/2)-1.\n"); + message.append("\t-gau half width of the mask. The default is based on the GC content.\n"); + message.append("\t\t20 if the GC content > 33% and < 67%, 40 otherwise.\n"); + + message.append("\t-thr the threshold score of the low adjusted scores of non-repeats. The default is 2.\n"); + message.append("\t-min the minimum number of the observed k-mers. 
The default is 3.\n"); + message.append("\t-tbl file where the table of the adjusted counts is written, optional.\n"); + message.append("\t-sco directory where scores are saved, optional.\n"); + message.append("\t\tScore files have the \".scr\" extension.\n"); + + message.append("\t-cnd directory where candidate regions are saved, optional.\n"); + message.append("\t\tCandidates files have the \".cnd\" extension.\n"); + message.append("\t-rpt directory where repeats locations are saved, optional.\n"); + message.append("\t\tRepeats files have the \".rpt\" extension.\n"); + message.append("\t-msk directory where masked sequences are saved, optional.\n"); + message.append("\t\tMasked sequences files have the \".msk\" extension.\n"); + + message.append("\t-frm the format of the output: 1 (chrName:start-end) or 2 (chrName\tstart\tend).\n"); + message.append("\t\tThe output format are zero based and the end is exclusive.\n"); + + message.append("\t-hmo file where the HMM is saved, optional.\n\n"); + + message.append("Examples:\n"); + message.append("\tThe following command runs Red with the defaults and generates the masked sequences.\n"); + message.append("\tRed -gnm genome_directory -msk output_directory\n\n"); + message.append("\tThe following command runs Red with the defaults and generates the masked sequences and the locations of repeats.\n"); + message.append("\tRed -gnm genome_directory -msk output_directory -rpt output_directory\n\n"); + + // Table of valid argument pairs + map * validParam = new map(); + validParam->insert(map::value_type(LEN_PRM, "DUMMY")); + validParam->insert(map::value_type(GNM_PRM, "DUMMY")); + validParam->insert(map::value_type(ORD_PRM, "DUMMY")); + validParam->insert(map::value_type(GAU_PRM, "DUMMY")); + validParam->insert(map::value_type(THR_PRM, "DUMMY")); + validParam->insert(map::value_type(HMI_PRM, "DUMMY")); + validParam->insert(map::value_type(SEQ_PRM, "DUMMY")); + validParam->insert(map::value_type(SCI_PRM, "DUMMY")); + 
validParam->insert(map::value_type(TBL_PRM, "DUMMY")); + validParam->insert(map::value_type(SCO_PRM, "DUMMY")); + validParam->insert(map::value_type(HMO_PRM, "DUMMY")); + validParam->insert(map::value_type(MSK_PRM, "DUMMY")); + validParam->insert(map::value_type(RPT_PRM, "DUMMY")); + validParam->insert(map::value_type(CND_PRM, "DUMMY")); + validParam->insert(map::value_type(DIR_PRM, "DUMMY")); + validParam->insert(map::value_type(MIN_PRM, "DUMMY")); + validParam->insert(map::value_type(FRM_PRM, "DUMMY")); + + // Make a table of the user provided arguments + map * param = new map(); + if (argc > 1 && argc % 2 == 1) { + for (int i = 1; i < argc - 1; i += 2) { + if (validParam->count(argv[i]) > 0) { + param->insert(map::value_type(argv[i], argv[i + 1])); + } else { + cerr << "Invalid argument: " << argv[i] << " " << argv[i + 1]; + cerr << endl; + cerr << message << endl; + return 1; + } + } + + + // Check if the user provided the essential arguments + + + if (param->count(LEN_PRM) == 0) { + if (param->count(GNM_PRM) > 0) { + // Calculate the size of the genome + long genomeLength = 0; + vector * fileList = new vector(); + Util::readChromList(param->at(GNM_PRM), fileList, "fa"); + cout << "Calculating the length, k, of the k-mer "; + cout << "based on the input genome ... " << endl; + for (int i = 0; i < fileList->size(); i++) { + ChromListMaker * maker = new ChromListMaker(fileList->at(i)); + const vector * chromList = maker->makeChromList(); + for (int h = 0; h < chromList->size(); h++) { + genomeLength += chromList->at(h)->getEffectiveSize(); + } + delete maker; + } + fileList->clear(); + delete fileList; + + double temp = log(genomeLength) / log(4.0); + + int k = floor(temp); + cout << "The recommended k is " << k << "." 
<< endl; + if (k > 15) { + cout << "Due to a memory constraint, k is set to 15."; + cout << endl; + k = 15; + } + + if (k < 12) { + cout<< "Due to a statistical consideration, k is set to 12."; + cout << endl; + k = 12; + } + cout << endl; + + string kString = Util::int2string(k); + param->insert(map::value_type(LEN_PRM, kString)); + + } else { + cerr << "The word length is required." << endl; + cerr << message << endl; + return 1; + } + } + + if(param->count(FRM_PRM) == 0){ + cout << "Using the default output format chrName:start-end" << endl; + param->insert(map::value_type(FRM_PRM, Util::int2string(Scanner::FRMT_POS))); + } else { + if (atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_POS && atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_BED) { + cerr << "The output format must be " << Scanner::FRMT_POS << " or "; + cerr << Scanner::FRMT_BED << ". The format received is " ; + cerr << param->at(FRM_PRM) << "." << endl; + return 1; + } + } + + if (param->count(GNM_PRM) > 0) { + Util::checkFile(param->at(GNM_PRM)); + + if (param->count(ORD_PRM) == 0) { + double k = atoi(param->at(LEN_PRM).c_str()); + int o = floor(k / 2.0) - 1; + + cout << "Using the default background order: " << o << "."; + cout << endl; + + string oString = Util::int2string(o); + param->insert(map::value_type(ORD_PRM, oString)); + } + + if (param->count(THR_PRM) == 0) { + cout << "Using the default threshold: 2." << endl; + param->insert(map::value_type(THR_PRM, string("2"))); + } else { + if (atoi(param->at(THR_PRM).c_str()) < 1) { + cerr << "The threshold cannot be less than 1."; + cerr << endl; + cerr << message << endl; + return 1; + } + } + + if (param->count(MIN_PRM) == 0) { + cout << "Using the default minimum of the observed count of k-mers: 3." 
<< endl; + param->insert(map::value_type(MIN_PRM, string("3"))); + } else { + if (atoi(param->at(MIN_PRM).c_str()) < 0) { + cerr << "The minimum of the observed count of k-mers cannot be less than 0."; + cerr << endl; + cerr << message << endl; + return 1; + } + } + + if (param->count(GAU_PRM) == 0) { + cout << "Calculating GC content ..." << endl; + + // 1: Count the gc content of the input genome + long genomeLength = 0; + long genomeGc = 0; + vector * fileList = new vector(); + Util::readChromList(param->at(GNM_PRM), fileList, "fa"); + for (int i = 0; i < fileList->size(); i++) { + ChromListMaker * maker = new ChromListMaker(fileList->at(i)); + const vector * chromList = maker->makeChromList(); + + for (int h = 0; h < chromList->size(); h++) { + genomeGc += chromList->at(h)->getGcContent(); + genomeLength += chromList->at(h)->getEffectiveSize(); + } + delete maker; + } + fileList->clear(); + delete fileList; + + // 2: Calculate the gc content of the input genome + double gc = 100.00 * genomeGc / genomeLength; + int w = 20; + if (gc < 33 || gc > 67) { + w = 40; + } + cout << "Using the default half width: " << w; + cout << " based on the GC content of " << gc << endl; + string wString = Util::int2string(w); + param->insert(map::value_type(GAU_PRM, wString)); + } + } else if (param->count(HMI_PRM) > 0) { + Util::checkFile(param->at(HMI_PRM)); + + if (param->count(SEQ_PRM) == 0) { + cerr << "The sequence file is required."; + cerr << endl; + cerr << message << endl; + return 1; + } else { + Util::checkFile(param->at(SEQ_PRM)); + } + + if (param->count(SCI_PRM) == 0) { + cerr << "The scores file is required."; + cerr << endl; + cerr << message << endl; + return 1; + } else { + Util::checkFile(param->at(SCI_PRM)); + } + + } else { + cerr << "A mode is required: training and scanning (-gnm) or "; + cerr << "scanning only (-hmi)." 
<< endl; + cerr << message << endl; + return 1; + } + + // Check optional parameters + if (param->count(TBL_PRM) > 0 && param->count(GNM_PRM) == 0) { + cerr << "Printing the k-mer table is optional with -gnm only."; + cerr << endl; + cerr << message << endl; + return 1; + } + + if (param->count(HMO_PRM) > 0 && param->count(GNM_PRM) == 0) { + cerr << "Printing the HMM is optional with -gnm only."; + cerr << endl; + cerr << message << endl; + return 1; + } + + if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) == 0) { + cerr << "Printing the scores is optional with -gnm only."; + cerr << endl; + cerr << message << endl; + return 1; + } else if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) > 0) { + Util::checkFile(param->at(SCO_PRM)); + } + + + if (param->count(CND_PRM) > 0 && param->count(GNM_PRM) == 0) { + cerr << "Printing candidate regions is optional with -gnm only."; + cerr << endl; + cerr << message << endl; + return 1; + } else if (param->count(CND_PRM) > 0 && param->count(GNM_PRM) > 0) { + Util::checkFile(param->at(CND_PRM)); + } + + + if (param->count(DIR_PRM) > 0 && param->count(GNM_PRM) == 0) { + cerr << "Processing additional sequences is optional with -gnm only."; + cerr << endl; + cerr << message << endl; + return 1; + } else if (param->count(DIR_PRM) > 0 && param->count(GNM_PRM) > 0) { + Util::checkFile(param->at(DIR_PRM)); + } + + if (param->count(MSK_PRM) > 0 && param->count(GNM_PRM) > 0) { + Util::checkFile(param->at(MSK_PRM)); + } + + if (param->count(RPT_PRM) > 0 && param->count(GNM_PRM) > 0) { + Util::checkFile(param->at(RPT_PRM)); + } + + // Print out the parameters table + typedef map myMap; + myMap::iterator sIter = param->begin(); + myMap::iterator eIter = param->end(); + cout << endl << "List of final parameters: " << endl; + while (sIter != eIter) { + cout << (*sIter).first << ": " << (*sIter).second << endl; + sIter++; + } + cout << endl; + + // Start! + drive(param); + + // Clear parameters when done. 
+ param->clear(); + delete param; + } else { + cerr << "Argument pairs of the form: -flag value are required."; + cerr << endl; + cerr << message << endl; + } + + //return EXIT_SUCCESS; + return 0; +} diff --git a/src/cluster/Makefile b/src/cluster/Makefile new file mode 100644 index 0000000..817559a --- /dev/null +++ b/src/cluster/Makefile @@ -0,0 +1,31 @@ +TARGET ?= meshclust2 +VERSION ?= 2.0.0 +CXX ?= g++ +ifeq ($(debug),yes) + CXXFLAGS += -ggdb -DDEBUG -fno-omit-frame-pointer -fopenmp +else + CXXFLAGS += -fopenmp -O3 -march=native -g +endif +CXXFLAGS += -std=c++11 -DVERSION=\"$(VERSION)\" +LDFLAGS += -lm + +SOURCES := $(shell find ./src -name '*.cpp') +OBJECTS = $(SOURCES:%.cpp=bin/%.o) +BIN_OBJECTS := $(shell find ../../bin/ -mindepth 2 -name '*.o') + +all: clean $(TARGET) + +$(TARGET): $(OBJECTS) $(BIN_OBJECTS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + +bin/%.o: %.cpp + mkdir -p $(@D) + $(CXX) $(CXXFLAGS) -c $< -o $@ + +clean: + $(RM) $(OBJECTS) $(TARGET) + +install: $(TARGET) + cp $(TARGET) ~/bin + +.PHONY: all clean install diff --git a/src/cluster/src/Center.h b/src/cluster/src/Center.h new file mode 100644 index 0000000..8c2acc5 --- /dev/null +++ b/src/cluster/src/Center.h @@ -0,0 +1,42 @@ +/* -*- C++ -*- + * + * Center.h + * + * Author: Benjamin T James + */ +#ifndef CENTER_H +#define CENTER_H + +#include "Point.h" + +template +struct Center { + Center(Point* c, const vector*> &pts) : center(c->clone()), points(pts), is_to_delete(false) { + } + Center(const Center &cc) : center(cc.center->clone()), points(cc.points), is_to_delete(cc.is_to_delete) {} + + + // Center(const Center& c) { + // center = c.get_clone(); + // points = c.getPoints_c(); + // is_to_delete = c.is_delete(); + // } + ~Center() { if (is_to_delete) { delete center; }} + + Point* getCenter() { return center; } + vector*> &getPoints() { return points; } + + const vector*> &getPoints_c() const { return points; }; + bool is_delete() const { return is_to_delete; } + void lazy_remove() { 
is_to_delete = true; } + size_t size() const { return points.size(); } + bool empty() const { return points.empty(); } + Point* get_clone() const { + return center->clone(); + } + Point *center; + vector*> points; + bool is_to_delete; +}; + +#endif diff --git a/src/cluster/src/ClusterFactory.cpp b/src/cluster/src/ClusterFactory.cpp new file mode 100644 index 0000000..741a325 --- /dev/null +++ b/src/cluster/src/ClusterFactory.cpp @@ -0,0 +1,1024 @@ +/* -*- C++ -*- + * + * ClusterFactory.cpp + * + * Author: Benjamin T James + */ + +#ifndef HEADER_HACK +#include "ClusterFactory.h" +#endif + +#include +#include +#include +#include +#include +#include +#include "Histogram.h" +#include "../../nonltr/KmerHashTable.h" +#include "../../nonltr/ChromListMaker.h" +#include "DivergencePoint.h" +#include "Center.h" +#include "Progress.h" +//#include + +template +T avg_distance(Point &c, const std::vector*> &vec) +{ + T dist = 0; + for (auto pt : vec) { + dist += pt->distance(c); + } + return dist / vec.size(); +} +template +Point* find_center(const std::vector*> &vec) +{ + Point* best = vec.front(); + T lowest = avg_distance(*best, vec); + for (int i = 1; i < vec.size(); i++) { + T dist = avg_distance(*vec[i], vec); + if (dist < lowest) { + best = vec[i]; + lowest = dist; + } + } + return best; +} + + + +template +void old_merge(vector*> ¢ers, map*,vector*>*> &clusters, T bandwidth) +{ + cout << "Merging points ... 
"; + cout.flush(); + vector*> new_centers; + vector*> to_delete; + for (int i = 0; i < centers.size(); i++) { + bool is_good = true; + for (int j = i + 1; j < centers.size(); j++) { + T dist = centers[i]->distance(*centers[j]); + if (dist < bandwidth) { + cout << "Merging centers " << centers[i]->get_header() << " and " << centers[j]->get_header() << endl; + for (auto p : *clusters[centers[i]]) { + clusters[centers[j]]->push_back(p); + } + delete clusters[centers[i]]; + clusters[centers[i]] = NULL; + centers[i]->set_to_delete(true); + to_delete.push_back(centers[i]); + delete centers[i]; + is_good = false; + break; + } + } + if (is_good) { + new_centers.push_back(centers[i]); + } + } + for (auto it = clusters.begin(); it != clusters.end(); it++) { + if ((*it).first->is_to_delete()) { + clusters.erase(it); + } + } + centers.clear(); + centers = new_centers; + cout << "Done" << endl; +} + +template +void sort_nn_func(std::vector *> &points, std::function&, const Point&)> func, std::function&, const Point&)> distfunc) { + if (points.empty()) { + return; + } + cout << "Sorting points... 
"; + cout.flush(); + list good; + int good_idx = 0; + for (int i = points.size() - 1; i > 0; i--) { // start at idx 1 bc we don't want to redelete it + good.push_front(i); + } + vector*> total_points; + total_points.push_back(points[0]); +// good.erase(good.begin()); + while (good.size() > 0) { + auto last = total_points.back(); + auto smallest = good.begin(); + uint64_t small_dist = std::numeric_limits::max();// / points[*smallest]->prob_under(*last); + int count = 0; + + for (auto i = good.begin(); i != good.end(); i++) { + if (func(*points[*i], *last)) { +// cout << "Breaking loop " << points[*i]->get_length() << " " << last->get_length() << " -> " << count << endl; + break; + } + uint64_t dist = distfunc(*points[*i],*last);// / points[*i]->prob_under(*last); + if (dist < small_dist) { + small_dist = dist; + smallest = i; + } + count++; + } +// cout << "Number of gaps: " << num_gaps << endl; + // if (func(*points[*smallest], *last)) { + // cout << "Gap " << points[*smallest]->get_length() << " " << last->get_length() << endl; + // } +// cout << "Sorting: " << points[*smallest]->get_header() << endl; + total_points.push_back(points[*smallest]); + // cout << points[*smallest]->get_header() << endl; + // if (total_points.size() % 100 == 0) { + // cout << "Size: " << total_points.size() << endl; + // } + good.erase(smallest); + } + assert(good.empty()); + assert(points.size() == total_points.size()); + points = total_points; + cout << "Done" << endl; +} + +template +void sort_nn_length(std::vector*> &points, double similarity) { + similarity *= 0.90; + if (points.empty()) { + return; + } + cout << "Sorting points by length... 
"; + cout.flush(); + list good; + int good_idx = 0; + for (int i = 1; i < points.size(); i++) { // start at idx 1 bc we don't want to redelete it + good.push_back(i); + } + vector*> total_points; + total_points.push_back(points[0]); +// good.erase(good.begin()); + bool working = true; + while (working && good.size() > 0) { + working = false; + auto last = total_points.back(); + auto smallest = good.begin(); + T small_dist = points[*smallest]->distance(*last);// / points[*smallest]->prob_under(*last); + for (auto i = good.begin(); i != good.end(); i++) { + double ratio = 100.0 * (double)points[*i]->get_length() / last->get_length(); + if (ratio < similarity) { + // cout << "Length ratio " << ratio << " is less than " << similarity << "." << endl; + break; + } + T dist = points[*i]->distance(*last);// / points[*i]->prob_under(*last); + if (dist < small_dist) { + small_dist = dist; + smallest = i; + } + } + total_points.push_back(points[*smallest]); + // cout << points[*smallest]->get_header() << endl; + // if (total_points.size() % 100 == 0) { + // cout << "Size: " << total_points.size() << endl; + // } + good.erase(smallest); + working = true; + } + assert(good.empty()); + assert(points.size() == total_points.size()); + points = total_points; + cout << "Done" << endl; +} + + + +template +void calculate_gaps(const vector*> &vec, queue &gaps, std::function&, const Point&)> func) +{ + for (int i = 1; i < vec.size(); i++) { + if (func(*vec[i], *vec[i-1])) { + gaps.push(i); + } + } +} + +// TODO: fix bounds +template +pair find_bound(int idx, const vector*> &vec, double sim) +{ + size_t begin_len = vec[idx]->get_length(); + int begin_idx = idx; + int end_idx = idx; + #pragma omp parallel for + for (int j = 0; j < 2; j++) { + if (j == 1) { + for (int i = idx - 1; i >= 0; i--) { + if (vec[i]->get_id() == 0) { + size_t len = vec[i]->get_length(); + if (begin_len < sim * len) { + break; + } + begin_idx = i; + } + } + } else { + for (int i = idx + 1; i < vec.size(); i++) { + 
if (vec[i]->get_id() == 0) { + size_t len = vec[i]->get_length(); + if (len < sim * begin_len) { + break; + } + end_idx = i; + } + } + } + } + if (begin_idx < end_idx) { + return make_pair(begin_idx, end_idx); + } else { + return make_pair(0, vec.size() - 1); + } +} + +template +vector > get_available_or_min(const vector*> &points, Point* p, pair bounds, const Trainer& trn, bool& used_min) +{ + vector*,int> > good; + for (int i = bounds.first; i <= bounds.second; i++) { + if (points[i]->get_id() == 0) { + good.push_back(make_pair(points[i], i)); + } + } + bool f; + vector > close;// = trn.get_close(p, good, f); + used_min = f; + return close; + // if (used_min) { + // used_min = true; + // // find min + // uintmax_t minimum = std::numeric_limits::max(); + // int min_index = -1; + // vector > v; + // for (int i = 0; i < good.size(); i++) { + // uintmax_t u = p->distance(*good[i].first); + // if (u < minimum) { + // min_index = good[i].second; + // minimum = u; + // } + // } + // //std::cout << "none found, using minimum, dist = " << 1.0 - (double)minimum / 10000 << " " << points[min_index]->get_header() << endl; + // // v.push_back(make_pair(min_index, minimum)); + // v.push_back(make_pair(close[0], p->distance(*points[close[0]]))); + // return v; + // } else + // if (!close.empty()) { + // vector > v(close.size()); + // #pragma omp parallel for + // for (int i = 0; i < close.size(); i++) { + // uintmax_t u = p->distance(*points[close[i]]); + // v.at(i) = make_pair(close[i], u); + // } + // return v; + // } else { + // vector > v; + // return v; + // } +} + +template +void mean_shift_update(vector > &part, int j, const Trainer& trn, int delta) +{ + auto center = part[j].getCenter(); + + int i_begin = std::max(0, j - delta); + int i_end = std::min(j + delta, (int)part.size()-1); + // if (i_begin == i_end) { + // return; + // } + Point* top = center->create_double(); + top->zero(); + Point* temp = top->clone(); + uintmax_t bottom = 0; + vector*, bool> > good; + for 
(int i = i_begin; i <= i_end; i++) { +// const auto& vec = part.at(centers[i]); + const auto& vec = part[i].getPoints(); + for (auto p : vec) { + good.push_back(make_pair(p, false)); + // p->set_arg_to_this_d(*temp); + // *top += *temp; + // bottom++; + } + } + trn.filter(center, good); + if (!good.empty()) { + for (auto p : good) { + p.first->set_arg_to_this_d(*temp); + *top += *temp; + bottom++; + } + *top /= bottom; + Point* next = trn.closest(top, good); + // Point *next = NULL; + // int next_dist = std::numeric_limits::max(); + // for (int i = 0; i < N; i++) { + // int dist = points[i]->distance_d(*top); + // if (dist < next_dist) { + // next_dist = dist; + // next = points[i]; + // } + // } + if (next != NULL) { + center->set(*next); + center->set_data_str(next->get_data_str()); + } else { + cerr << "mean shift: NULL" << endl; + } + } else { + cout << "GOOD: EMPTY" << endl; + } + delete top; + delete temp; +} + +template +Point* get_mean(vector*> &available, Point& last, double bandwidth) +{ + Point* top = last.create_double(); + top->zero(); + Point* temp = top->clone(); + double bottom = 0; + const int N = available.size(); + if (N == 0) { + throw "N cannot be 0, bad"; + } + bottom = available.size(); + // TODO: parallelize this loop + for (int i = 0; i < N; i++) { + available[i]->set_arg_to_this_d(*temp); + *top += *temp; + } + + if (bottom != 0) { + *top /= bottom; + } else { + cerr << "No points in vector" << endl; + throw 5; + } +#pragma omp declare reduction(cmin:std::pair*,double>: \ + omp_out = omp_in.second < omp_out.second ? 
omp_in : omp_out ) \ + initializer (omp_priv = std::make_pair((Point*)NULL, (double)std::numeric_limits::max())) \ + + std::pair*,double> result = std::make_pair((Point*)NULL, (double)std::numeric_limits::max()); + //todo: add pragma back in +#pragma omp parallel for reduction(cmin:result) + for (int i = 0; i < available.size(); i++) { + double dist = available[i]->distance_d(*top); + if (dist < result.second) { + result = std::make_pair(available[i], dist); + } + } + delete top; + delete temp; + if (result.first == NULL && !available.empty()) { + throw "not working"; + } + return result.first; +} + +template +bool merge(vector > ¢ers, const Trainer& trn, int delta, int bandwidth) +{ + int num_merge = 0; + for (int i = 0; i < centers.size(); i++) { + long ret = trn.merge(centers, i, i + 1, std::min((int)centers.size()-1, i + delta)); + if (ret > i) { + + num_merge++; + auto &to_add = centers[ret].getPoints(); + auto &to_del = centers[i].getPoints(); + to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); + centers[i].lazy_remove(); + } + // vector*,double> > to_merge; + // for (int j = i + 1; j < std::min((int)centers.size(), i + 1 + delta); j++) { + // to_merge.push_back(std::make_pair(centers[j].getCenter(), -1)); + // } + // Point* closest = trn.merge(centers[i].getCenter(), to_merge); + // if (closest != NULL) { + // #ifdef DEBUG + // cout << "Merged center " << centers[i]->get_header() << " and " << closest->get_header() << endl; + // #endif + // num_merge++; + // // auto& to_del = partition[centers[i]]; + // // auto& to_add = partition[closest]; + // // to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); + // // partition.erase(centers[i]); + // // centers[i]->set_to_delete(true); + // auto& to_del = partition[centers[i]]; + // auto& to_add = partition[closest]; + // to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); + // partition.erase(centers[i]); + // centers[i]->set_to_delete(true); + + // } + } + 
//cout << "Merged " << num_merge << " centers" << endl; + centers.erase(std::remove_if(centers.begin(), centers.end(), [](const Center& p) { + return p.is_delete(); + }), centers.end()); + return num_merge > 0; +} + +template +void print_output(const string& output, vector > & partition) +{ + cout << "Printing output" << endl; + std::ofstream ofs; + ofs.open(output, std::ofstream::out); + int counter = 0; + for (auto& cen : partition) { + if (cen.empty()) { + continue; + } + ofs << ">Cluster " << counter << endl; + int pt = 0; + bool cen_found = false; + for (auto p : cen.getPoints()) { + if (p->get_id() == cen.getCenter()->get_id()) { + cen_found = true; + break; + } + } + if (!cen_found) { + cout << "Center not found" << endl; + cout << "Cluster " << counter << " has center " << cen.getCenter()->get_header() << endl; + // cen.getCenter()->set(*cen.getPoints().at(0)); + } + for (auto p : cen.getPoints()) { + string s = p->get_header(); + ofs << pt << "\t" << p->get_length() << "nt, " << s << "... 
"; + if (p->get_id() == cen.getCenter()->get_id()) { + ofs << "*"; + } + ofs << endl; + pt++; + } + counter++; + } + ofs.close(); +} + +template +void sort(vector*> &points, vector*> ¢ers, int bandwidth, double sim, const Trainer& trn, string output_file, int iter, int delta) +{ + int cur = 0; + points[0]->set_id(points.size()); + cur++; + int last = 0; + vector v; + using partition = map*, vector*> >; + partition part; + centers.push_back(points.front()->clone()); + part[centers.front()].push_back(points.front()); + + while (true) { + pair bounds = find_bound(last, points, sim); + bool used_min; + auto available = get_available_or_min(points, points[last], bounds, trn, used_min); + // std::sort(available.begin(), available.end(), + // [](const pair a, const pair b) { + // return a.second < b.second; + // }); +// auto available = trn.get_close(points[last], points, bounds, used_min); + if (available.empty()) { + break; + } +// used_min = used_min && !v.empty(); + if (used_min) { + if (!v.empty()) { + auto c = points[last]->clone(); + centers.push_back(c); + for (auto idx : v) { + part[c].push_back(points[idx]); + } + v.clear(); + } + last = available.back().first; + } + for (auto pr : available) { + if (cur % 10000 == 0) { + cout << "Placed " << cur << endl; + } + points[pr.first]->set_id(cur); + v.push_back(pr.first); + cur++; + } + if (!used_min) { + last = get_mean(v, *points[last], points, bandwidth); + } + } + auto c = points[last]->clone(); + centers.push_back(c); + for (auto idx : v) { + part[c].push_back(points[idx]); + } + points[0]->set_id(0); + centers[0]->set_id(0); + + cout << "Found " << centers.size() << " initial centers" << endl; + assert(centers.size() == part.size()); + std::sort(points.begin(), points.end(), [](const Point* a, const Point* b) { + return a->get_id() < b->get_id(); + }); + std::sort(centers.begin(), centers.begin(), [](const Point* a, const Point* b) { + return a->get_id() < b->get_id(); + }); + // for (int i = 0; i < 
points.size(); i++) { + // cout << points[i]->get_header() << " "; + // if (i == 0) { + // cout << endl; + // continue; + // }; + // int last_dist = points[i]->distance(*points[i-1]); + // cout << last_dist << endl; + // } + Point* lastp = NULL; + for (auto c : centers) { + auto v = part[c]; + for (auto p : v) { + cout << p->get_header() << " "; + cout << c->get_header() << " "; + cout << p->distance(*c) << " "; + if (lastp == NULL) { + cout << endl; + } else { + cout << p->distance(*lastp) << endl; + } + lastp = p; + } + } + for (int i = 0; i < iter; i++) { + print_output(output_file + to_string(i), part); + cout << "Mean shift iteration " << i << endl; + #pragma omp parallel for + for (int j = 0; j < centers.size(); j++) { + mean_shift_update(part, centers, j, trn, delta); + } + merge(centers, part, trn, delta, bandwidth); + for (auto const& kv : part) { + if (kv.second.empty()) { + cerr << "Empty cluster " << kv.first->get_header() << endl; + throw 0; + } + } + } + for (int j = 0; j < centers.size(); j++) { + mean_shift_update(part, centers, j, trn, 0); + } + print_output(output_file, part); +} + + +/* + * Accumulates points in a center until none are close, + * then returns the next center (not cloned) + */ +template +size_t accumulate(Point** last_ptr, bvec &points, vector > ¢ers, + const Trainer& trn, double sim, double bandwidth, int total_iter) +{ + Point* last = *last_ptr; + vector*> current = {last}; + bool is_min = false; + + for (int num_iter=0; !is_min; num_iter++) { + #ifdef DEBUG + cout << num_iter << " last: " << last->get_header() << endl; + #endif + auto len = last->get_length(); + auto bounds = points.get_range(len * sim, len / sim); + auto result = trn.get_close(last, + points.iter(bounds.first), + points.iter(bounds.second), + is_min); + + if (is_min) { + Point* new_pt = get<0>(result); + // cout << "minimum point: " << new_pt->get_header() << endl; + size_t r = get<2>(result); + size_t c = get<3>(result); + #ifdef DEBUG + cout << "center 
added" << endl; + #endif + // no close points left for center, + // returned value is the next center (return this) + //points.remove_available(bounds.first, bounds.second, newvec); + if (new_pt == NULL) { + // No points left in range, try 1st point + *last_ptr = points.pop(); + } else { + // New center + *last_ptr = new_pt; + points.erase(r, c); + } + vector*> newvec; + points.remove_available(bounds.first, bounds.second, newvec); // DEBUGGING USE ONLY + if (!newvec.empty()) { + throw "this should never happen"; + } + } else { // keep adding points, find new mean + size_t prev_size = current.size(); + points.remove_available(bounds.first, bounds.second, current); + + last = get_mean(current, *last, bandwidth); + size_t added_size = current.size() - prev_size; + #ifdef DEBUG + cout << "added new points (" << added_size << ")" << endl; + #endif + if (last == NULL) { + cerr << "Last is null" << endl; + throw 100; + } + } + } +// cout << "Pushed back center " << last->get_header() << endl; + Center cc(last, current); + centers.push_back(cc); +// Center cen(last, current); +// centers.emplace_back(last, current); + // Point* center = last->clone(); + // centers.push_back(center); + // part[center] = current; + #ifdef DEBUG + for (auto p : current) { + cout << total_iter << " Cluster " << last->get_header() << ": " << p->get_header() << endl; + } + #endif + // if (points.empty()) { + // return true; + // } else { + // return false; + // } + return current.size(); +} + + +template +void ClusterFactory::MS(bvec &points, T bandwidth, double sim, const Trainer& trn, string output, int iter, int delta) +{ + vector > part; +// using partition = map*, vector*> >; +// partition part; + + Point* last = points.pop(); + //cout << "First length: " << last->get_length() << endl; + Progress pa(points.size(), "Accumulation"); + for (int num = 0; last != NULL; num++) { + size_t n = accumulate(&last, points, part, trn, sim, bandwidth, num); + pa += n; + } + pa.end(); +// points.check(); 
+ size_t total = 0; + for (auto cen : part) { + total += cen.getPoints().size(); + } + cout << "total size: " << total << endl; + Progress pu(iter, "Update"); + for (int i = 0; i < iter; i++) { + // #ifdef DEBUG + //print_output(output + to_string(i), part); + // #endif + //cout << "Mean shift iteration " << i << endl; + #pragma omp parallel for + for (int j = 0; j < part.size(); j++) { + mean_shift_update(part, j, trn, delta); + } + merge(part, trn, delta, bandwidth); + pu++; + } + + #pragma omp parallel for + for (int j = 0; j < m_centers.size(); j++) { + mean_shift_update(part, j, trn, 0); + } + pu.end(); + print_output(output, part); +} + +/* + * This uses a callback to specify the specific type of point. + * + * To call this, use like: + * + * factory.build_points("input", &ClusterFactory::get_histogram); + */ +template +std::vector*> ClusterFactory::build_points(vector fileList, std::function*(ChromosomeOneDigit *)> get_point) +{ + std::vector*> points; + std::vector*> cpoints; + unsigned fsize = fileList.size(); + std::vector*> initial_centers; + std::stringstream buffer; + buffer << "Counting " << k << "-mers"; + Progress p(fsize, buffer.str()); + for (unsigned i = 0; i < fsize; i++) { + p++; + ChromListMaker *maker = new ChromListMaker(fileList.at(i)); + const std::vector * chromList = maker->makeChromOneDigitList(); + unsigned csize = chromList->size(); +#pragma omp parallel for ordered + for (unsigned h = 0; h < csize; h++) { + ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(h)); + if (chrom) { + Point *h = get_point(chrom); + if (h != NULL) { +#pragma omp ordered + { + // cout << "Header: " << h->get_header() << endl; + points.push_back(h); + } + } + } else { + throw InvalidStateException(string("Dynamic cast failed")); + } + } + delete maker; + } + return points; +// std::random_shuffle(points.begin(), points.end()); +// queue gaps; +// calculate_gaps(points, gaps, func); + // for (int i = 1; i < points.size(); i++) { + // int la = 
points[i]->get_length(); + // int lb = points[i-1]->get_length(); + // if (lb > la && 100.0 * la / lb < sim) { + // gaps.push(i); + // } + // } + + +// vector*>> p; +// vector*> tmp; +// tmp.push_back(points[0]); +// for (int j = 1; j < points.size(); j++) { + +// int la = points[j]->get_length(); +// int lb = points[j-1]->get_length(); +// assert(lb >= la); +// if (lb > la && 100.0 * la / lb < sim) { +// p.push_back(tmp); +// cout << "Gap " << tmp.size() << endl; +// tmp.clear(); +// } +// tmp.push_back(points[j]); +// } +// if (!tmp.empty()) { +// p.push_back(tmp); +// } + +// // calculate_distances(points); +// int idx = 0; +// for (auto &c : p) { +// sort_nn_func(c, func); +// for (auto v : c) { +// v->set_id(idx++); +// cpoints.push_back(v); +// } +// } + + // sort_nn_func(points, + // [&](const Point&a, const Point&b) { + // int la = a.get_length(); + // int lb = b.get_length(); + // return lb > la && 100.0 * la / lb < sim; + // }, + // [](const Point& a, const Point& b) { + // return a.distance_k1(b); + // }); + + + // // for(auto p : points){ + // // cout << p->get_header() << endl; + // // } + + + + // sort_nn_func(points, + // [&](const Point& a, const Point& b) { + // int la = a.get_length(); + // int lb = b.get_length(); + // if (lb > la && 100.0 * la / lb < sim) { + // double mono = a.distance_k1(b) * 100; + // bool q = mono < sim; + // /* + // if (q) { + // cout << "TRUE" << endl; + // } else { + // cout << "FALSE"<< endl; + // } + // */ + // return q; + // } else { + // return false; + // } + // }, + // [](const Point& a, const Point& b) { + // return a.distance(b); + // }); + // uint64_t idx = 0; + // for (auto v : points) { + // v->set_id(idx++); + + // cpoints.push_back(v); + // } + // cout << "Points: " << cpoints.size() << endl; + + + // for (int i = 0; i < points.size(); i++) { + // cout << points[i]->get_header(); + // if (i > 0) { + // cout << " " << points[i]->distance(*points[i-1]); + // } + // cout << endl; + // } + + + + // for (int i = 
0; i < points.size(); i++) { + // points[i]->set_id(i); + // cpoints.push_back(points[i]); + // assert(cpoints[i]->get_id() == i); + // } + return points; +} + + +// consider all from 'to', distances[].size must be >= to.size() +template +Point* find_nearest(const std::vector*> &to, + vector &good, const std::vector*> &from, + std::vector* distances, int& last_idx) +{ + // Step 1. Fill the closest distance list + int best_dist = 0; + Point* best_pt = NULL; + std::vector::iterator best_idx; + last_idx %= to.size(); + for (auto idx = good.begin(); idx != good.end(); idx++) { + int i = *idx; + distances[last_idx][i] = to[last_idx]->distance(*from[i]); + int dist = 0; + for (int j = 0; j < to.size(); j++) { + dist += distances[j][*idx]; + } + if (best_pt == NULL || dist < best_dist) { + best_pt = from[i]; + best_dist = dist; + best_idx = idx; + } + } + cout << "Dist: " << best_dist << endl; + last_idx++; + good.erase(best_idx); + return best_pt; +} + +template +void ClusterFactory::sort_nn(std::vector *> &points, Point* nearest_to, int arg) const +{ + + if (points.empty()) { + return; + } + cout << "Sorting points... 
"; + cout.flush(); + vector good; + int good_idx = points.size() - 1; + for (int i = 0; i < points.size(); i++) { + if (nearest_to != NULL && nearest_to == points[i]) { + good_idx = i; + } + good.push_back(i); + } + vector*> total_points; + total_points.push_back(points[good_idx]); + good.erase(good.begin() + good_idx); + bool working = true; + while (working && good.size() > 0) { + working = false; + auto last = total_points.back(); + auto smallest = good.begin(); + T small_dist = points[*smallest]->distance(*last); + for (auto i = good.begin(); i != good.end(); i++) { + T dist = points[*i]->distance(*last);// / points[*i]->prob_under(*last); + if (dist < small_dist) { + small_dist = dist; + smallest = i; + } + } + total_points.push_back(points[*smallest]); + // cout << points[*smallest]->get_header() << endl; + if (total_points.size() % 100 == 0) { + cout << "Size: " << total_points.size() << endl; + } + good.erase(smallest); + working = true; + } + assert(good.empty()); + assert(points.size() == total_points.size()); + points = total_points; + cout << "Done" << endl; +} + +template +Point *ClusterFactory::get_divergence_point(ChromosomeOneDigit *chrom) +{ + if (chrom == NULL) { + return NULL; + } + KmerHashTable table(k, 1); + KmerHashTable table_k1(1, 0); + std::vector values; + vector values_k1; + values.clear(); + fill_table(table, chrom, values); + fill_table(table_k1, chrom, values_k1); +// int tmplate = get_template(chrom->getHeader(), templates); + Point *p = new DivergencePoint(values, chrom->size()); +// cout << "mag: " << ((DivergencePoint*)p)->getPseudoMagnitude() << std::endl; + p->set_1mers(values_k1); + p->set_header(chrom->getHeader()); + p->set_length(chrom->getBase()->length()); + p->set_data_str(*chrom->getBase()); + return p; +} + + +template +Point *ClusterFactory::get_histogram(ChromosomeOneDigit *chrom) +{ + if (chrom == NULL) { + return NULL; + } + KmerHashTable table(k, 0); + std::vector values; + values.clear(); + fill_table(table, 
chrom, values); +// int tmplate = get_template(chrom->getHeader(), templates); +// Point *p = new Histogram(values); + Point *p = new DivergencePoint(values, chrom->size()); + p->set_header(chrom->getHeader()); + p->set_length(chrom->getBase()->length()); + return p; +} + +template +T ClusterFactory::find_h(const std::vector*> ¢ers) const +{ + int size = centers.size(); + T div = 0; + int num_divergence = 0; + vector divs; + for (int i = 0; i < size; i++) { + for (int j = 0; j < size; j++) { + if (j == i) { continue; } + divs.push_back(centers[i]->distance(*centers[j])); +// num_divergence++; + } + } + std::sort(divs.begin(), divs.end()); + int end = divs.size() / 50; + for (int i = 0; i < end; i++) { + div += divs[i]; + } + return div / end / 100; + if (divs.size() % 2 == 0) { + return (divs[divs.size()/2 - 1] + divs[divs.size()/2]) / 2; + } else { + return divs[divs.size()/2]; + } +} +/* +template +std::vector *> ClusterFactory::get_centers(const std::vector *> &points) +{ + std::vector*> centers; + for (typename std::vector*>::const_iterator it = points.begin(); it != points.end(); ++it) { + Point *p = *it; + if (choose_center(*p)) { + centers.push_back(p->clone()); + } + } + + return centers; +} +*/ +#ifndef HEADER_HACK +template class ClusterFactory; +template class ClusterFactory; +template class ClusterFactory; +template class ClusterFactory; +template class ClusterFactory; +template class ClusterFactory; + +#endif diff --git a/src/cluster/src/ClusterFactory.h b/src/cluster/src/ClusterFactory.h new file mode 100644 index 0000000..12180c9 --- /dev/null +++ b/src/cluster/src/ClusterFactory.h @@ -0,0 +1,82 @@ +/* -*- C++ -*- + * + * ClusterFactory.h + * + * Author: Benjamin T James + */ + +#ifndef CLUSTERFACTORY_H +#define CLUSTERFACTORY_H + + +#include +#include +#include +#include +#include "../../nonltr/ChromosomeOneDigit.h" +#include "../../nonltr/KmerHashTable.h" +#include "Point.h" +#include "Trainer.h" +#include "bvec.h" + +template +class ClusterFactory 
{ +public: + ClusterFactory(int k_len, int npp=std::numeric_limits::max()) : k(k_len), num_per_partition(npp) {} + std::vector*> build_points(vector files, std::function*(ChromosomeOneDigit*)> get_point); + Point* get_histogram(ChromosomeOneDigit *chrom); + Point* get_divergence_point(ChromosomeOneDigit *chrom); + T find_h(const std::vector*> ¢ers) const; + void sort_nn(std::vector*> &points, Point* nearest_to=NULL, int arg=3) const; + void MS(bvec &points, T bandwidth, double sim, const Trainer& trn, string output, int iter, int delta); +private: + vector lookup_table; + vector*> m_centers; + const int num_per_partition; + int k; + //void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values); +}; + +template +void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values) +{ + const int k = table.getK(); + auto segment = chrom->getSegment(); + const char *seg_bases = chrom->getBase()->c_str(); + for (vector *v : *segment) { + int start = v->at(0); + int end = v->at(1); + table.wholesaleIncrement(seg_bases, start, end - k + 1); + } + unsigned long tableSize = table.getMaxTableSize(); + values.reserve(values.size() + tableSize); + const V * valueArray = table.getValues(); + std::copy(&valueArray[0], &valueArray[tableSize], std::back_inserter(values)); +} +// template +// void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values) +// { +// const int k = table.getK(); +// auto segment = chrom->getSegment(); +// const char *seg_bases = chrom->getBase()->c_str(); +// for (vector *v : *segment) { +// int start = v->at(0); +// int end = v->at(1); +// table.wholesaleIncrement(seg_bases, start, end - k + 1); +// } +// std::vector *keys = table.getKeys(); +// for (std::string str : *keys) { +// values.push_back(table.valueOf(str.c_str())); +// } +// keys->clear(); +// delete keys; +// } + +#ifdef HEADER_HACK +#ifndef CLUSTERFACTORY_C +#define CLUSTERFACTORY_C +#include "ClusterFactory.cpp" +#endif 
+#endif + +#endif diff --git a/src/cluster/src/DivergencePoint.cpp b/src/cluster/src/DivergencePoint.cpp new file mode 100644 index 0000000..70e4e2d --- /dev/null +++ b/src/cluster/src/DivergencePoint.cpp @@ -0,0 +1,284 @@ +/* -*- C++ -*- + * + * DivergencePoint.cpp + * + * Author: Benjamin T James + * + * Main histogram type, includes distance() which is intersection() in Feature.cpp + */ +#include "DivergencePoint.h" +#include +#include +#include +#include + + +template +double DivergencePoint::prob_under(Point &p) const +{ + const DivergencePoint& c = dynamic_cast&>(p); + double sum = 0; + const size_t s = points.size(); + double total = 0; + std::feclearexcept(FE_OVERFLOW); + std::feclearexcept(FE_UNDERFLOW); + for (int i = 0; i < s; i++) { + sum += c.points[i]; + if (i % 4 == 3) { + for (int j = i - 3; j <= i; j++) { + double prob = c.points[j] / sum; + double log_prob = log(prob); + total += (points[j] - 1) * log_prob; + if ((bool)std::fetestexcept(FE_UNDERFLOW)) { + cout << "Underflow!" 
<< endl; + } + // cond.push_back(log(prob)/log4); + } + sum = 0; + } + } + // for (size_t q = 0; q < s; q += 4) { + // double sum = 0; + // for (int i = q; i < q + 4; i++) { + // sum += c.points[i]; + // } + // for (int i = q; i < q + 4; i++) { + // double prob = c.points[i] / sum; + // double log_prob = log(prob); + // total += (points[i] - 1) * log_prob; + // } + // } + return exp(total / s); +} + +template +double DivergencePoint::distance_d(Point& p) const +{ + const DivergencePoint& c = dynamic_cast&>(p); + uint64_t dist = 0; + uint64_t mag = 0; + for (auto i = 0; i < points.size(); i++) { + dist += 2 * min(points[i],(T)c.points[i]); + mag += points[i] + c.points[i]; + } + double frac = (double)dist / mag; + return 10000.0 * (1.0 - frac * frac); +} + + +template +uint64_t DivergencePoint::distance(const Point& p) const +{ + const DivergencePoint& c = dynamic_cast&>(p); + uint64_t dist = 0; + const uint64_t mag = getPseudoMagnitude() + c.getPseudoMagnitude(); + #pragma omp simd + for (auto i = 0; i < points.size(); i++) { + dist += min(points[i], c.points[i]); + } + dist *= 2; + double frac = (double)dist / mag; + return 10000.0 * (1.0 - frac * frac); +} + +template +double DivergencePoint::distance_k1(const Point &p) const +{ + uint64_t dist = 0; + + auto a = Point::get_1mers(), b = p.get_1mers(); + uint64_t mag = 0; + for (auto i = 0; i < 4; i++) { + dist += std::min(a[i], b[i]); + mag += a[i]; + } + return (double)dist / (double)mag; + +} +template +DivergencePoint::DivergencePoint(const std::vector& pts, uint64_t len) +{ + mag = 0; + points = pts; + for (unsigned int i = 0; i < pts.size(); i++) { + mag += pts.at(i); + } +// display(); + nucl_length = len; + to_delete = false; + id = 0; +} + + +template +DivergencePoint::DivergencePoint(unsigned int size) +{ + for (unsigned int i = 0; i < size; i++) { + points.push_back(0); + } + to_delete = false; + nucl_length = 0; + id = 0; +} + +template +void DivergencePoint::operator*=(double d) +{ + unsigned int size 
= points.size(); + for (auto& pt : points) { + pt *= d; + } +} + +template +bool DivergencePoint::operator<(Point& p) const +{ + const DivergencePoint& h = dynamic_cast&>(p); + unsigned int size = std::min(points.size(),h.points.size()); + /*int boundary = 0; + for (unsigned int i = 0; i < size; i++) { + if (points.at(i) > h.points.at(i)) { + boundary++; + } else if (points.at(i) < h.points.at(i)) { + boundary--; + } + } + return boundary < 0;*/ + for (unsigned int i = 0; i < size; i++) { + if (points.at(i) >= h.points.at(i)) { + return false; + } + } + return true; +} + +template +void DivergencePoint::operator/=(double d) +{ + unsigned int size = points.size(); + for (unsigned int i = 0; i < size; i++) { + points[i] /= d; + } +// cout << endl; +} + +template +void DivergencePoint::operator+=(Point& p) +{ + const DivergencePoint& h = dynamic_cast&>(p); + unsigned int size = std::min(points.size(),h.points.size()); + for (unsigned int i = 0; i < size; i++) { + points.at(i) += h.points.at(i); + } +} + +template +uint64_t DivergencePoint::operator-(const Point& p) const +{ + return distance(p); +} + +template +void DivergencePoint::set(Point& p) +{ + const DivergencePoint& h = dynamic_cast&>(p); + points = std::vector(h.points); + set_length(h.get_length()); + to_delete = h.to_delete; + Point::set_header(h.get_header()); + set_id(h.get_id()); +} + +template +void DivergencePoint::display() const +{ + unsigned size = points.size(); + for (unsigned i = 0; i < size; i++) { + std::cout << points.at(i) << " "; + } + std::cout << std::endl; +} + +template +void DivergencePoint::zero() +{ + for (auto &i : points) { + i = 0; + } +} + +template +void DivergencePoint::addOne() +{ + for (auto& a : points) { + a++; + } +} + +template +void DivergencePoint::subOne() +{ + for (auto& a : points) { + a--; + } +} + +/* + * p(y|x) = cond_p + * q(y|x) = cond_p + */ +template +double DivergencePoint::divergence(Point& p) const +{ + const DivergencePoint& d = dynamic_cast&>(p); + T 
sum4_p = 0, sum4_q = 0; // Sum for every 4 nucleotides + double total_sum_p = 0, total_sum_q = 0; // Total running sum of all nucleotides + double outer_sum_p = 0, outer_sum_q = 0; // Prior K-mer sum + for (int i = 0; i < points.size(); i++) { // Compute divergence for P and Q simultaneously + sum4_p += points[i]; + sum4_q += d.points[i]; + if (i % 4 == 3) { //finished counting word, now compute probabilities + double inner_sum_p = 0; // Sum of p(X|Y) * log(p(X|Y) / q(X|Y)) + double inner_sum_q = 0; // Sum of q(X|Y) * log(q(X|Y) / p(X|Y)) + for (int j = i - 3; j <= i; j++) { + double conditional_p = points[j] / sum4_p; + double conditional_q = d.points[j] / sum4_q; + double lg = log(conditional_p) - log(conditional_q); + inner_sum_p += conditional_p * lg; + inner_sum_q += -1 * conditional_q * lg; + } + outer_sum_p += sum4_p * inner_sum_p; + outer_sum_q += sum4_q * inner_sum_q; + + total_sum_p += sum4_p; + total_sum_q += sum4_q; + sum4_p = 0; + sum4_q = 0; + } + } + double left = outer_sum_p / total_sum_p; + double right = outer_sum_q / total_sum_q; + return (left + right) / 2.0; +} + +template +uint64_t DivergencePoint::getPseudoMagnitude() const +{ + return mag; +} + + +template +uint64_t DivergencePoint::getRealMagnitude() const +{ + return mag - points.size(); +} + +#ifndef HEADER_HACK +template class DivergencePoint; +template class DivergencePoint; +template class DivergencePoint; +template class DivergencePoint; +template class DivergencePoint; +template class DivergencePoint; +#endif diff --git a/src/cluster/src/DivergencePoint.h b/src/cluster/src/DivergencePoint.h new file mode 100644 index 0000000..087bff1 --- /dev/null +++ b/src/cluster/src/DivergencePoint.h @@ -0,0 +1,89 @@ +/* -*- C++ -*- + * + * DivergencePoint.h + * + * Author: Benjamin T James + * + * Header for most often used k-mer histogram type + */ +#ifndef DIVERGENCE_POINT_H +#define DIVERGENCE_POINT_H +#include "Point.h" +#include +template +class DivergencePoint : public Point { +public: + 
DivergencePoint(const std::vector& pts, uint64_t len); + DivergencePoint(unsigned int size); + ~DivergencePoint() { points.clear(); } + void operator*=(double d); + void operator/=(double d); + uint64_t operator-(const Point& p) const; + bool operator<(Point& p) const; + void operator+=(Point& p); + void set(Point& p); + void display() const; + void zero(); + void addOne(); + void subOne(); + double prob_under(Point& p) const; + uint64_t getRealMagnitude() const; + uint64_t getPseudoMagnitude() const; +// T magnitude() const { return getRealMagnitude(); }; + double distance_k1(const Point& p) const; + double get_stddev() const { return s_dev; }; + DivergencePoint* clone() const { + auto d = new DivergencePoint(points, to_delete); + d->set_header(Point::get_header()); + d->set_id(get_id()); + d->set_length(get_length()); + d->set_stddev(get_stddev()); + return d; + } + DivergencePoint* create() const { + return new DivergencePoint(points.size()); + } + Point* create_double() const { + vector v; + for (auto val : points) { + v.push_back(val); + } + return new DivergencePoint(v, nucl_length); + } + void set_arg_to_this_d(Point& p) const { + DivergencePoint& c = dynamic_cast< DivergencePoint&>(p); + for (int i = 0; i < points.size(); i++) { + c.points[i] = points[i]; + } + c.set_id(id); + }; + + + bool is_to_delete() const { + return to_delete; + } + void set_to_delete(bool b) { + to_delete = b; + } + double divergence(Point& p) const; + double distance_d(Point& p) const; + uint64_t distance(const Point& p) const; + const vector& get_data() const { return points; } + void set_id(uintmax_t c_id) { id = c_id; }; + const uintmax_t get_id() const { return id; }; + + void set_length(unsigned long len) { nucl_length = len; }; + void set_stddev(double s_dev_) { s_dev = s_dev_; }; + unsigned long get_length() const { return nucl_length; }; + unsigned long size() const { return points.size(); }; + std::vector points; + +private: + uintmax_t mag; + bool to_delete; + uint64_t id; 
+ uint64_t nucl_length; + double s_dev; +}; + +#endif diff --git a/src/cluster/src/Feature.cpp b/src/cluster/src/Feature.cpp new file mode 100644 index 0000000..67baf50 --- /dev/null +++ b/src/cluster/src/Feature.cpp @@ -0,0 +1,1823 @@ +/* -*- C++ -*- + * + * Feature.cpp + * + * Author: Benjamin T James + * + * Raw feature methods are here. + * Duplicates exist of many of the functions + * exist because I was lazy and couldn't get + * anonymous functions to work with the hashing + */ +#include "Feature.h" +#include "DivergencePoint.h" +#include +#include +#include +#include +#include "../../utility/GlobAlignE.h" + + +template +Feature::Feature(const Feature& feat_) : k(feat_.get_k()) +{ + flags = feat_.get_flags(); + mins = feat_.get_mins(); + maxs = feat_.get_maxs(); + is_sims = feat_.get_sims(); + combos = feat_.get_combos(); + lookup = feat_.get_lookup(); + is_finalized = feat_.get_finalized(); + do_save = false; + auto freverse = [](int idx, int k) { + int sum = 0; + for (int i = 0; i < k; i++) { + int rem = idx % 4; + idx /= 4; + sum = 4 * sum + rem; + + } + return sum; + }; + auto freverse_complement = [](int idx, int k) { + std::vector v; + for (int i = 0; i < k; i++) { + v.push_back(3 - idx % 4); + idx /= 4; + } + int sum = 0; + for (auto val : v) { + sum = 4 * sum + val; + } + return sum; + }; + for (auto f : lookup) { + raw_funcs.push_back(get_func(f)); + } +} + + +// void * __gxx_personality_v0=0; +// void * _Unwind_Resume =0; +template +Feature Feature::operator=(const Feature& feat_) +{ + k = feat_.get_k(); + flags = feat_.get_flags(); + mins = feat_.get_mins(); + maxs = feat_.get_maxs(); + is_sims = feat_.get_sims(); + combos = feat_.get_combos(); + lookup = feat_.get_lookup(); + is_finalized = feat_.get_finalized(); + do_save = false; + auto freverse = [](int idx, int k) { + int sum = 0; + for (int i = 0; i < k; i++) { + int rem = idx % 4; + idx /= 4; + sum = 4 * sum + rem; + + } + return sum; + }; + auto freverse_complement = [](int idx, int k) { + 
std::vector v; + for (int i = 0; i < k; i++) { + v.push_back(3 - idx % 4); + idx /= 4; + } + int sum = 0; + for (auto val : v) { + sum = 4 * sum + val; + } + return sum; + }; + for (auto f : lookup) { + raw_funcs.push_back(get_func(f)); + } + return *this; +} + +template +void Feature::add_feature(uint64_t f_flags, Combo combo) +{ +// cout << "Adding combo " << f_flags << endl; + if (combo != Combo::xy && combo != Combo::x2y && combo != Combo::xy2 && combo != Combo::x2y2) { + throw "invalid combo"; + } + vector indices; + for (uint64_t f = 1; f <= f_flags; f = (f << 1)) { + // it is in the new parameter but not currently in store + if ((f_flags & f) != 0) { + if ((flags & f) == 0) { + lookup.push_back(f); + raw_funcs.push_back(get_func(f)); + //cout << "new single feature " << f << endl; + mins.push_back(std::numeric_limits::max()); + maxs.push_back(std::numeric_limits::min()); + is_sims.push_back(feat_is_sim(f)); + is_finalized.push_back(false); + flags |= f; + } + indices.push_back(index_of(f)); + } + } + combos.push_back(std::make_pair(combo, indices)); +} + +template +void Feature::finalize() +{ + for (size_t i = 0; i < is_finalized.size(); i++) { + is_finalized[i] = true; + } +} +template +void Feature::normalize_cache(vector &cache) const +{ + for (size_t i = 0; i < lookup.size(); i++) { + double val = (cache[i] - mins[i]) / (maxs[i] - mins[i]); + if (is_sims[i]) { + cache[i] = val; + } else { + cache[i] = 1 - val; + } + } +} +template +vector Feature::compute_all_raw(Point &p, Point &q) +{ + vector cache(lookup.size()); + uint64_t done = 0; +#ifdef FEATURE_OMP +#pragma omp parallel for +#endif + for (size_t i = 0; i < lookup.size(); i++) { + if ((lookup[i] & done) == 0) { +// auto rres = get_func(lookup[i])(p, q); + auto rres = raw_funcs[i](p, q); + cache[i] = rres; + } + } + return cache; +} + +template +void Feature::set_normal(uint64_t single_flag, double min_, double max_) +{ + int idx = index_of(single_flag); + mins.at(idx) = min_; + maxs.at(idx) = 
max_; + is_finalized.at(idx) = true; +} + +template +void Feature::normalize(const vector > &pairs) +{ + + for (size_t i = 0; i < lookup.size(); i++) { + double small = mins[i], big = maxs[i]; + if (lookup[i] == FEAT_ALIGN) { + mins[i] = 0; + maxs[i] = 1; + continue; + } + if (is_finalized[i]) { + continue; + } +// #ifdef FEATURE_OMP +// #pragma omp parallel for reduction(min:small), reduction(max:big) +// #endif + auto func = raw_funcs[i];// + // get_func(lookup[i]); + for (size_t j = 0; j < pairs.size(); j++) { + double val = func(*pairs[j].first, *pairs[j].second); + if (val < small) { + small = val; + } + if (val > big) { + big = val; + } + } + + mins[i] = small; + maxs[i] = big; + } +}; + +template +vector Feature::feat_names() +{ + std::vector vec; + for (int i = 0; i < combos.size(); i++) { + auto indices = combos[i].second; + std::vector names; + for (auto s : indices) { + names.push_back(feat_name(lookup[s])); + } + std::string str = ""; + auto combo = combos[i].first; + if (combo == Combo::xy) { + str = names[0]; + for (int j = 1; j < indices.size(); j++) { + str += " * " + names[j]; + } + } else if (combo == Combo::xy2 && indices.size() == 2) { + str = names[0] + " * " + names[1] + "^2"; + } else if (combo == Combo::x2y && indices.size() == 2) { + str = names[0] + "^2 * " + names[1]; + } else if (combo == Combo::x2y2) { + str = names[0] + "^2"; + for (int j = 1; j < indices.size(); j++) { + str += " * " + names[j] + "^2"; + } + } + vec.push_back(str); + } + return vec; +} + +template +std::string Feature::feat_name(uint64_t single_flag) +{ + if (single_flag == FEAT_ALIGN) { + return "align"; + } else if (single_flag == FEAT_HELLINGER) { + return "hellinger"; + } else if (single_flag == FEAT_MANHATTAN) { + return "manhattan"; + } else if (single_flag == FEAT_EUCLIDEAN) { + return "euclidean"; + } else if (single_flag == FEAT_CHI_SQUARED) { + return "chi_squared"; + } else if (single_flag == FEAT_NORMALIZED_VECTORS) { + return "normalized_vectors"; + } 
else if (single_flag == FEAT_HARMONIC_MEAN) { + return "harmonic_mean"; + } else if (single_flag == FEAT_JEFFEREY_DIV) { + return "jefferey_divergence"; + } else if (single_flag == FEAT_K_DIV) { + return "k_divergence"; + } else if (single_flag == FEAT_PEARSON_COEFF) { + return "pearson"; + } else if (single_flag == FEAT_SQCHORD) { + return "squared_chord"; + } else if (single_flag == FEAT_KL_COND) { + return "kl_conditional"; + } else if (single_flag == FEAT_MARKOV) { + return "markov"; + } else if (single_flag == FEAT_INTERSECTION) { + return "intersection"; + } else if (single_flag == FEAT_RRE_K_R) { + return "rre_k_r"; + } else if (single_flag == FEAT_D2z) { + return "d2z"; + } else if (single_flag == FEAT_SIM_MM) { + return "sim_mm"; + } else if (single_flag == FEAT_EUCLIDEAN_Z) { + return "euclidean_z"; + } else if (single_flag == FEAT_EMD) { + return "emd"; + } else if (single_flag == FEAT_SPEARMAN) { + return "spearman"; + } else if (single_flag == FEAT_JACCARD) { + return "jaccard"; + } else if (single_flag == FEAT_LENGTHD) { + return "length_difference"; + } else if (single_flag == FEAT_D2s) { + return "d2s"; + } else if (single_flag == FEAT_AFD) { + return "afd"; + } else if (single_flag == FEAT_MISMATCH) { + return "mismatch"; + } else if (single_flag == FEAT_CANBERRA) { + return "canberra"; + } else if (single_flag == FEAT_KULCZYNSKI1) { + return "kulczynski1"; + } else if (single_flag == FEAT_KULCZYNSKI2) { + return "kulczynski2"; + } else if (single_flag == FEAT_SIMRATIO) { + return "simratio"; + } else if (single_flag == FEAT_JENSEN_SHANNON) { + return "jensen_shannon"; + } else if (single_flag == FEAT_D2_star) { + return "d2_star"; + } else if (single_flag == FEAT_N2R) { + return "n2r"; + } else if (single_flag == FEAT_N2RC) { + return "n2rc"; + } else if (single_flag == FEAT_N2RRC) { + return "n2rrc"; + } else { + return "unknown"; + } +} + +template +std::function&,Point&)> Feature::get_func_(uint64_t single_flag) +{ + std::function&,Point&)> 
func = [&](Point&,Point&)->double { + cerr << "Unknown single flag " << single_flag << endl; + throw "Function not set"; + }; + if (single_flag == FEAT_ALIGN) { + func = [&](Point& a,Point& b) { + return align(a, b, atable); + }; + } else if (single_flag == FEAT_HELLINGER) { + func = hellinger; + } else if (single_flag == FEAT_MANHATTAN) { + func = manhattan; + } else if (single_flag == FEAT_EUCLIDEAN) { + func = euclidean; + } else if (single_flag == FEAT_CHI_SQUARED) { + func = chi_squared; + } else if (single_flag == FEAT_NORMALIZED_VECTORS) { + func = normalized_vectors; + } else if (single_flag == FEAT_HARMONIC_MEAN) { + func = harmonic_mean; + } else if (single_flag == FEAT_JEFFEREY_DIV) { + func = jefferey_divergence; + } else if (single_flag == FEAT_K_DIV) { + func = k_divergence; + } else if (single_flag == FEAT_PEARSON_COEFF) { + func = pearson; + } else if (single_flag == FEAT_SQCHORD) { + func = squaredchord; + } else if (single_flag == FEAT_KL_COND) { + func = kl_conditional; + } else if (single_flag == FEAT_MARKOV) { + func = markov; + } else if (single_flag == FEAT_INTERSECTION) { + func = intersection; + } else if (single_flag == FEAT_RRE_K_R) { + func = rre_k_r; + } else if (single_flag == FEAT_D2z) { + func = d2z; + } else if (single_flag == FEAT_SIM_MM) { + func = sim_mm; + } else if (single_flag == FEAT_EUCLIDEAN_Z) { + func = euclidean_z; + } else if (single_flag == FEAT_EMD) { + func = emd; + } else if (single_flag == FEAT_SPEARMAN) { + func = spearman; + } else if (single_flag == FEAT_JACCARD) { + func = jaccard; + } else if (single_flag == FEAT_LENGTHD) { + func = length_difference; + } else if (single_flag == FEAT_D2s) { + func = d2s; + } else if (single_flag == FEAT_AFD) { + func = afd; + } else if (single_flag == FEAT_MISMATCH) { + func = mismatch; + } else if (single_flag == FEAT_CANBERRA) { + func = canberra; + } else if (single_flag == FEAT_KULCZYNSKI1) { + func = kulczynski1; + } else if (single_flag == FEAT_KULCZYNSKI2) { + func = 
kulczynski2; + } else if (single_flag == FEAT_SIMRATIO) { + func = simratio; + } else if (single_flag == FEAT_JENSEN_SHANNON) { + func = [&](Point&a, Point&b) { return jensen_shannon(a, b); }; + } else if (single_flag == FEAT_D2_star) { + func = d2_star; + } else if (single_flag == FEAT_N2R) { + func = [&](Point&a, Point&b) { return n2r(a, b); }; + } else if (single_flag == FEAT_N2RC) { + func = [&](Point&a, Point&b) { return n2rc(a, b); }; + } else if (single_flag == FEAT_N2RRC) { + func = [&](Point&a, Point&b) { return n2rrc(a, b); }; + } + return func; +} + +template +std::function&,Point&)> Feature::get_func(uint64_t single_flag) +{ +// cout << "SINGLE FLAG: " << single_flag << ": " << Feature::log2(single_flag) << endl; + if (!do_save) { + return get_func_(single_flag); + } else if (single_flag == FEAT_HELLINGER) { + return [&](Point& a, Point& b) { return c_hellinger(a,b); }; + } else if (single_flag == FEAT_MANHATTAN) { + return [&](Point& a, Point& b) { return c_manhattan(a,b); }; + } else if (single_flag == FEAT_EUCLIDEAN) { + return [&](Point& a, Point& b) { return c_euclidean(a,b); }; + } else if (single_flag == FEAT_CHI_SQUARED) { + return [&](Point& a, Point& b) { return c_chi_squared(a,b); }; + } else if (single_flag == FEAT_NORMALIZED_VECTORS) { + return [&](Point& a, Point& b) { return c_normalized_vectors(a,b); }; + } else if (single_flag == FEAT_HARMONIC_MEAN) { + return [&](Point& a, Point& b) { return c_harmonic_mean(a,b); }; + } else if (single_flag == FEAT_JEFFEREY_DIV) { + return [&](Point& a, Point& b) { return c_jefferey_divergence(a,b); }; + } else if (single_flag == FEAT_K_DIV) { + return [&](Point& a, Point& b) { return c_k_divergence(a,b); }; + } else if (single_flag == FEAT_PEARSON_COEFF) { + return [&](Point& a, Point& b) { return c_pearson(a,b); }; + } else if (single_flag == FEAT_SQCHORD) { + return [&](Point& a, Point& b) { return c_squaredchord(a,b); }; + } else if (single_flag == FEAT_KL_COND) { + return [&](Point& a, Point& b) { 
return c_kl_conditional(a,b); }; + } else if (single_flag == FEAT_MARKOV) { + return [&](Point& a, Point& b) { return c_markov(a,b); }; + } else if (single_flag == FEAT_INTERSECTION) { + return [&](Point& a, Point& b) { return c_intersection(a,b); }; + } else if (single_flag == FEAT_RRE_K_R) { + return [&](Point& a, Point& b) { return c_rre_k_r(a,b); }; + } else if (single_flag == FEAT_D2z) { + return [&](Point& a, Point& b) { return c_d2z(a,b); }; + } else if (single_flag == FEAT_SIM_MM) { + return [&](Point& a, Point& b) { return c_sim_mm(a,b); }; + } else if (single_flag == FEAT_EUCLIDEAN_Z) { + return [&](Point& a, Point& b) { return c_euclidean_z(a,b); }; + } else if (single_flag == FEAT_EMD) { + return [&](Point& a, Point& b) { return c_emd(a,b); }; + } else if (single_flag == FEAT_SPEARMAN) { + return [&](Point& a, Point& b) { return c_spearman(a,b); }; + } else if (single_flag == FEAT_JACCARD) { + return [&](Point& a, Point& b) { return c_jaccard(a,b); }; + } else if (single_flag == FEAT_LENGTHD) { + return [&](Point& a, Point& b) { return length_difference(a,b); }; + } else if (single_flag == FEAT_D2s) { + return [&](Point& a, Point& b) { return c_d2s(a,b); }; + } else if (single_flag == FEAT_AFD) { + return [&](Point& a, Point& b) { return c_afd(a,b); }; + } else if (single_flag == FEAT_MISMATCH) { + return [&](Point& a, Point& b) { return c_mismatch(a,b); }; + } else if (single_flag == FEAT_CANBERRA) { + return [&](Point& a, Point& b) { return c_canberra(a,b); }; + } else if (single_flag == FEAT_KULCZYNSKI1) { + return [&](Point& a, Point& b) { return c_kulczynski1(a,b); }; + } else if (single_flag == FEAT_KULCZYNSKI2) { + return [&](Point& a, Point& b) { return c_kulczynski2(a,b); }; + } else if (single_flag == FEAT_SIMRATIO) { + return [&](Point& a, Point& b) { return c_simratio(a,b); }; + } else if (single_flag == FEAT_JENSEN_SHANNON) { + return [&](Point& a, Point& b) { return c_jensen_shannon(a,b); }; + } else if (single_flag == FEAT_D2_star) { + 
return [&](Point& a, Point& b) { return c_d2_star(a,b); }; + } else if (single_flag == FEAT_N2R) { + return [&](Point& a, Point& b) { return c_n2r(a,b); }; + } else if (single_flag == FEAT_N2RC) { + return [&](Point& a, Point& b) { return c_n2rc(a,b); }; + } else if (single_flag == FEAT_N2RRC) { + return [&](Point& a, Point& b) { return c_n2rrc(a,b); }; + } else { + throw "err"; + } + return get_func_(single_flag); +} +template +vector Feature::multi_to_log(uint64_t multi) +{ + vector ret; + for (uint64_t i = 1; i <= 33; i++) { + if (((1UL << i) & multi) != 0) { + ret.push_back(i); + } + } + return ret; +} + +template +bool Feature::feat_is_sim(uint64_t single_flag) const +{ + bool unknown = true; + bool is_sim = true; + switch (single_flag) { + case FEAT_ALIGN: + is_sim = true; + break; + case FEAT_HELLINGER: + is_sim = false; + break; + case FEAT_MANHATTAN: + is_sim = false; + break; + case FEAT_EUCLIDEAN: + is_sim = false; + break; + case FEAT_CHI_SQUARED: + is_sim = false; + break; + case FEAT_NORMALIZED_VECTORS: + is_sim = true; + break; + case FEAT_HARMONIC_MEAN: + is_sim = true; + break; + case FEAT_JEFFEREY_DIV: + is_sim = false; + break; + case FEAT_K_DIV: + is_sim = false; + break; + case FEAT_PEARSON_COEFF: + is_sim = true; + break; + case FEAT_SQCHORD: + is_sim = false; + break; + case FEAT_KL_COND: + is_sim = false; + break; + case FEAT_MARKOV: + is_sim = true; + break; + case FEAT_INTERSECTION: + is_sim = true; + break; + case FEAT_RRE_K_R: + is_sim = false; + break; + case FEAT_D2z: + is_sim = true; + break; + case FEAT_SIM_MM: + is_sim = true;//probably yes + break; + case FEAT_EUCLIDEAN_Z: + is_sim = false; + break; + case FEAT_EMD: + is_sim = false; + break; + case FEAT_SPEARMAN: + is_sim = true; + break; + case FEAT_JACCARD: + is_sim = true; + break; + case FEAT_LENGTHD: + is_sim = false; + break; + case FEAT_D2s: + is_sim = true; + break; + case FEAT_AFD: + is_sim = false; + break; + case FEAT_MISMATCH: + is_sim = false; + break; + case 
FEAT_CANBERRA: + is_sim = false; + break; + case FEAT_KULCZYNSKI1: + is_sim = false; + break; + case FEAT_KULCZYNSKI2: + is_sim = true; + break; + case FEAT_SIMRATIO: + is_sim = true; + break; + case FEAT_JENSEN_SHANNON: + is_sim = false; + break; + case FEAT_D2_star: + is_sim = true; + break; + case FEAT_N2R: + is_sim = true; + break; + case FEAT_N2RC: + is_sim = true; + break; + case FEAT_N2RRC: + is_sim = true; + break; + default: + cerr << "bad feature flag " << single_flag << " aka 2^" << log(single_flag) << endl; + + throw single_flag; + } + return is_sim; +} + + +template +double Feature::c_kulczynski2(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_KULCZYNSKI2)); + if (ltable.find(tup) == ltable.end()) { + double val = kulczynski2(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::kulczynski2(Point &a, Point &b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + uint64_t min_sum = 0; + double ap = (double)p.getPseudoMagnitude() / N; + double aq = (double)q.getPseudoMagnitude() / N; + for (auto i = 0; i < N; i++) { + min_sum += std::min(p.points[i], q.points[i]); + } + double coeff = N * (ap + aq) / (2 * ap * aq); + return coeff * min_sum; +} +template +double Feature::align(Point &a, Point &b, std::map, double> &atbl) +{ + auto ai = a.get_id(); + auto bi = b.get_id(); + std::pair pr = ai < bi ? 
std::make_pair(ai, bi) : std::make_pair(bi, ai); + auto res = atbl.find(pr); + if (res == atbl.end()) { + auto sa = a.get_data_str(); + auto sb = b.get_data_str(); + int la = sa.length(); + int lb = sb.length(); + GlobAlignE galign(sa.c_str(), 0, la-1, + sb.c_str(), 0, lb-1, + 1, -1, 2, 1); + double val = galign.getIdentity(); +#pragma omp critical + atbl[pr] = val; + return val; + } else { + return res->second; + } +} + +template +double Feature::c_squaredchord(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_SQCHORD)); + if (ltable.find(tup) == ltable.end()) { + double val = squaredchord(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::squaredchord(Point &a, Point &b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double sum = 0; + for (auto i = 0; i < N; i++) { + sum += p.points[i] + q.points[i] - 2 * sqrt(p.points[i] * q.points[i]); + } + return sum; +} + +template +double Feature::c_intersection(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_INTERSECTION)); + if (ltable.find(tup) == ltable.end()) { + double val = intersection(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::intersection(Point &a, Point &b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + uintmax_t dist = 0; + uintmax_t mag = p.getPseudoMagnitude() + q.getPseudoMagnitude(); + #pragma omp simd + for (auto i = 0; i < N; i++) { + dist += 2 * std::min(p.points[i], q.points[i]); + } + return (double)dist / (double)mag; +} + +template +double Feature::c_pearson(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = 
b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_PEARSON_COEFF)); + if (ltable.find(tup) == ltable.end()) { + double val = pearson(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::pearson(Point &a, Point &b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double dap = (double)p.getPseudoMagnitude() / N; + double daq = (double)q.getPseudoMagnitude() / N; + double dot = 0, np = 0, nq = 0; + for (auto i = 0; i < N; i++) { + double dp = p.points[i] - dap; + double dq = q.points[i] - daq; + np += dp * dp; + nq += dq * dq; + dot += dp * dq; + } + return dot / sqrt(std::max(np * nq, 0.5)); +} + +template +double Feature::c_simratio(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_SIMRATIO)); + if (ltable.find(tup) == ltable.end()) { + double val = simratio(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::simratio(Point &a, Point &b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + uintmax_t dot = 0, norm2 = 0; + for (auto i = 0; i < N; i++) { + intmax_t diff = p.points[i] - q.points[i]; + dot += p.points[i] * q.points[i]; + norm2 += diff * diff; + } + return dot / (dot + sqrt(norm2)); +} + +template +double Feature::c_manhattan(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_MANHATTAN)); + if (ltable.find(tup) == ltable.end()) { + double val = manhattan(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::manhattan(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = 
dynamic_cast&>(b); + auto N = p.points.size(); + int sum = 0; + #pragma omp simd + for (auto i = 0; i < N; i++) { + sum += p.points[i] > q.points[i] ? p.points[i] - q.points[i] : q.points[i] - p.points[i]; + } +// std::cout << "manhattan: " << sum << std::endl; + return sum; +} + +template +double Feature::length_difference(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + auto lp = p.get_length(); + auto lq = q.get_length(); + if (lp == 0 || lq == 0) { + cerr << "lp: " << lp << " lq: " << lq << endl; + throw 123; + } + auto ret = (lp > lq) ? (lp - lq) : (lq - lp); +// std::cout << "length difference: " << ret << std::endl; + return ret; +} + + +double neighbor(double *cp, double *cq, double ap, double aq, const size_t N) +{ + double sp = 0, sq = 0; + #pragma omp simd + for (auto i = 0; i < N; i++) { + double dp = cp[i] - ap; + double dq = cq[i] - aq; + sp += dp * dp; + sq += dq * dq; + } + sp = sqrt(sp / N); + sq = sqrt(sq / N); + double psum = 0, qsum = 0; + #pragma omp simd + for (auto i = 0; i < N; i++) { + cp[i] = (cp[i] - ap) / sp; + cq[i] = (cq[i] - aq) / sq; + psum += cp[i] * cp[i]; + qsum += cq[i] * cq[i]; + } + double total = 0; + psum = sqrt(psum); + qsum = sqrt(qsum); + #pragma omp simd + for (auto i = 0; i < N; i++) { + cp[i] /= psum; + cq[i] /= qsum; + total += cp[i] * cq[i]; + } + return total; +} + +template +double Feature::c_n2rrc(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_N2RRC)); + if (ltable.find(tup) == ltable.end()) { + double val = n2rrc(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::n2rrc(Point& a, Point& b) const +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double *cp = new double[N]; + double *cq = new double[N]; + double ap 
= 0, aq = 0; + for (auto i = 0; i < N; i++) { + int j = reverse.at(i); + int h = reverse_complement.at(i); + cp[i] = p.points[h] + p.points[i] + p.points[j]; + cq[i] = q.points[h] + q.points[i] + q.points[j]; + ap += cp[i]; + aq += cq[i]; + } + ap /= N; + aq /= N; + double total = neighbor(cp, cq, ap, aq, N); + delete[] cp; + delete[] cq; +// std::cout << "n2rrc: " << total << std::endl; + return total; +} + +template +double Feature::c_jensen_shannon(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_JENSEN_SHANNON)); + if (ltable.find(tup) == ltable.end()) { + double val = jensen_shannon(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::jensen_shannon(Point &a, Point &b) const +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + uint64_t mp = p.getPseudoMagnitude(); + uint64_t mq = q.getPseudoMagnitude(); + double sum = 0; + const auto N = p.points.size(); + #pragma omp simd reduction(+:sum) + for (auto i = 0; i < N; i++) { + double pp = (double)p.points[i] / mp; + double pq = (double)q.points[i] / mq; + double avg = 0.5 * (pp + pq); + #ifndef USETBL + double lp = // tbl[(int)(coeff * pp / avg)]; + log(pp / avg); + double lq = // tbl[(int)(coeff * pq / avg)]; + log(pq / avg); + #else + double lp = tbl[(int)(coeff * pp / avg)]; + double lq = tbl[(int)(coeff * pq / avg)]; + #endif + sum += pp * lp + pq * lq; + } + return sum / 2; +} + +template +double Feature::c_rre_k_r(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_RRE_K_R)); + if (ltable.find(tup) == ltable.end()) { + double val = rre_k_r(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::rre_k_r(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + 
const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double op = 0, oq = 0; + const double l4 = log(4); + uint64_t sum4_p = 0, sum4_q = 0; + for (auto i = 0; i < N; i++) { + sum4_p += p.points[i]; + sum4_q += q.points[i]; + if (i % 4 == 3) { + double inner_sum_p = 0; + double inner_sum_q = 0; + for (auto j = i - 3; j <= i; j++) { + double conditional_p = (double)p.points[j] / sum4_p; + double conditional_q = (double)q.points[j] / sum4_q; + double avg = 0.5 * (conditional_p + conditional_q); + inner_sum_p += (double)(p.points[j]) + * log(conditional_p / avg) / sum4_p; + inner_sum_q += (double)(q.points[j]) + * log(conditional_q / avg) / sum4_q; + } + op += inner_sum_p; + oq += inner_sum_q; + sum4_p = 0; + sum4_q = 0; + } + } + double val = 0.5 * (op + oq); + return val; +} + + +template +double Feature::c_hellinger(Point& a, Point& b) { + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_HELLINGER)); + if (ltable.find(tup) == ltable.end()) { + double val = hellinger(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::hellinger(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double ap = (double)p.getPseudoMagnitude() / N; + double aq = (double)q.getPseudoMagnitude() / N; + double sum = 0; + for (auto i = 0; i < N; i++) { + double diff = sqrt(p.points[i] / ap) - sqrt(q.points[i] / aq); + sum += diff * diff; + } + return sqrt(2 * sum); +} + +template +double Feature::c_euclidean(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_EUCLIDEAN)); + if (ltable.find(tup) == ltable.end()) { + double val = euclidean(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double 
Feature::euclidean(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + uintmax_t sum = 0; + for (auto i = 0; i < N; i++) { + auto diff = p.points[i] - q.points[i]; + sum += diff * diff; + } + return sqrt(sum); +} + +template +double Feature::c_chi_squared(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_CHI_SQUARED)); + if (ltable.find(tup) == ltable.end()) { + double val = chi_squared(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::chi_squared(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double sum = 0; + for (auto i = 0; i < N; i++) { + auto diff = p.points[i] - q.points[i]; + sum += (double)(diff * diff) / (p.points[i] + q.points[i]); + } + return sum; +} + +template +double Feature::c_normalized_vectors(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_NORMALIZED_VECTORS)); + if (ltable.find(tup) == ltable.end()) { + double val = normalized_vectors(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::normalized_vectors(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + uintmax_t sum = 0; + uintmax_t d1 = 0, d2 = 0; + for (auto i = 0; i < N; i++) { + sum += p.points[i] * q.points[i]; + d1 += p.points[i] * p.points[i]; + d2 += q.points[i] * q.points[i]; + } + return (double)sum / sqrt(d1 * d2); +} + +template +double Feature::c_harmonic_mean(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, 
Feature::log2(FEAT_HARMONIC_MEAN)); + if (ltable.find(tup) == ltable.end()) { + double val = harmonic_mean(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::harmonic_mean(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double sum = 0; + for (auto i = 0; i < N; i++) { + double numer = p.points[i] * q.points[i]; + sum += numer / (p.points[i] + q.points[i]); + } + return 2 * sum; +} + +template +double Feature::c_jefferey_divergence(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_JEFFEREY_DIV)); + if (ltable.find(tup) == ltable.end()) { + double val = jefferey_divergence(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::jefferey_divergence(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + uint64_t mp = p.getPseudoMagnitude(); + uint64_t mq = q.getPseudoMagnitude(); + double sum = 0; + const auto N = p.points.size(); + for (auto i = 0; i < N; i++) { + double pp = (double)p.points[i] / mp; + double pq = (double)q.points[i] / mq; + double diff = pp - pq; + sum += diff * log(pp / pq); + } + return sum; +} + +template +double Feature::c_k_divergence(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_K_DIV)); + if (ltable.find(tup) == ltable.end()) { + double val = k_divergence(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::k_divergence(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + uint64_t mp = p.getPseudoMagnitude(); + uint64_t mq = q.getPseudoMagnitude(); 
+ double sum = 0; + const auto N = p.points.size(); + for (auto i = 0; i < N; i++) { + double pp = (double)p.points[i] / mp; + double pq = (double)q.points[i] / mq; + double avg = 0.5 * (pp + pq); + sum += pp * log(pp / avg); + } + return sum; +} + +template +double Feature::c_kl_conditional(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_KL_COND)); + if (ltable.find(tup) == ltable.end()) { + double val = kl_conditional(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::kl_conditional(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + uint64_t sum4_p = 0, sum4_q = 0; // Sum for every 4 nucleotides + double outer_sum_p = 0, outer_sum_q = 0; // Prior K-mer sum + const auto N = p.points.size(); + for (auto i = 0; i < N; i++) { + sum4_p += p.points[i]; + sum4_q += q.points[i]; + if (i % 4 == 3) { //finished counting word, now compute probabilities + double inner_sum_p = 0; // Sum of p(X|Y) * log(p(X|Y) / q(X|Y)) + double inner_sum_q = 0; // Sum of q(X|Y) * log(q(X|Y) / p(X|Y)) + for (auto j = i - 3; j <= i; j++) { + double conditional_p = (double)p.points[j] / sum4_p; + double conditional_q = (double)q.points[j] / sum4_q; + double lg = log(conditional_p / conditional_q); + inner_sum_p += conditional_p * lg; + inner_sum_q += -1 * conditional_q * lg; + } + outer_sum_p += sum4_p * inner_sum_p; + outer_sum_q += sum4_q * inner_sum_q; + + sum4_p = 0; + sum4_q = 0; + } + } + double left = outer_sum_p / p.getPseudoMagnitude(); + double right = outer_sum_q / q.getPseudoMagnitude(); + return (left + right) / 2.0; +} + +template +double Feature::c_markov(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_MARKOV)); + if (ltable.find(tup) == ltable.end()) { + double val = markov(a, b); + 
ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::markov(Point& a, Point& b) +{ + const DivergencePoint& q = dynamic_cast&>(a); + const DivergencePoint& p = dynamic_cast&>(b); + double total = 0; // Prior K-mer sum + const auto N = p.points.size(); + for (auto i = 0; i < N; i += 4) { + uint64_t psum = 0, qsum = 0; + for (auto j = 0; j < 4; j++) { + psum += p.points[i+j]; + qsum += q.points[i+j]; + } + double lpsum = log(psum); + double lqsum = log(qsum); + for (auto j = 0; j < 4; j++) { + total += (q.points[i+j]-1) * (log(p.points[i+j]) - lpsum); + total += (p.points[i+j]-1) * (log(q.points[i+j]) - lqsum); + } + } + return total / 2; +} + +template +double Feature::c_d2z(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_D2z)); + if (ltable.find(tup) == ltable.end()) { + double val = d2z(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::d2z(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + double sum = 0; + const auto N = p.points.size(); + double ap = (double)p.getPseudoMagnitude() / N; + double aq = (double)q.getPseudoMagnitude() / N; + double sp = p.get_stddev(), sq = q.get_stddev(); + for (auto i = 0; i < N; i++) { + double pz = (p.points[i] - ap) / sp; + double qz = (q.points[i] - aq) / sq; + sum += pz * qz; + } + return sum; +} + +template +double d_markov(Point& a, Point& b) +{ + const DivergencePoint& q = dynamic_cast&>(b); + return log(Feature::markov(b, a) / Feature::markov(b, b)) / q.getRealMagnitude(); +} + +template +double Feature::c_sim_mm(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_SIM_MM)); + if (ltable.find(tup) == ltable.end()) { + double val = sim_mm(a, b); + ltable.insert({tup, val}); + 
return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::sim_mm(Point& a, Point& b) +{ + return 1 - exp(0.5 * (d_markov(a, b) + d_markov(b, a))); +} + +template +double Feature::c_euclidean_z(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_EUCLIDEAN_Z)); + if (ltable.find(tup) == ltable.end()) { + double val = euclidean_z(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::euclidean_z(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + double sum = 0; + const auto N = p.points.size(); + double ap = (double)p.getPseudoMagnitude() / N; + double aq = (double)q.getPseudoMagnitude() / N; + double sp = p.get_stddev(), sq = q.get_stddev(); + for (auto i = 0; i < N; i++) { + double pz = (p.points[i] - ap) / sp; + double qz = (q.points[i] - aq) / sq; + sum += (pz - qz) * (pz - qz); + } + return sqrt(sum); +} + +template +double Feature::c_emd(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_EMD)); + if (ltable.find(tup) == ltable.end()) { + double val = emd(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::emd(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + uintmax_t cp = 0, cq = 0; + uintmax_t dist = 0; + for (auto i = 0; i < N; i++) { + cp += p.points[i]; + cq += q.points[i]; + dist += cp > cq ? 
cp - cq : cq - cp; + } + return (double)dist; +} + +template +std::vector tiedrank(const Point& a) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const auto N = p.points.size(); + vector ip(N, 0); + std::iota(std::begin(ip), std::end(ip), 0); + std::sort(std::begin(ip), std::end(ip), [&](size_t x, size_t y) { + return p.points[x] < p.points[y]; + }); +} + +template +double Feature::c_spearman(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_SPEARMAN)); + if (ltable.find(tup) == ltable.end()) { + double val = spearman(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::spearman(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + vector ip(N, 0); + vector iq(N, 0); + std::iota(std::begin(ip), std::end(ip), 0); + std::iota(std::begin(iq), std::end(iq), 0); + std::sort(std::begin(ip), std::end(ip), [&](size_t x, size_t y) { + return p.points[x] < p.points[y]; + }); + std::sort(std::begin(iq), std::end(iq), [&](size_t x, size_t y) { + return q.points[x] < q.points[y]; + }); + double expected = (N+1) / 2.0; + double cov = 0; + double sp = 0; + double sq = 0; + for (auto i = 0; i < N; i++) { + cov += (ip[i] - expected) * (iq[i] - expected); + sp += (ip[i] - expected) * (ip[i] - expected); + sq += (iq[i] - expected) * (iq[i] - expected); + } + return (N * cov) / (sp * sq); +} + +template +double Feature::c_jaccard(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_JACCARD)); + if (ltable.find(tup) == ltable.end()) { + double val = jaccard(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::jaccard(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const 
DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + uint64_t sum = 0; + for (auto i = 0; i < N; i++) { + if (p.points[i] == q.points[i] && p.points[i] > 1) { + sum++; + } + } + return (double)sum / N; +} + +template +double Feature::c_d2s(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_D2s)); + if (ltable.find(tup) == ltable.end()) { + double val = d2s(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::d2s(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + const int k = (int)(log(N) / log(4)); + const auto p1 = p.get_1mers(); + const auto q1 = q.get_1mers(); + const double pmag = p.getPseudoMagnitude(); + const double qmag = q.getPseudoMagnitude(); + double sum = 0; + for (size_t i = 0; i < N; i++) { + double p1i = 1; + double q1i = 1; + size_t idx = i; + for (int j = 0; j < k; j++) { + int i1 = idx % 4; + idx /= 4; + p1i *= (double)p1[i1] / pmag; + q1i *= (double)q1[i1] / qmag; + } + double hp = p.points[i] - pmag * p1i; + double hq = q.points[i] - qmag * q1i; + if (hp != 0 && hq != 0) { + sum += hp * hq / hypot(hp, hq); + } + } + return sum; +} + +template +double Feature::c_afd(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_AFD)); + if (ltable.find(tup) == ltable.end()) { + double val = afd(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::afd(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + const int k = (int)(log(N) / log(4)); + const auto p1 = p.get_1mers(); + const auto q1 = q.get_1mers(); + const auto pmag = p.getPseudoMagnitude(); 
+ const auto qmag = q.getPseudoMagnitude(); + double sum = 0; + const auto nMinusOne = N / 4; + const auto nMinusTwo = nMinusOne / 4; + int first_i = 0; + for (auto i = 0; i < N; i += nMinusTwo) { +// 16 iterations total, iterating through all 2-mers + uint64_t psum = 0, qsum = 0; + for (auto j = i; j < i + nMinusTwo; j++) { + psum += p.points[j]; + qsum += q.points[j]; + } + double x = (double)psum / p1[first_i / 4]; + double y = (double)qsum / q1[first_i / 4]; + first_i++; + + + double diff = x - y; + double unsquared = (diff * pow(1+diff, -14)); + sum += unsquared * unsquared; + } + return sum; +} + +template +double Feature::c_mismatch(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_MISMATCH)); + if (ltable.find(tup) == ltable.end()) { + double val = mismatch(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::mismatch(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + uint64_t sum = 0; + #pragma omp simd + for (auto i = 0; i < N; i++) { + sum += (p.points[i] != q.points[i]); + } + return sum; +} + +template +double Feature::c_canberra(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_CANBERRA)); + if (ltable.find(tup) == ltable.end()) { + double val = canberra(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::canberra(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double sum = 0; + #pragma omp simd + for (auto i = 0; i < N; i++) { + auto numer = p.points[i] > q.points[i] ? 
p.points[i] - q.points[i] : q.points[i] - p.points[i]; + auto denom = p.points[i] + q.points[i]; + sum += (double)numer / denom; + } + return sum; +} + +template +double Feature::c_kulczynski1(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_KULCZYNSKI1)); + if (ltable.find(tup) == ltable.end()) { + double val = kulczynski1(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::kulczynski1(Point &a, Point &b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double sum = 0; + for (auto i = 0; i < N; i++) { + auto numer = p.points[i] > q.points[i] ? p.points[i] - q.points[i] : q.points[i] - p.points[i]; + auto denom = std::min(p.points[i], q.points[i]); + sum += (double)numer / denom; + } + return sum; +} + +template +double Feature::c_d2_star(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_D2_star)); + if (ltable.find(tup) == ltable.end()) { + double val = d2_star(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::d2_star(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + const int k = (int)(log(N) / log(4)); + const auto p1 = p.get_1mers(); + const auto q1 = q.get_1mers(); + + const auto pmag = p.getPseudoMagnitude(); + const auto qmag = q.getPseudoMagnitude(); + double sum = 0; + vector tilde(4, 0); + for (int i = 0; i < 4; i++) { + tilde[i] = (double)(p1[i] + q1[i]) / (pmag + qmag); + } + const double L = sqrt(pmag * qmag); + for (auto i = 0; i < N; i++) { + double p1i = 1; + double q1i = 1; + double tilde_i = 1; + auto idx = i; + for (int j = 0; j < k; j++) { + auto i1 = idx % 4; + idx 
/= 4; + p1i *= (double)p1[i1] / pmag; + q1i *= (double)q1[i1] / qmag; + tilde_i *= tilde[i1]; + } + double hp = p.points[i] - pmag * p1i; + double hq = q.points[i] - qmag * q1i; + sum += hp * hq / (L * tilde_i); + } + return sum; +} + +template +double Feature::c_n2r(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_N2R)); + if (ltable.find(tup) == ltable.end()) { + double val = n2r(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::n2r(Point& a, Point& b) const +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double *cp = new double[N]; + double *cq = new double[N]; + double ap = 0, aq = 0; + for (auto i = 0; i < N; i++) { + int j = reverse.at(i); + cp[i] = p.points[i] + p.points[j]; + cq[i] = q.points[i] + q.points[j]; + ap += cp[i]; + aq += cq[i]; + } + ap /= N; + aq /= N; + double total = neighbor(cp, cq, ap, aq, N); + delete[] cp; + delete[] cq; + return total; +} + +template +double Feature::c_n2rc(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_N2RC)); + if (ltable.find(tup) == ltable.end()) { + double val = n2rc(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +template +double Feature::n2rc(Point& a, Point& b) const +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + double *cp = new double[N]; + double *cq = new double[N]; + double ap = 0, aq = 0; + for (auto i = 0; i < N; i++) { + int h = reverse_complement.at(i); + cp[i] = p.points[h] + p.points[i]; + cq[i] = q.points[h] + q.points[i]; + ap += cp[i]; + aq += cq[i]; + } + ap /= N; + aq /= N; + double total = neighbor(cp, cq, ap, aq, N); + delete[] cp; + delete[] cq; + 
return total; +} + +template class Feature; +template class Feature; +template class Feature; +template class Feature; +template class Feature; +template class Feature; diff --git a/src/cluster/src/Feature.h b/src/cluster/src/Feature.h new file mode 100644 index 0000000..ba7f73e --- /dev/null +++ b/src/cluster/src/Feature.h @@ -0,0 +1,380 @@ +/* -*- C++ -*- + * + * Feature.h + * + * Author: Benjamin T James + * + * Class containing all features and a glue to bind them together, + * shared indivual features can be shared through hashing if sequence + * id's are set. + */ +#ifndef FEATURES_H +#define FEATURES_H + +#include "SingleFeature.h" +#include +#include +#include + +#define FEAT_ALIGN (1UL << 0) +#define FEAT_HELLINGER (1UL << 1) +#define FEAT_MANHATTAN (1UL << 2) +#define FEAT_EUCLIDEAN (1UL << 3) +#define FEAT_CHI_SQUARED (1UL << 4) +#define FEAT_NORMALIZED_VECTORS (1UL << 5) +#define FEAT_HARMONIC_MEAN (1UL << 6) +#define FEAT_JEFFEREY_DIV (1UL << 7) +#define FEAT_K_DIV (1UL << 8) +#define FEAT_PEARSON_COEFF (1UL << 9) +#define FEAT_SQCHORD (1UL << 10) +#define FEAT_KL_COND (1UL << 11) +#define FEAT_MARKOV (1UL << 12) +#define FEAT_INTERSECTION (1UL << 13) +#define FEAT_RRE_K_R (1UL << 14) +#define FEAT_D2z (1UL << 15) +#define FEAT_SIM_MM (1UL << 16) +#define FEAT_EUCLIDEAN_Z (1UL << 17) +#define FEAT_EMD (1UL << 18) +#define FEAT_SPEARMAN (1UL << 19) +#define FEAT_JACCARD (1UL << 20) +#define FEAT_LENGTHD (1UL << 21) +#define FEAT_D2s (1UL << 22) +#define FEAT_AFD (1UL << 23) +#define FEAT_MISMATCH (1UL << 24) +#define FEAT_CANBERRA (1UL << 25) +#define FEAT_KULCZYNSKI1 (1UL << 26) +#define FEAT_KULCZYNSKI2 (1UL << 27) +#define FEAT_SIMRATIO (1UL << 28) +#define FEAT_JENSEN_SHANNON (1UL << 29) +#define FEAT_D2_star (1UL << 30) +#define FEAT_N2R (1UL << 31) +#define FEAT_N2RC (1UL << 32) +#define FEAT_N2RRC (1UL << 33) + +enum class Combo { + xy, + x2y2, + xy2, + x2y +}; + +template +struct pra { + Point* first; + Point* second; + double val; + pra() {} + 
pra(const pra&f) : first(f.first), second(f.second), val(f.val) {} + pra(Point* a, Point* b, double c) : first(a), second(b), val(c) {} + pra deep_clone() const { + return pra(first->clone(), second->clone(), val); + } +}; + +/* + * Usage: + * add_feature(FEAT_LD | FEAT_INTERSECTION, COMBO_SELF); + * add_feature(FEAT_LD | FEAT_JENSONSHANNON, COMBO_SELF); + * + * normalize(some_pairs_to_normalize) + * normalize(more_pairs_to_normalize) + * finalize() + * + * add_feature(....); + * + * normalize(some_pairs_to_normalize) + * normalize(more_pairs_to_normalize) + * finalize() + * + * compute(p,q) + * for (size_t i = 0; i < feature.size(); i++) { + * cout << feature[i] << endl; + * } + */ +template +class Feature { +public: + Feature(const Feature& feat_); + Feature operator=(const Feature& feat_); + Feature(const int k_) : k(k_) { + flags = 0; + auto freverse = [](int idx, int k) { + int sum = 0; + for (int i = 0; i < k; i++) { + int rem = idx % 4; + idx /= 4; + sum = 4 * sum + rem; + + } + return sum; + }; + auto freverse_complement = [](int idx, int k) { + std::vector v; + for (int i = 0; i < k; i++) { + v.push_back(3 - idx % 4); + idx /= 4; + } + int sum = 0; + for (auto val : v) { + sum = 4 * sum + val; + } + return sum; + }; + + uint64_t k4 = 1; + for (int i = 0; i < k; i++) { + k4 *= 4; + } + for (int i = 0; i < k4; i++) { + reverse.push_back(freverse(i, k)); + reverse_complement.push_back(freverse_complement(i, k)); + } + } + void add_feature(uint64_t f_flags, Combo combo=Combo::xy); + static vector multi_to_log(uint64_t multi); + vector feat_names(); + static std::string feat_name(uint64_t single); + void finalize(); + + void remove_feature() { // Tear down features SPECIFIC to last pairing + // auto indices_to_rm = combos.back().second; + // combos.pop_back(); + // uint64_t feat_flags; + + + // TO_DEL = TO_RM & (INDICES - REST) + + vector vec = combos.back().second; + combos.pop_back(); + for (auto combo : combos) { + for (auto idx : combo.second) { + 
vec.erase(std::remove(vec.begin(), vec.end(), idx), vec.end()); + } + } + std::sort(vec.begin(), vec.end(), std::greater()); + for (int idx : vec) { + flags ^= lookup[idx]; + lookup.erase(lookup.begin() + idx); + raw_funcs.erase(raw_funcs.begin() + idx); + mins.erase(mins.begin() + idx); + maxs.erase(maxs.begin() + idx); + is_sims.erase(is_sims.begin() + idx); + is_finalized.erase(is_finalized.begin() + idx); + } + // flags ^= lookup[idx] + // lookup[IDX] + // raw_funcs[IDX] + // mins[idx] + // maxs[idx] + // is_sims[idx] + // is_finalized[idx] + } + void normalize(const vector > &pairs); + void set_normal(uint64_t single_flag, double min, double max); + vector compute(Point& p, Point& q) { + vector cache = compute_all_raw(p, q); + normalize_cache(cache); + return cache; + }; + double operator()(int col, const vector& cache) const { + auto pr = combos.at(col); + Combo combo = pr.first; + auto indices = pr.second; + if (combo == Combo::xy) { + double prod = 1; + for (auto idx : indices) { + prod *= cache[idx]; + } + return prod; + } else if (combo == Combo::x2y2) { + double prod = 1; + for (auto idx : indices) { + prod *= cache[idx] * cache[idx]; + } + return prod; + } else if (combo == Combo::xy2) { + if (indices.size() != 2) { + cerr << "index size: " << indices.size() << endl; + throw "invalid"; + } + auto i0 = indices[0]; + auto i1 = indices[1]; + return cache[i0] * cache[i1] * cache[i1]; + } else if (combo == Combo::x2y) { + if (indices.size() != 2) { + throw "invalid"; + } + auto i0 = indices[0]; + auto i1 = indices[1]; + return cache[i0] * cache[i0] * cache[i1]; + } else { + throw "invalid combo"; + } + } + size_t size() const { return combos.size(); } + void print_bounds() const { + for (size_t i = 0; i < lookup.size(); i++) { + cout << "bounds[" << i << "]: " << mins[i] << " to " << maxs[i] << endl; + } + } + static int log2(uint64_t feature_) { + for (size_t i = 0; i < 33; i++) { + if (feature_ & (1UL << i)) { + return i; + } + } + return 0; + } + static 
double hellinger(Point& p, Point& q); + double c_hellinger(Point& p, Point& q); + static double manhattan(Point& p, Point& q); + double c_manhattan(Point& p, Point& q); + static double euclidean(Point& p, Point& q); + double c_euclidean(Point& p, Point& q); + static double chi_squared(Point& p, Point& q); + double c_chi_squared(Point& p, Point& q); + static double normalized_vectors(Point& p, Point& q); + double c_normalized_vectors(Point& p, Point& q); + static double harmonic_mean(Point& p, Point& q); + double c_harmonic_mean(Point& p, Point& q); + static double jefferey_divergence(Point& p, Point& q); + double c_jefferey_divergence(Point& p, Point& q); + static double k_divergence(Point& p, Point& q); + double c_k_divergence(Point& p, Point& q); + static double pearson(Point& p, Point& q); + double c_pearson(Point& p, Point& q); + static double squaredchord(Point& a, Point& b); + double c_squaredchord(Point& a, Point& b); + static double kl_conditional(Point& a, Point& b); + double c_kl_conditional(Point& a, Point& b); + static double markov(Point& a, Point& b); + double c_markov(Point& a, Point& b); + static double intersection(Point& p, Point& q); + double c_intersection(Point& p, Point& q); + static double rre_k_r(Point& p, Point& q); + double c_rre_k_r(Point& p, Point& q); + static double d2z(Point& p, Point& q); + double c_d2z(Point& p, Point& q); + static double sim_mm(Point& p, Point& q); + double c_sim_mm(Point& p, Point& q); + static double euclidean_z(Point& p, Point& q); + double c_euclidean_z(Point& p, Point& q); + static double emd(Point& p, Point& q); + double c_emd(Point& p, Point& q); + static double spearman(Point& p, Point& q); + double c_spearman(Point& p, Point& q); + static double jaccard(Point& p, Point& q); + double c_jaccard(Point& p, Point& q); + static double length_difference(Point& p, Point& q); + static double d2s(Point& p, Point& q); + double c_d2s(Point& p, Point& q); + static double afd(Point& p, Point& q); + double c_afd(Point& 
p, Point& q); + static double mismatch(Point& p, Point& q); + double c_mismatch(Point& p, Point& q); + static double canberra(Point& p, Point& q); + double c_canberra(Point& p, Point& q); + static double kulczynski1(Point& a, Point& b); + double c_kulczynski1(Point& a, Point& b); + static double kulczynski2(Point& a, Point& b); + double c_kulczynski2(Point& a, Point& b); + static double simratio(Point& a, Point& b); + double c_simratio(Point& a, Point& b); + double jensen_shannon(Point& p, Point& q) const; + double c_jensen_shannon(Point& p, Point& q); + static double d2_star(Point& p, Point& q); + double c_d2_star(Point& p, Point& q); + double n2r(Point& p, Point& q) const; + double c_n2r(Point& p, Point& q); + double n2rc(Point& p, Point& q) const; + double c_n2rc(Point& p, Point& q); + double n2rrc(Point& p, Point& q) const; + double c_n2rrc(Point& p, Point& q); + + static double align(Point& a, Point& b, std::map, double> &atable); + std::function&,Point&)> get_func(uint64_t single_feat); + std::function&,Point&)> get_func_(uint64_t single_feat); + bool feat_is_sim(uint64_t single_flag) const; + bool get_save() const { return do_save; } + void set_save(bool save_) { + do_save = save_; + if (!save_) { + ltable.clear(); + } + } + std::vector > > get_combos() const { return combos; } + std::vector get_mins() const { return mins; }; + std::vector get_maxs() const { return maxs; }; + std::vector get_lookup() const { return lookup; }; +private: + + vector compute_all_raw(Point& p, Point& q); + void normalize_cache(vector& cache) const; + + + // double raw(uint64_t single_flag, Point& a, Point& b); + int index_of(uint64_t single_flag) const { + for (size_t i = 0; i < lookup.size(); i++) { + if (lookup[i] == single_flag) { + return i; + } + } + return -1; + } + void reset_funcs() { + raw_funcs.clear(); + for (auto f : lookup) { + raw_funcs.push_back(get_func(f)); + } + } + uint64_t get_flags() const { return flags; }; + + + std::vector get_sims() const { return 
is_sims; }; + std::vector get_finalized() const { return is_finalized; }; + + + + + + int k; int get_k() const { return k; }; + uint64_t flags; + bool do_save; + std::vector + > > combos; + + std::vector mins, maxs; + std::vector is_sims, is_finalized; + std::vector lookup; + std::vector reverse, reverse_complement; + std::vector&,Point&)> > raw_funcs; + + std::map, double> atable; + std::map, double> ltable; + +// std::map, double> * get_table() const { return ltable; } +}; + +// template +// class Feature { +// public: +// Feature(std::function)> combination, std::vector > sf) +// : features(sf), combo(combination) {} +// double operator()(Point*, Point*) const; + + +// static double manhattan(Point& p, Point& q); +// static double length_difference(Point& p, Point& q); +// static double n2rrc(Point& p, Point& q, const vector&, const vector &); +// static double rre_k_r(Point& p, Point& q); +// static double intersection(Point& p, Point& q); +// static double jenson_shannon(Point& p, Point& q); +// static double pearson(Point& p, Point& q); +// static double simratio(Point& a, Point& b); +// static double squaredchord(Point& a, Point& b); +// private: +// vector > features; +// std::function)> combo; +// }; +#endif diff --git a/src/cluster/src/GLM.cpp b/src/cluster/src/GLM.cpp new file mode 100644 index 0000000..f5ef4ba --- /dev/null +++ b/src/cluster/src/GLM.cpp @@ -0,0 +1,66 @@ +/* + * glm.cpp + * + * Created on: May 29, 2017 + * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa + * + * Modified by Benjamin T James + */ + +#include "GLM.h" +#include "Matrix.h" + +#include +#include +using namespace std; +// using namespace matrix; + +namespace matrix{ + +void GLM::train(Matrix& features, Matrix& labels){ + weights = features.transpose() * features; + weights = weights.pseudoInverse() * features.transpose() * labels; +} + +Matrix GLM::predict(Matrix& features) const { + Matrix labels; + labels = features * weights; + 
double log; + for(int i = 0; i < labels.getNumRow(); i++){ + log = round(1/(1 + exp(-(labels.get(i,0))))); + labels.set(i,0, log); + } + return labels; +} + +std::tuple GLM::accuracy(Matrix& oLabels, Matrix& pLabels) const { + int sum = 0; + int negSum = 0; + int negSame = 0; + int posSum = 0; + int posSame = 0; + for(int i = 0; i < oLabels.getNumRow(); i++){ + if(oLabels.get(i,0) == -1){ + negSum++; + if(oLabels.get(i,0) == pLabels.get(i, 0)){ + sum++; + negSame++; + } + }else{ + posSum++; + if(oLabels.get(i,0) == pLabels.get(i, 0)){ + sum++; + posSame++; + } + } + } + double acc = (((double)sum*100)/(oLabels.getNumRow())); + double sens = (((double)posSame*100)/(posSum)); + double spec = (((double)negSame*100)/(negSum)); + // cout << "Accuracy: " << acc << "% "; + // cout << "Sensitivity: " << sens << "% "; + // cout << "Specificity: " << spec << "% " << endl; + return make_tuple(acc, sens, spec); +} + +} diff --git a/src/cluster/src/GLM.h b/src/cluster/src/GLM.h new file mode 100644 index 0000000..d9e150b --- /dev/null +++ b/src/cluster/src/GLM.h @@ -0,0 +1,31 @@ +/* + * glm.h + * + * Created on: May 29, 2017 + * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa + * + * Modified by Benjamin T James + */ + +#ifndef SRC_MATRIX_GLM_H_ +#define SRC_MATRIX_GLM_H_ + +#include "Matrix.h" +#include +namespace matrix { + +class GLM { +private: + Matrix weights; + +public: + void load(Matrix weights_) { weights = weights_; } + void train(matrix::Matrix& features, matrix::Matrix& labels); + Matrix predict(matrix::Matrix& features) const; + std::tuple accuracy(matrix::Matrix& oLabels, matrix::Matrix& pLabels) const; + const Matrix& get_weights() const { return weights; }; +}; + +} + +#endif /* SRC_MATRIX_GLM_H_ */ diff --git a/src/cluster/src/HandleSeq.cpp b/src/cluster/src/HandleSeq.cpp new file mode 100644 index 0000000..041c22a --- /dev/null +++ b/src/cluster/src/HandleSeq.cpp @@ -0,0 +1,155 @@ +/** + * Author: Alex Baumgartner + * 
The Bioinformatics Toolsmith Laboratory, the University of Tulsa + * 5/15/2018 + * + * Purpose: + * The pupose of this module is to take a sequence and mutate it to returns + It also serves as a way to parse a file for all sequences + */ + +#include "HandleSeq.h" +#include +// d +HandleSeq::HandleSeq(int m) { + + mode = m & HandleSeq::BOTH; + enableTrans = m & HandleSeq::TRANSLOCATION; + enableRev = m & HandleSeq::REVERSION; + // disable = (m & HandleSeq::ATYPICAL) > 0 ? 0 : 1; +} + +pair, vector> HandleSeq::parseFile(string fileName) { + ifstream fileIn; + //Uses the file the user supplies to take in sequences + fileIn.open(fileName, ifstream::in); + if(fileIn.is_open()){ + vector sequences; + vector names; + string inString; + //Boolean to make sure that the first sequence + //has already been found, prevents a null string being written + bool foundFirst = false; + string currentLine; + while (!fileIn.eof()) { + getline(fileIn, currentLine); + //Skip the line if nothing is on it + if (currentLine.length() == 0) { + continue; + } + //If the line has a '>' symbol, the start of a new sequence + else if (currentLine.at(0) == '>' && foundFirst) { + //Push the current saved sequene onto the vector, + //then reset the strings value + sequences.push_back(inString); + names.push_back(currentLine.substr(1, currentLine.find_first_of(' '))); + inString = ""; + } + else if(currentLine.at(0) == '>' && !foundFirst){ + foundFirst = true; + names.push_back(currentLine.substr(1, currentLine.find_first_of(' '))); + } + //If this is the first >, set found first to true + else if (!foundFirst) { + foundFirst = true; + } + //Otherwise, add the current Line to + //the string of current lines + else { + inString = inString + currentLine; + } + } + //Push the last found string on + //(There is no > at the end of a .fa file) + sequences.push_back(inString); + fileIn.close(); + return {names, sequences}; + } + else{ + cout << "Could not find File" << endl; + exit(2); + } +} + +pair 
HandleSeq::mutate(string sequence, int muteRate) { + percMute = muteRate; + if (muteRate == 0) { + return std::make_pair(1, sequence); + } + auto nucls = countNucl(sequence); + //Assing the percent of each nucleotide in the sequence + int percAs = (nucls.at(0) * 100) / sequence.length(); + int percCs = (nucls.at(1) * 100) / sequence.length(); + int percGs = (nucls.at(2) * 100) / sequence.length(); + int percTs = (nucls.at(3) * 100) / sequence.length(); + int percMulti, percSing; + string * seq = new string(sequence); + int length = sequence.length(); + //If the user only wants single + if (mode == 1) { + percMulti = 0; + //Allocate all mutations to single + percSing = percMute; + } + //Or if the user only wants non single + else if (mode == 2) { + //Allocate all mutations to non-single + percSing = 0; + percMulti = percMute; + } + //Otherwise, assing a random percentage to both + else { + percMulti = rand() % percMute; + percSing = percMute - percMulti; + } + //Define a new multiple mutation + MultiMute multi(percAs, percCs, percGs, percTs, + percMulti, enableTrans, enableRev); + //Run the multiple mutations, + //get back its vector of what is valid to mutate and what isn't + vector mutes = multi.genMulti(seq); + uint64_t cnt = 0; + for (bool b : mutes) { + cnt += b ? 
1 : 0; + } + + SingMute sing(percAs, percCs, percGs, percTs, + percSing, seq, mutes); + float alignmentLength = multi.getAlignmentLength() + sing.getAlignmentLength() + length; +// cout << "alignLength: " << alignmentLength << endl; + float IBP = length - multi.getIBP() - sing.getIBP(); +// cout << "ibp: " << IBP << endl; + float alignment = IBP / alignmentLength; +// cout << "ratio: size: " << mutes.size() << " expected: " << (float)cnt / mutes.size() << " found: " << ((float)length - multi.getIBP()) / ((float)multi.getAlignmentLength() + length) << " align: " << alignment << endl; + //assign the sequence to the + //value that the seq pointer stores to + //clear the heap + delete seq; + //Return the now mutated sequence + std::string outseq = sing.getSeq(); + return make_pair(alignment, outseq); +} + +vector HandleSeq::countNucl(string sequence) { + int a = 0; + int c = 0; + int g = 0; + int t = 0; + for (int i = 0; i < sequence.length(); i++) { + if (sequence.at(i) == 'A') { + a++; + } else if (sequence.at(i) == 'C') { + c++; + } else if (sequence.at(i) == 'G') { + g++; + } else if (sequence.at(i) == 'T') { + t++; + } + } + vector values; + values.push_back(a); + values.push_back(c); + values.push_back(g); + values.push_back(t); + return values; +} diff --git a/src/cluster/src/HandleSeq.h b/src/cluster/src/HandleSeq.h new file mode 100644 index 0000000..95a7718 --- /dev/null +++ b/src/cluster/src/HandleSeq.h @@ -0,0 +1,77 @@ +/** + * Author: Alex Baumgartner + * The Bioinformatics Toolsmith Laboratory, the University of Tulsa + * 5/15/2018 + * + * Purpose: + * The pupose of this module is to take a sequence and mutate it to returns + It also serves as a way to parse a file for all sequences + */ +#ifndef HANDLESEQ_H +#define HANDLESEQ_H + +#include +#include +#include +#include +#include "MultiMute.h" +#include "SingMute.h" + +using namespace std; + +class HandleSeq { +public: + // Single — point — mutations only + static const int SINGLE = (1 << 0); + // 
Non-single point mutations only + static const int NON_SINGLE = (1 << 1); + // Single and non-single mutations + static const int BOTH = SINGLE | NON_SINGLE; +// translocations and reversions + static const int TRANSLOCATION = (1 << 2); + static const int REVERSION = (1 << 3); + static const int ATYPICAL = TRANSLOCATION | REVERSION; + static const int ALL = ATYPICAL | BOTH; + /* + constructor + + @param: + int: the mode of the program + (Single only = 1, nonsingle only = 2, both = 3) + */ + HandleSeq(int); + /* + returns a vector of all sequences in a file inputted + + @param: + std::string: file name + int: the mutation rate + + @return: + std::vector: Vector of all found sequences + */ + pair, vector> parseFile(string); + /* + Mutates a sequence based on parameters inputted in constructor, + and returns the mutated sequence + */ + pair mutate(string, int); +private: + int mode; + int percMute; + bool enableTrans, enableRev; + /* + Counts the nucleotides in a file, + and returns a vector corresponding to their values {A, C, G, T} + + @param: + std::string: the sequences + + @return: + std::vector: vector containing ints of each nucleotide count + */ + vector countNucl(string); + +}; + +#endif diff --git a/src/cluster/src/Histogram.cpp b/src/cluster/src/Histogram.cpp new file mode 100644 index 0000000..a669687 --- /dev/null +++ b/src/cluster/src/Histogram.cpp @@ -0,0 +1,195 @@ +/* -*- C++ -*- + * + * Histogram.cpp + * + * Author: Benjamin T James + * + * Artifact from early development of MeShClust + */ +#ifndef HEADER_HACK +#include "Histogram.h" +#endif + +#include +#include + +template +double Histogram::distance_k1(const Point &p) const +{ + throw "Not implemented"; + const Histogram& h = dynamic_cast&>(p); + uint64_t dist = 0; + auto size = std::min(points.size(),h.points.size()); +/* + for (unsigned int i = 0; i < size; i++) { + T l = points.at(i); + T r = h.points.at(i); + dist += (l > r) ? 
(l - r) : (r - l); + } +*/ + uint64_t avg_mag = (magnitude() + h.magnitude()) / 2.0; + for (auto i = 0; i < size; i++) { + T l = points[i]; + T r = h.points[i]; + dist += min(l, r); + } + return 1.0 - dist / avg_mag; +} +template +Histogram::Histogram(std::vector pts, char mark) +{ + for (T t : pts) { + points.push_back(t); + } + to_delete = false; +} +template +Histogram::Histogram(std::vector pts) +{ + for (T t : pts) { + points.push_back(t); + } + to_delete = false; +} + +template +Histogram::Histogram(std::vector pts, bool toDelete) +{ + for (T t : pts) { + points.push_back(t); + } + to_delete = toDelete; +} + +template +Histogram::Histogram(unsigned int size) +{ + for (unsigned int i = 0; i < size; i++) { + points.push_back(0); + } + to_delete = false; +} + +template +void Histogram::operator*=(double d) +{ + for (T &t : points) { + t *= d; + } +} + +template +bool Histogram::operator<(Point& p) const +{ + const Histogram& h = dynamic_cast&>(p); + unsigned int size = std::min(points.size(),h.points.size()); + for (unsigned int i = 0; i < size; i++) { + if (points.at(i) >= h.points.at(i)) { + return false; + } + } + return true; +} + +template +void Histogram::operator/=(double d) +{ + unsigned int size = points.size(); + for (unsigned int i = 0; i < size; i++) { + points.at(i) = points.at(i) / d; + } +} + +template +void Histogram::operator+=(Point& p) +{ + const Histogram& h = dynamic_cast&>(p); + unsigned int size = std::min(points.size(),h.points.size()); + for (unsigned int i = 0; i < size; i++) { + points.at(i) += h.points.at(i); + } +} + +template +uint64_t Histogram::operator-(const Point& p) const +{ + return distance(p); +} + +template +void Histogram::set(Point& p) +{ + const Histogram& h = dynamic_cast&>(p); + points = h.points; +} + +template +void Histogram::display() const +{ + unsigned size = points.size(); + for (unsigned i = 0; i < size; i++) { + std::cout << points.at(i) << " "; + } + std::cout << std::endl; +} + +template +void 
Histogram::addOne() +{ + for (auto &a : points) { + a++; + } +} +template +void Histogram::subOne() +{ + for (auto &a : points) { + a--; + } +} + +template +void Histogram::zero() +{ + for (typename std::vector::iterator it = points.begin(); it != points.end(); ++it) { + *it = 0; + } +} + +template +uint64_t Histogram::distance(const Point& p) const +{ +/* + // Vectors should be the same width + const Histogram& h = dynamic_cast&>(p); + T dist = 0; + unsigned int size = std::min(points.size(),h.points.size()); + for (unsigned int i = 0; i < size; i++) { + T l = points.at(i); + T r = h.points.at(i); + dist += (l > r) ? (l - r) : (r - l); + } + return dist; +*/ + throw "Not implemented"; + return 0; +} + +template +uint64_t Histogram::magnitude() const +{ + uint64_t dist = 0; + for (auto const& p : points) { + dist += p; + } + return dist; +} + +#ifndef HEADER_HACK +template class Histogram; +template class Histogram; +template class Histogram; +template class Histogram; +template class Histogram; +template class Histogram; +#endif diff --git a/src/cluster/src/Histogram.h b/src/cluster/src/Histogram.h new file mode 100644 index 0000000..1813bb4 --- /dev/null +++ b/src/cluster/src/Histogram.h @@ -0,0 +1,80 @@ +/* -*- C++ -*- + * + * Histogram.h + * + * Author: Benjamin T James + * + * Artifact from early development of MeShClust + */ +#ifndef HISTOGRAM_H +#define HISTOGRAM_H +#include +#include "Point.h" + +template +class Histogram : public Point { +public: + Histogram(std::vector pts); + Histogram(std::vector pts, char marker); + Histogram(std::vector pts, bool to_delete); + Histogram(unsigned int size); + ~Histogram() {} + void operator*=(double d); + void operator/=(double d); + uint64_t operator-(const Point& p) const; + bool operator<(Point& p) const; + void operator+=(Point& p); + void set(Point& p); + void display() const; + void zero(); + void addOne(); + void subOne(); + double distance_k1(const Point& p) const; + double prob_under(Point& p) const { return 
distance(p); }; + uint64_t distance(const Point& p) const; + uint64_t magnitude() const; + uint64_t getRealMagnitude() const { return 0; }; + double distance_d(Point& p) const { + throw "not implemented"; + return 0; + } + void set_arg_to_this_d(Point& p) const { + throw "not implemented"; + } + Point* create_double() const { + throw "not implemented"; + return NULL; + } + Histogram* clone() const { + return new Histogram(points, to_delete); + } + Histogram* create() const { + return new Histogram(points.size()); + } + bool is_to_delete() const { + return to_delete; + } + void set_to_delete(bool b) { + to_delete = b; + } + const vector& get_data() const { return points; } + void set_id(uintmax_t c_id) { id = c_id; }; + const uintmax_t get_id() const { return id; }; + void set_length(unsigned long len) { nucl_length = len; }; + unsigned long get_length() const { return nucl_length; }; + unsigned long size() const { return points.size(); }; +private: + std::vector points; + bool to_delete; + uintmax_t id; + unsigned long nucl_length; +}; + +#ifdef HEADER_HACK +#ifndef HISTOGRAM_C +#define HISTORGRAM_C +#include "Histogram.cpp" +#endif +#endif + +#endif diff --git a/src/cluster/src/Loader.cpp b/src/cluster/src/Loader.cpp new file mode 100644 index 0000000..73691b6 --- /dev/null +++ b/src/cluster/src/Loader.cpp @@ -0,0 +1,111 @@ +/* -*- C++ -*- + * + * Loader.cpp + * + * Author: Benjamin T James + * + * Class which can 'preload' chunks of sequences from a file list, + * and then count the k-mers separately, which can be done in + * multiple threads + */ +#include "Loader.h" +#include "ClusterFactory.h" +#include "DivergencePoint.h" +#include + +template +bool Loader::done() const +{ + return file_idx == files.size(); +} + +template +void Loader::preload(int tid) +{ + if (file_idx == files.size()) { + return; + } + for (uint64_t j = 0; j < chunk_size; j++) { + auto chrom = next(); + if (chrom.first == "") { + return; + } + cache_list.at(tid).emplace_back(chrom.first, 
chrom.second); + } +} + + +template +Point* Loader::get_point(std::string header, const std::string &base, uintmax_t& id, int k) +{ + KmerHashTable table(k, 1); + KmerHashTable table_k1(1, 0); + std::vector values; + vector values_k1; + values.clear(); + ChromosomeOneDigit chrom; + chrom.setHeader(header); + chrom.appendToSequence(base); + chrom.finalize(); + fill_table(table, &chrom, values); + fill_table(table_k1, &chrom, values_k1); +// int tmplate = get_template(chrom->getHeader(), templates); + Point *p = new DivergencePoint(values, chrom.size()); +// cout << "mag: " << ((DivergencePoint*)p)->getPseudoMagnitude() << std::endl; + p->set_1mers(values_k1); + p->set_header(header); + p->set_length(chrom.getBase()->length()); + p->set_data_str(*chrom.getBase()); + DivergencePoint* q = dynamic_cast*>(p); + const auto N = q->points.size(); + double aq = (double) q->getPseudoMagnitude() / N; + double sq = 0; + for (auto i = 0; i < N; i++) { + double qdiff = q->points[i] - aq; + sq += qdiff * qdiff; + } + sq = sqrt(sq / N); + q->set_stddev(sq); + p->set_id(id); + #pragma omp atomic + id++; + return p; +} + +template +std::vector*> Loader::load_next(int tid) +{ + std::vector*> points; + for (size_t i = 0; i < cache_list.at(tid).size(); i++) { + auto pr = cache_list.at(tid).at(i); + Point* p = get_point(pr.first, *pr.second, id_list.at(tid), k); + points.push_back(p); + delete pr.second; + } + cache_list.at(tid).clear(); + return points; +} + +template +std::pair Loader::next() +{ + auto n = maker->next(); + if (n.first != "") { + return n; + } + delete maker; + maker = NULL; + file_idx++; + if (file_idx >= files.size()) { + return n; + } + maker = new SingleFileLoader(files.at(file_idx)); + return maker->next(); +} + +template class Loader; +template class Loader; +template class Loader; +template class Loader; +template class Loader; +template class Loader; diff --git a/src/cluster/src/Loader.h b/src/cluster/src/Loader.h new file mode 100644 index 0000000..28da845 --- 
/dev/null +++ b/src/cluster/src/Loader.h @@ -0,0 +1,73 @@ +/* -*- C++ -*- + * + * Loader.h + * + * Author: Benjamin T James + * + * Class which can 'preload' chunks of sequences from a file list, + * and then count the k-mers separately, which can be done in + * multiple threads + */ +#ifndef LOADER_H +#define LOADER_H + +#include "Point.h" +#include "SingleFileLoader.h" +#include "ClusterFactory.h" + +template +class Loader { +public: + Loader(std::vector files_, + uint64_t total_num_points_, + uint64_t chunk_size_, + int num_threads_, + int k_, + uint64_t start_id=0) + : + chunk_size(chunk_size_), + num_threads(num_threads_), + k(k_), + files(files_) { + + maker = new SingleFileLoader(files.at(0)); + uint64_t total_id = start_id; + for (int i = 0; i < num_threads_; i++) { + id_list.push_back(total_id); + total_id += total_num_points_; + cache_list.push_back(std::vector >()); + } +// preload(); + }; + + ~Loader() { + cache_list.clear(); + id_list.clear(); + if (maker != NULL) { + delete maker; + } + } + + // single threaded + void preload(int tnum); + + bool done() const; + // multi-thread accessible + std::vector*> load_next(int tid); + + static Point* get_point(std::string header, const std::string &base, uintmax_t& id, int k); +private: + + std::pair next(); + + uint64_t chunk_size; + int num_threads, k; + + std::vector > > cache_list; + std::vector id_list; + + std::vector files; + size_t file_idx = 0; + SingleFileLoader *maker = NULL; +}; +#endif diff --git a/src/cluster/src/LogTable.cpp b/src/cluster/src/LogTable.cpp new file mode 100644 index 0000000..0a05a9d --- /dev/null +++ b/src/cluster/src/LogTable.cpp @@ -0,0 +1,41 @@ +#include "LogTable.h" + +#include +#include + +LogTable::LogTable() : coeff(1000000 / 2) +{ + uintmax_t size = 1000000; + double imax = 2; +// map = new double[size]; + double lsize = log(size); + for (uintmax_t i = 0; i < size; i++) { + map[i] = log(imax * (i + 1)) - lsize; + } + std::cout << "dmax: " << coeff << std::endl; +} 
+LogTable::LogTable(uintmax_t size, double imax) : coeff(size / imax) +{ + //map = new double[size]; + double lsize = log(size); + for (uintmax_t i = 0; i < size; i++) { + map[i] = log(imax * (i + 1)) - lsize; + } + std::cout << "dmax: " << coeff << std::endl; +} + +LogTable::~LogTable() +{ + //delete[] map; +} + +double LogTable::at(double d) const +{ + size_t idx = d * coeff; + return map[idx]; +} +double LogTable::operator[](double d) const +{ + size_t index = d * coeff; + return map[index]; +} diff --git a/src/cluster/src/LogTable.h b/src/cluster/src/LogTable.h new file mode 100644 index 0000000..6fab42e --- /dev/null +++ b/src/cluster/src/LogTable.h @@ -0,0 +1,20 @@ +#ifndef LOGTABLE_H +#define LOGTABLE_H + +#include +#include + +#define TBLSIZE 1000000 +class LogTable { +public: + LogTable(); + LogTable(uintmax_t _size, double imax=2); + ~LogTable(); + double at(double d) const; + double operator[](double d) const; +private: + double map[TBLSIZE]; + + const double coeff; +}; +#endif diff --git a/src/cluster/src/Mat.h b/src/cluster/src/Mat.h new file mode 100644 index 0000000..eb711ed --- /dev/null +++ b/src/cluster/src/Mat.h @@ -0,0 +1,73 @@ +/* -*- C++ -*- + * + * Mat.h + * + * Author: Benjamin T James + */ +#ifndef MAT_H +#define MAT_H +#include +#include +using namespace std; +template +class Mat { +public: + Mat(function func, const long size) : n(size), table_size(size*(size+1)/2), compute(func) { + if (size <= 0) { + throw "Invalid size"; + } + table = new T[table_size]; + set = new bool[table_size](); + }; + ~Mat() { + delete[] table; + delete[] set; + }; + void fill() { + unsigned long long count = 0; + #ifdef OPENMP + #pragma omp parallel for collapse(2) shared(set) + #endif + for (long i = 0; i < n; i++) { + for (long j = 0; j < n; j++) { + const auto idx = addr(i, j); + if (!set[idx]) { + auto res = compute(i, j); + table[idx] = res; + set[idx] = true; + count++; + } + if (count % 10000 == 0) { + cout << count << " / " << table_size << endl; + } + 
} + } + + }; + T& operator[](pair index) { + const unsigned long idx = addr(index.first, index.second); + if (!set[idx]) { + table[idx] = compute(index.first, index.second); + set[idx] = true; + } + return table[idx]; + }; + bool exists(int i, int j) const { + return set[addr(i, j)]; + } +private: + T* table; + bool* set; + const unsigned long table_size; + const unsigned long n; + function compute; + + unsigned long addr(unsigned long i, unsigned long j) const { + if (i <= j) { + return i * n - (i - 1) * i / 2 + j - i; + } else { + return j * n - (j - 1) * j / 2 + i - j; + } + }; +}; +#endif diff --git a/src/cluster/src/Matrix.cpp b/src/cluster/src/Matrix.cpp new file mode 100644 index 0000000..997d1c7 --- /dev/null +++ b/src/cluster/src/Matrix.cpp @@ -0,0 +1,360 @@ +/* + * matrix.cpp + * + * Created on: May 10, 2017 + * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa + */ + +#include "Matrix.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace matrix { + +Matrix::Matrix(int r, int c) : + numRow(r), numCol(c) { + m.resize(r); + for (int i = 0; i < r; i++) { + m.at(i) = vector(c); + } +} +Matrix::Matrix() : + numRow(0), numCol(0) { + +} + +Matrix::~Matrix() { + +} + +Matrix Matrix::operator+(Matrix n) { + if (numCol == n.numCol && numRow == n.numRow) { + Matrix mat = Matrix(numRow, numCol); + for (int i = 0; i < mat.numRow; i++) { + for (int j = 0; j < mat.numCol; j++) { + mat.set(i, j, (get(i, j) + n.get(i, j))); + } + } + return mat; + } else { + cerr << "Invalid input: array dimension mismatch." 
<< endl; + throw exception(); + } +} + +Matrix Matrix::operator-(Matrix n) { + if (numCol == n.numCol && numRow == n.numRow) { + Matrix mat = Matrix(numRow, numCol); + for (int i = 0; i < mat.numRow; i++) { + for (int j = 0; j < mat.numCol; j++) { + mat.set(i, j, (get(i, j) - n.get(i, j))); + } + } + return mat; + } else { + cerr << "Invalid input: array dimension mismatch." << "\n"; + throw exception(); + } +} + +Matrix Matrix::operator*(Matrix n) { + + if (numCol == n.numRow) { + double curSum = 0; + Matrix mat = Matrix(numRow, n.numCol); +////#pragma omp parallel for collapse(2) + for (int i = 0; i < mat.numRow; i++) { + for (int j = 0; j < mat.numCol; j++) { + curSum = 0; + for (int k = 0; k < numCol; k++) { + curSum = curSum + get(i, k) * n.get(k, j); + } + mat.set(i, j, curSum); + } + } + return mat; + } else { + cerr << "Invalid input: array dimension mismatch." << endl; + throw exception(); + } +} + +Matrix Matrix::transpose() { + Matrix temp = Matrix(numCol, numRow); + for (int i = 0; i < numRow; i++) { + for (int j = 0; j < numCol; j++) { + temp.set(j, i, get(i, j)); + } + } + return temp; + +} + +Matrix Matrix::gaussJordanInverse() { + if (numRow == numCol) { //Checks if matrix is square + Matrix invert = Matrix(numRow, numCol); + Matrix temp = Matrix(numRow, numCol); + double pivotVal; + + temp.m = m; + + for (int i = 0; i < numRow; i++) {//Creates identity Matrix, which will become inverse matrix + invert.set(i, i, 1); + } + + for (int i = 0; i < numRow; i++) { + if (get(i, i) != 1) { //Checks if the pivot point is 1 + if (get(i, i) != 0) {//Check if the pivot point is 0, if not it performs a type 2 row operation to set the pivot point to 1 + pivotVal = get(i, i); + for (int j = 0; j < numCol; j++) { + set(i, j, (get(i, j) / pivotVal)); + invert.set(i, j, (invert.get(i, j) / pivotVal)); + } + } else {//If the pivot point is zero, it performs a type 1 row operation + bool properSwap = false; + int row = i + 1; + double valSwap; + double valSwap2; + 
while (!properSwap && row < numRow) { + if (get(row, i) != 0) { + properSwap = true; + } else { + row++; + } + } + if (properSwap) { + for (int j = 0; j < numCol; j++) { + valSwap = get(i, j); + valSwap2 = invert.get(i, j); + set(i, j, get(row, j)); + invert.set(i, j, (invert.get(row, j))); + set(row, j, valSwap); + invert.set(row, j, valSwap2); + } + } else {//If it cannot perform a type 1 row swap with a non zero pivot value, the Inverse does not exist. + cout << "Inverse does not exist\n"; + throw 0; + m = temp.m; + return temp; + } + pivotVal = get(i, i); + for (int j = 0; j < numCol; j++) {//Now perform a type 2 row operation to set the new pivot point to 1 + set(i, j, (get(i, j) / pivotVal)); + invert.set(i, j, (invert.get(i, j) / pivotVal)); + } + } + } + for (int below = i + 1; below < numRow; below++) { //Iterate through the elements below the pivot, performing type 3 row operations to set each to 0 + if (get(below, i) != 0) { + pivotVal = get(below, i); + for (int j = 0; j < numCol; j++) { + set(below, j, (get(below, j) - (pivotVal * get(i, j)))); + invert.set(below, j, + (invert.get(below, j) + - (pivotVal * invert.get(i, j)))); + } + } + } + } + // cout << "\n\n"; + for (int i = numRow - 1; i >= 0; i--) { //Now perform the same step as the last except on the elements above the pivot. + for (int above = 0; above < i; above++) { + if (get(above, i) != 0) { + pivotVal = get(above, i); + for (int j = 0; j < numCol; j++) { + set(above, j, (get(above, j) - (pivotVal * get(i, j)))); + invert.set(above, j, + (invert.get(above, j) + - (pivotVal * invert.get(i, j)))); + } + } + } + } + for (int i = 0; i < numRow; i++) {//Now check to make sure the original matrix is an identity matrix. 
+ for (int j = 0; j < numCol; j++) { + if (i == j && get(i, j) != 1) { + cout << "Inverse does not exist\n"; + throw 0; + m = temp.m; + return temp; + } + if (i != j && get(i, j) != 0) { + cout << "Inverse does not exist\n"; + throw 0; + m = temp.m; + return temp; + } + } + } + m = temp.m; //Reset the original matrix + return invert; + } + cerr << "Invalid dimensions" << endl; + throw exception(); +} + +Matrix Matrix::pseudoInverse() { + if (numRow >= numCol) { + Matrix temp = transpose(); + Matrix transByOrig = temp * *this; + Matrix psuedoInv = (transByOrig.gaussJordanInverse()) * temp; + return psuedoInv; + } else { + Matrix temp = transpose(); + Matrix origByTrans = *this * temp; + Matrix psuedoInv = temp * (origByTrans.gaussJordanInverse()); + return psuedoInv; + } +} + +double Matrix::get(int r, int c) const { + return m.at(r).at(c); +} + +void Matrix::set(int r, int c, double val) { + m.at(r).at(c) = val; + //m[r][c] = val; +} + +void Matrix::print() { + for (int i = 0; i < numRow; i++) { + for (int j = 0; j < numCol; j++) { + cout << right << fixed; + cout << "[" << setprecision(4) << setw(7) << get(i, j) << "] "; + } + cout << endl; + } + cout << endl; +} + +void Matrix::printToFile(string fileName) { + ofstream outSequence(fileName.c_str()); + + for (int i = 0; i < numRow; i++) { + for (int j = 0; j < numCol; j++) { + outSequence << right << fixed; + outSequence << "[" << setprecision(4) << setw(7) << get(i, j) + << "] "; + } + outSequence << endl; + } + outSequence << endl; + + outSequence.close(); +} + +void Matrix::randFill(double low, double high) { + double x; + for (int i = 0; i < numRow; i++) { + for (int j = 0; j < numCol; j++) { + x = ((double) rand() * (high - low)) / (double) RAND_MAX + low; + set(i, j, x); + } + } +} + +void Matrix::userFill() { + double val; + for (int i = 0; i < numRow; i++) { + for (int j = 0; j < numCol; j++) { + cout << "input value for cell (" << i << ", " << j << ")?\n"; + cin >> val; + cout << endl; + set(i, j, val); + 
} + } +} + +void Matrix::fileFill(string filename) { + ifstream infile(filename.c_str()); + if (!infile) { + cerr << "file read fail" << endl; + throw exception(); + } + string line; + int i = -1; + while (getline(infile, line)) { + i++; + if (i >= numRow) { + addRow(0); + } + double num; + istringstream iss(line); + int j = -1; + while (iss >> num) { + j++; + if (j >= numCol) { + addCol(0); + } + //cout << num << endl; + set(i, j, num); + } + j = 0; + } + i = 0; +} + +void Matrix::addRow(double val) { + numRow++; + vector temp = vector(numCol, val); + m.push_back(temp); +} + +void Matrix::addCol(double val) { + numCol++; + for (int i = 0; i < numRow; i++) { + m.at(i).push_back(val); + } +} + +void Matrix::normalize(double a, double b) { + for (int j = 0; j < numCol; j++) { + int min = get(0, j); + int max = min; + for (int i = 1; i < numRow; i++) { + if (get(i, j) < min) { + min = get(i, j); + } else if (get(i, j) > max) { + max = get(i, j); + } + } + for (int i = 0; i < numRow; i++) { + set(i, j, (b - a) * ((get(i, j) - min) / (max - min)) + a); + } + } +} + +void Matrix::rowToVector(int row, vector& v) { + if (row >= numRow || row < 0) { + cerr << "Invalid Row (rowToVector)" << endl; + throw exception(); + } else { + v = m.at(row); + } +} + +void Matrix::colToVector(int col, vector& v) { + if (col >= numCol || col < 0) { + cerr << "Invalid Column (colToVector)" << endl; + throw exception(); + } else { + for (int j = 0; j < numRow; j++) { + v.push_back(m.at(j).at(col)); + } + } +} + +int Matrix::getNumRow() const { + return numRow; +} + +} diff --git a/src/cluster/src/Matrix.h b/src/cluster/src/Matrix.h new file mode 100644 index 0000000..46a73a6 --- /dev/null +++ b/src/cluster/src/Matrix.h @@ -0,0 +1,52 @@ +/* + * matrix.h + * + * Created on: May 10, 2017 + * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa + */ + + +#ifndef MATRIX_H_ +#define MATRIX_H_ + +#include +#include + +namespace matrix { + +class Matrix +{ 
+private: + std::vector > m; + int numRow; + int numCol; + + +public: + + Matrix(int r, int c); + Matrix(); + ~Matrix(); + Matrix operator+(Matrix n); + Matrix operator-(Matrix n); + Matrix operator*(Matrix n); + Matrix transpose(); + Matrix gaussJordanInverse(); + Matrix pseudoInverse(); + void userFill(); + double determinant(); + double get(int r, int c) const; + void set(int r, int c, double val); + void addRow(double); + void addCol(double); + void print(); + void printToFile(std::string); + void randFill(double low, double high); + void fileFill(std::string filename); + void normalize(double a, double b); + void rowToVector(int, std::vector&); + void colToVector(int, std::vector&); + int getNumRow() const; +}; +} +#endif /* MATRIX_H_ */ diff --git a/src/cluster/src/MultiMute.cpp b/src/cluster/src/MultiMute.cpp new file mode 100644 index 0000000..73ee242 --- /dev/null +++ b/src/cluster/src/MultiMute.cpp @@ -0,0 +1,455 @@ +/** + * Author: Alex Baumgartner + * The Bioinformatics Toolsmith Laboratory, the University of Tulsa + * 5/15/2018 + * + * Purpose: + * The pupose of this module is to perform non single mutations on sequences + */ + +#include "MultiMute.h" +#include +#include "Random.h" + +MultiMute::MultiMute(int a, int c, int g, int t, int alloc, bool enableTrans, bool enableRev) +{ + percAs = a; + percCs = c; + percGs = g; + percTs = t; + //Set all sub allocations to 0 if the total allocation is 0 + maxTrans = 0; + maxInsert = 0; + maxReverse = 0; + maxDup = 0; + maxDel = 0; + total_alloc = alloc; +// max_block_size = std::max(2, std::min(50, alloc / 10)); // Max mutation block size +// cout << "max block size: " << max_block_size << endl; + if (alloc == 0) { + return; + } + //Arbitrary, if only 1 percent is allocated overall, it is allocated to Insert + else if (alloc == 1) { + maxTrans = 0; + maxReverse = 0; + maxInsert = 1; + maxDup = 0; + maxDel = 0; + } else if (enableTrans) { + if (alloc > 1) { + maxTrans = rand() % alloc; + alloc -= maxTrans; + } 
+ } else if (enableRev) { + if (alloc > 1) { + maxReverse = rand() % alloc; + alloc -= maxReverse; + } + } + + if (alloc > 1) { + maxDel = (rand() % alloc); + alloc -= maxDel; + } + if (alloc > 0) { + maxDup = rand() % alloc; + alloc -= maxDup; + } else { + maxDup = 0; + if (alloc == 1) { + alloc--; + maxDel++; + } + } + maxInsert = alloc; + +// cout << "Max Rev " << maxReverse << " maxDel " << maxDel << " maxTrans " << maxTrans << " maxInsert " << maxInsert << " maxDup " << maxDup << endl; +} + +int MultiMute::getAlignmentLength() { + return alignmentLength; +} + +int MultiMute::getIBP() { + return IBP; +} + +std::string abbreviated_vec(const vector& v) +{ + std::ostringstream oss; + char last = ' '; + int block_num = 1; + for (auto s : v) { + for (auto c : s) { + if (c == last) { + block_num++; + } else if (last != ' ') { + oss << block_num << last; + block_num = 1; + last = c; + } else { + last = c; + } + } + } + return oss.str(); +} + +int get_num_blocks(const vector& v) +{ + cout << "Abbrev: " << abbreviated_vec(v) << endl; + char last = '\0'; + int n_blocks = 0; + for (auto s : v) { + for (char c : s) { + if (c == 'S') { + last = '\0'; + } else if (c != last) { + last = c; + n_blocks++; + } + } + } + return n_blocks; +} +vector MultiMute::genMulti(string * sequence) +{ + seq = sequence; +// + double mut_bp = total_alloc * seq->length() / 100.0; + max_block_size = std::max((int64_t)2, std::min((int64_t)50, (int64_t)round(mut_bp / 10))); // Max mutation block size + //Calculate the number of nucleotides allocated to each type of mutation + maxNonMutations = (int64_t) ((float) ((100 - maxReverse - maxTrans - maxInsert - maxDup - maxDel) / 100.0) * seq->length()); + maxReverse = (int64_t) ((float) (maxReverse / 100.0) * seq->length()); + maxTrans = (int64_t) ((float) (maxTrans / 100.0) * seq->length()); + maxInsert = (int64_t) ((float) (maxInsert / 100.0) * seq->length()); + maxDel = (int64_t) ((float) (maxDel / 100.0) * seq->length()); + maxDup = (int64_t) 
((float) (maxDup / 100.0) * seq->length()); + +// cout << "maxNonMutations: " << maxNonMutations << " maxReverse: " << maxReverse << " maxTrans: " << maxTrans << " maxInsert: " << maxInsert << " maxDel: " << maxDel << " maxDup: " << maxDup << endl; + //calculate alignment length and identical base pairs + alignmentLength = maxInsert + maxDup; + IBP = maxDel; + //Initialize and size vectors + int total = maxNonMutations + (2 * maxReverse) + maxTrans + maxInsert + maxTrans + maxDel + maxDup; + insertions = new vector(); + insertions->reserve(maxTrans + maxInsert); + mutationStrings = new vector(); + mutationStrings->reserve(total); + //Push 'S', which means that that is an index that wont be mutated, onto the vector + for (int i = 0; i < maxNonMutations; i++) { + mutationStrings->push_back("S"); + } + + reverse(mutationStrings); +//#pragma omp critical + { + insert(mutationStrings); + translocate(mutationStrings); + duplicate(mutationStrings); + deleteNucl(mutationStrings); + } + //Make sure no palindromes exist + checkForAllPalindromes(mutationStrings); + + //Generate a char vector from the now shuffled mutations vector + auto mutationChars = genCharVector(mutationStrings); + getTranslocations(mutationChars); + //Performs all mutations on the sequence + auto ret = formatString(seq->length() + maxTrans + maxInsert + maxDup, mutationChars); + delete mutationStrings; + delete mutationChars; + delete insertions; + return ret; +} + +void MultiMute::reverse(vector * toAddTo) +{ + //Keep forming strings until the allocation of reverse is used up + int size; + //cout << "maxReverse: " << maxReverse << endl; + while (maxReverse > 0) { + //Automatically make it 2 to avoid modulus error + if (maxReverse == 2) { + size = 2; + } else { + size = (rand() % (maxReverse - 2)) + 2; + //Add 1 to size if the remaining reverse allocation would be 1 + if (maxReverse - size == 1) { + size++; + } + } + //cout << "Reverse size: " << size << endl; + //Add a string of the randomized size to 
the vector + string toAdd(size, 'R'); + toAddTo->push_back(toAdd); + maxReverse -= size; + } +} + +void MultiMute::translocate(vector * toAddTo) +{ + int size; + //Keep forming strings until the allocation of Translocate is used up + while (maxTrans > 0) { + //Automatically make it 2 to avoid modulus error + if (maxTrans == 2) { + size = 2; + } else { + size = rng.randMod(std::min(max_block_size, maxTrans - 2)) + 2; + //Add 1 to size if the remaining reverse allocation would be 1 + if (maxTrans - size == 1) { + size++; + } + } + //Add a string of the randomized size to the vector, and an I for where to translocate to + //cout << "maxTrans=" << maxTrans << " Translocate: " << size << endl; + string toAdd(size, 'T'); + toAddTo->push_back(toAdd); + toAddTo->push_back("I"); + maxTrans -= size; + } +} + +void MultiMute::insert(vector * toAddTo) +{ + int size; + //Keep forming strings until the allocation of insert is used up + while (maxInsert > 0) { + //Automatically make it 2 to avoid modulus error + if (maxInsert == 2) { + size = 2; + } else { +// size = (rand() % (maxInsert - 2)) + 2; + size = rng.randMod(std::min(max_block_size, maxInsert - 2)) + 2; + //Add 1 to size if the remaining reverse allocation would be 1 + if (maxInsert - size == 1) { + size++; + } + } + // cout << "maxInsert=" << maxInsert << " insert " << size << endl; + //Add an I for where to insert, and add a generated string to the insetions vector + toAddTo->push_back("I"); + insertions->push_back(genInsert(size)); + maxInsert -= size; + } +} + +void MultiMute::deleteNucl(vector * toAddTo) +{ + int size; + //Keep forming strings until the allocation of deletion is used up + while (maxDel > 0) { + //Automatically make it 2 to avoid modulus error + if (maxDel == 2) { + size = 2; + } else { + size = rng.randMod(std::min(max_block_size, maxDel - 2)) + 2; + //size = (rand() % (maxDel - 2)) + 2; + //Add 1 to size if the remaining reverse allocation would be 1 + if (maxDel - size == 1) { + size++; + } + } 
+ //Add a string of X's to show what nucleotides will be deleted +// cout << "maxDelete=" << maxDel << " delete " << size << endl; + string toAdd(size, 'X'); + toAddTo->push_back(toAdd); + maxDel -= size; + } +} + +void MultiMute::duplicate(vector * toAddTo) +{ + int size; + //Keep forming strings until the allocation of duplicate is used up + while (maxDup > 0) { + //Automatically make it 2 to avoid modulus error + if (maxDup == 2) { + size = 2; + } else { + size = rng.randMod(std::min(max_block_size, maxDup - 2)) + 2; +// size = (rand() % (maxDup - 2)) + 2; + //Add 1 to size if the remaining reverse allocation would be 1 + if (maxDup - size == 1) { + size++; + } + } +// cout << "maxDup=" << maxDup << " duplicate " << size << endl; + //Add a string of D's for duplicate to the vector + string toAdd(size, 'D'); + toAddTo->push_back(toAdd); + maxDup -= size; + } +} + +bool MultiMute::checkPalindrome(int start, int end) +{ + bool equal = false; + for (; start < end; start++, end--) { + if (seq->at(start) != seq->at(end)) { + equal = true; + } + } + return equal; +} + +string MultiMute::genInsert(int size) +{ + string toInsert; + toInsert.reserve(size); + int value; + //Keep adding characters based on the original distribution of nucleotides + for (int i = 0; i < size; i++) { + value = rand() % (percAs + percCs + percGs + percTs); + if (value < percAs) { + toInsert.push_back('A'); + } else if (value < percAs + percCs) { + toInsert.push_back('C'); + } else if (value < percAs + percCs + percGs) { + toInsert.push_back('G'); + } else { + toInsert.push_back('T'); + } + } + return toInsert; +} + +vector MultiMute::formatString(int maxSize, vector * mutationsChars) +{ + string temp; + temp.reserve(maxSize); + //vector that stores what indexes have/have not been mutated + vector validCharacters; + validCharacters.reserve(mutationsChars->size() * 2); + unsigned seed = 0; + // Use of shuffle to randomize the order + shuffle(insertions->begin(), insertions->end(), 
default_random_engine(seed)); + int j = 0; + int i = 0; + //Goes through until the end of the sequence or the end of the chars vector is reached (should always be seq first) + for (; i < seq->length() && j < mutationsChars->size();) { + //If it is a non-mutation character, simply add the current character, increment both positions + if (mutationsChars->at(j) == 'S') { + temp.push_back(seq->at(i)); + i++; + j++; + validCharacters.push_back(true); + } + //If it is an I, get the next insertion string and append it to the back of the mutaton string, as long as the insertion vector still has stuffing + else if (mutationsChars->at(j) == 'I') { + if (insertions->size() > 0) { + temp.append(insertions->back()); + insertions->pop_back(); + } + //Increment only the char vector + j++; + } + //For duplications, it will add each charceter, and then read a string of the added characters in the same order + else if (mutationsChars->at(j) == 'D') { + string temp2; + temp2.reserve(seq->length() - i); + for (; j < mutationsChars->size() && mutationsChars->at(j) == 'D' && i < seq->length(); j++, i++) { + temp2.push_back(seq->at(i)); + temp.push_back(seq->at(i)); + validCharacters.push_back(false); + validCharacters.push_back(false); + } + //I and J are not incremented because they are incremented in the loop + temp.append(temp2); + } + //Otherwise, skip over the nuleotide + else { + i++; + j++; + } + } + //Add any extra insertions of there are any + if (insertions->size() > 0) { + for (int k = 0; k < insertions->size(); k++) { + temp.append(insertions->at(k)); + } + } + //Reassign the string pointer + seq->erase(); + seq->reserve(temp.length()); + seq->append(temp); + return validCharacters; +} + + +void MultiMute::getTranslocations(vector * toParseFrom) { + for (int i = 0, j = 0; i < seq->length() && j < toParseFrom->size();) { + //If a T is found, the string of nucleotides with corresponding T's is copied and added to the insertion vector + if (toParseFrom->at(j) == 'T') { + string 
temp; + temp.reserve(seq->length() - i); + for (;j < toParseFrom->size() && toParseFrom->at(j) == 'T' && i < seq->length(); i++, j++) { + temp.push_back(seq->at(i)); + } + insertions->push_back(temp); + } + //Skip over the I's + else if (toParseFrom->at(j) == 'I') { + j++; + } + //Otherwise, increment both + else { + j++; + i++; + } + } +} + +vector * MultiMute::genCharVector(vector * toParseFrom) { + vector * charVector = new vector(); + charVector->reserve(seq->length()); + string temp; + //For every index + for (int i = 0; i < toParseFrom->size(); i++) { + temp = toParseFrom->at(i); + //Add each character in the string at the index, add it to the new character vector + for (int j = 0; j < temp.length(); j++) { + charVector->push_back(temp.at(j)); + } + } + return charVector; +} + +void MultiMute::checkForAllPalindromes(vector * toParseFrom) { + int insertionChanges = 0; + for (int i = 0, j = 0; i < seq->length() && j < toParseFrom->size();) { + //If it is not a reversal + if (toParseFrom->at(j).at(0) != 'R') { + //If it is an insertion character, only increment the vector integer + if (toParseFrom->at(j).at(0) == 'I') { + j++; + } + //Otherwise, increment the string iterator by the length of the current string in the vector, + //then increment the vector integer + else { + i += toParseFrom->at(j).length(); + j++; + + } + } else { + //If it is not a palindrome, incremtn as in the if statement + if (checkPalindrome(i, i + toParseFrom->at(j).length() - 1)) { + i += toParseFrom->at(j).length(); + j++; + } + //Otherwise, replace the reverse with a transversal + else { + string temp(toParseFrom->at(j).length(), 'T'); + toParseFrom->at(j) = temp; + insertionChanges++; + } + } + } + //Insert enough I's randomly for the amount of transversals that replaced reversals + for (int i = 0; i < insertionChanges; i++) { + int index = rand() % toParseFrom->size(); + toParseFrom->insert(toParseFrom->begin() + index, "I"); + } +} diff --git a/src/cluster/src/MultiMute.h 
b/src/cluster/src/MultiMute.h new file mode 100644 index 0000000..8d27d6e --- /dev/null +++ b/src/cluster/src/MultiMute.h @@ -0,0 +1,142 @@ +/** + * Author: Alex Baumgartner + * The Bioinformatics Toolsmith Laboratory, the University of Tulsa + * 5/15/2018 + * + * Purpose: + * The pupose of this module is to perform non single mutations on sequences + */ + +#ifndef MULTIMUTE_H +#define MULTIMUTE_H + +#include +#include +#include +#include +#include +#include "Random.h" + +using namespace std; + +class MultiMute { +public: + /* + Constructor, creates values + and assignes allocations based on inputted data + + @param: + int: percentage of A's + int: percentage of C's + int: percentage of G's + int: percentage of T's + int: The total allocation for non-single mutations + int: bool to exclude Translocate and reverse, 1 for disable, any other umber for include + */ + MultiMute(int, int, int, int, int, bool, bool); + /* + Takes in a string pointer, + and mutates it based on the allocation given to the constructor. 
+ Returns a vector of all valid and invalid indexes + + @param: + std::string *: pointer to the string to be mutated + + @return: + std::vector: vector of mutations, + false means that index has been mutated + */ + std::vector genMulti(std::string *); + int getAlignmentLength(); + int getIBP(); + + private: + int percAs; + int percCs; + int percGs; + int percTs; + int64_t maxReverse; + int64_t maxInsert; + int64_t maxTrans; + int64_t maxDel; + int64_t maxDup; + int64_t maxNonMutations; + int64_t alignmentLength; + int64_t IBP; + int64_t total_alloc; + Random rng; + + int64_t max_block_size; + std::vector * insertions; + std::vector * mutationStrings; + std::string * seq; + /* + Takes in a vector + */ + void reverse(vector *); + /* + Translocates a random, nonmutaded part of the sequence, + no larger than its max allocation + */ + void translocate(vector *); + /* + Inserts at random, nonmutaded part of the sequence, + no larger than its max allocation + */ + void insert(vector *); + /* + Deletes a random, nonmutaded part of the sequence, + no larger than its max allocation + */ + void deleteNucl(vector *); + /* + Duplicates a random, nonmutaded part of the sequence, + no larger than its max allocation + to an index directly after the duplicated string + */ + void duplicate(vector *); + /* + Checks inclusively, [first, last], if a portion is valid + + @param: + int: The starting index (first) + int: The ending index (last) + + @return: + bool: true if all indexes in range are valid + */ + bool checkPalindrome(int, int); + void checkForAllPalindromes(vector *); + /* + Marks all indexes in the range as invalid + + @param: + int: first index to be marked false + int: last index tobe marked false + */ + vector formatString(int, vector *); + + /* + Generates a randomized string based on the inputed size + @param: + int: size of string to generate + @return + string: randomized string + */ + std::string genInsert(int); + /* + Adds all translocations to the insertions array 
+ @param: + vector *: pointer to a char vector with mutation characters + */ + void getTranslocations(vector *); + /* + converts a vector of strings into a vector of chars + @param: + vector *: the vector to be converted + @return: + vector *: the vector of characters + */ + vector * genCharVector(vector *); +}; +#endif diff --git a/src/cluster/src/NearestNeighbor.h b/src/cluster/src/NearestNeighbor.h new file mode 100644 index 0000000..a59b87b --- /dev/null +++ b/src/cluster/src/NearestNeighbor.h @@ -0,0 +1,52 @@ +/* -*- C++ -*- + * + * NearestNeighbor.h + * + * Author: Benjamin T James + */ +#ifndef NEARESTNEIGHBOR_H +#define NEARESTNEIGHBOR_H +// #include +// #include "Point.h" +// template +// class NearestNeighbor { +// public: +// NearestNeighbor(const vector*> &pts) : points(pts) { +// const int dim = pts[0]->get_data().size(); +// const int maxPts = pts.size(); +// dataPts = annAllocPts(maxPts, dim); +// queryPt = annAllocPt(dim); +// for (int nPts = 0; nPts < maxPts; nPts++) { +// auto vec = pts[nPts]->get_data(); +// for (int i = 0; i < vec.size(); i++) { +// dataPts[nPts][i] = vec[i]; +// } +// } +// kd_tree = new ANNkd_tree(dataPts, maxPts, dim); +// nnIdx = new ANNidx[1]; +// dists = new ANNdist[1]; +// }; +// ~NearestNeighbor() { +// delete[] nnIdx; +// delete[] dists; +// delete kd_tree; +// annClose(); +// }; +// void find_nearest_neighbor(Point ¢er) const { +// auto vec = center.get_data(); +// for (int i = 0; i < vec.size(); i++) { +// queryPt[i] = vec[i]; +// } +// kd_tree->annkSearch(queryPt, 1, nnIdx, dists); +// ANNidx idx = nnIdx[0]; +// center.set(*points[idx]); +// }; +// private: +// ANNkd_tree *kd_tree = NULL; +// ANNpointArray dataPts; +// ANNpoint queryPt; +// ANNidxArray nnIdx; +// ANNdistArray dists; +// const vector*> &points; +// }; +#endif diff --git a/src/cluster/src/Point.h b/src/cluster/src/Point.h new file mode 100644 index 0000000..a70bc20 --- /dev/null +++ b/src/cluster/src/Point.h @@ -0,0 +1,83 @@ +/* -*- C++ -*- + * + * 
Point.h + * + * Author: Benjamin T James + * + * For some reason this class was made pure virtual + * in early development of MeShClust, making Histogram + * and DivergencePoint both derivatives that essentially + * did the same thing + */ +#ifndef POINT_H +#define POINT_H + +#include +#include "../../nonltr/ChromosomeOneDigit.h" + +/* + * Pure virtual class that defines behavior for + * points. Has clone() and create() that allow for + * polymorphic behavior + */ +template +class Point { +public: + virtual ~Point() { data.clear(); }; + virtual void operator*=(double d) = 0; + virtual void operator/=(double d) = 0; + virtual bool operator<(Point& p) const = 0; + virtual uint64_t operator-(const Point& p) const = 0; + virtual void operator+=(Point& p) = 0; + virtual void set(Point& p) = 0; + virtual void display() const = 0; + virtual uint64_t distance(const Point& p) const = 0; + virtual double distance_d(Point& p) const = 0; + virtual Point* clone() const = 0; + virtual Point* create() const = 0; + + virtual void zero() = 0; + virtual void addOne() = 0; + virtual double distance_k1(const Point& p) const = 0; + virtual double prob_under(Point& center) const = 0; + virtual void subOne() = 0; + virtual uint64_t getRealMagnitude() const = 0; +// virtual T magnitude() const = 0; + virtual bool is_to_delete() const = 0; + virtual void set_to_delete(bool b) = 0; + + virtual Point* create_double() const = 0; + virtual void set_arg_to_this_d(Point& p) const = 0; + + virtual const vector& get_data() const = 0; + + void set_header(const std::string c) { header = c; }; + const std::string get_header() const { return header; }; + + void set_data_str(const std::string& c) { data = c; }; + const std::string & get_data_str() const { return data; }; + + void set_1mers(const vector &vec) { + for (auto i = 0; i < 4; i++) { + one_mers[i] = vec[i]; + } + } + vector get_1mers() const { + vector vec; + for (auto i = 0; i < 4; i++) { + vec.push_back(one_mers[i]); + } + return vec; + } + 
virtual unsigned long size() const = 0; + virtual void set_id(uintmax_t c_id) = 0;//{ id = c_id; }; + virtual const uintmax_t get_id() const = 0;//{ return id; }; + virtual void set_length(unsigned long len) = 0; + virtual unsigned long get_length() const = 0; +private: + uint64_t one_mers[4]; + std::string header; + std::string data; +}; + +#endif diff --git a/src/cluster/src/Predictor.cpp b/src/cluster/src/Predictor.cpp new file mode 100644 index 0000000..8ff4755 --- /dev/null +++ b/src/cluster/src/Predictor.cpp @@ -0,0 +1,837 @@ +/* -*- C++ -*- + * + * Predictor.cpp + * + * Author: Benjamin T James + * + * Predictor implementation class + * train(vector<>...) is entry point, generates "semi-synthetic" sequences + * train() actually trains applicable GLM's. + * close() and similarity() are callable once trained + */ +#include "Predictor.h" +#include "Loader.h" +#include "Matrix.h" +#include "ClusterFactory.h" +#include "HandleSeq.h" +#include "Progress.h" +#include "Random.h" +#include + +template +void Predictor::save(std::string file) +{ + std::ofstream out(file); + out << "k: " << k << endl; + out << "mode: " << (unsigned int)mode << endl; + out << "max_features: " << max_num_feat << endl; + out << "ID: " << id << endl; + if (mode & PRED_MODE_CLASS) { + write_to(out, feat_c, c_glm); + } + if (mode & PRED_MODE_REGR) { + write_to(out, feat_r, r_glm); + } +} + +template +Predictor::Predictor(const std::string filename) +{ + std::ifstream in(filename); + std::string buf; + unsigned mode_ = 0; + in >> buf >> k; + cout << buf << k << endl; + in >> buf >> mode_; + mode = mode_; + cout << buf << mode << endl; + in >> buf >> max_num_feat; + cout << buf << max_num_feat << endl; + in >> buf >> id; + cout << buf << id << endl; + is_trained = true; + is_training = false; + if (mode & PRED_MODE_CLASS) { + auto pr = read_from(in, k); + c_glm = pr.first; + feat_c = pr.second; + } + if (mode & PRED_MODE_REGR) { + auto pr = read_from(in, k); + r_glm = pr.first; + feat_r = 
pr.second; + } +} + +template +void Predictor::write_to(std::ofstream &out, Feature* feat, matrix::GLM glm) +{ + auto combos = feat->get_combos(); + auto lookup = feat->get_lookup(); + auto mins = feat->get_mins(); + auto maxs = feat->get_maxs(); + out << std::endl << "n_combos: " << combos.size() << std::endl; + out << glm.get_weights().get(0, 0) << endl; + for (int j = 0; j < combos.size(); j++) { + auto cmb = combos[j]; + unsigned int val = 0; + uint64_t flags = 0; + for (auto i : cmb.second) { + flags |= lookup[i]; + } + switch (cmb.first) { + case Combo::xy: + val = 0; + break; + case Combo::xy2: + val = 1; + break; + case Combo::x2y: + val = 2; + break; + case Combo::x2y2: + val = 3; + break; + } + out << val << " "; + out << flags << " "; + out << glm.get_weights().get(j+1, 0) << std::endl; + } + out << std::endl << "n_singles: " << lookup.size() << std::endl; + for (int j = 0; j < lookup.size(); j++) { + out << lookup[j] << " "; + out << mins[j] << " "; + out << maxs[j] << std::endl; + } +} + + +template +pair*> Predictor::read_from(std::ifstream& in, int k_) +{ + matrix::GLM glm; + int c_num_raw_feat, c_num_combos; + Feature *feat = new Feature(k_); + std::string buf; + in >> buf >> c_num_combos; + cout << buf << "\"" << c_num_combos << "\"" << endl; + matrix::Matrix weights(c_num_combos+1, 1); + double d_; + in >> d_; + weights.set(0, 0, d_); + for (int i = 0; i < c_num_combos; i++) { + int cmb; + in >> cmb; + cout << (int)cmb << endl; + uint64_t flags; + in >> flags; + cout << flags << endl; + double d; + in >> d; + cout << "[" << 0 << "," << i << "] " << d << endl; + weights.set(i+1, 0, d);//push_back(d); + Combo cmb_ = Combo::xy; + switch (cmb) { + case 0: + cmb_ = Combo::xy; + break; + case 1: + cmb_ = Combo::xy2; + break; + case 2: + cmb_ = Combo::x2y; + break; + case 3: + cmb_ = Combo::x2y2; + break; + default: + cerr << "error reading weights file" << endl; + break; + } + feat->add_feature(flags, cmb_); + } + + in >> buf >> c_num_raw_feat; + cout 
<< buf << "\"" << c_num_raw_feat << "\"" << endl; + for (int i = 0; i < c_num_raw_feat; i++) { + uint64_t single_flag; + double min_, max_; + in >> single_flag; + cout << single_flag << endl; + in >> min_; + cout << min_ << endl; + in >> max_; + cout << max_ << endl; + feat->set_normal(single_flag, min_, max_); + } + feat->finalize(); + glm.load(weights); + return {glm, feat}; +} + +template +void Predictor::add_feats(std::vector >& vec, uint64_t feat_flags) +{ + for (uint64_t i = 1; i <= feat_flags; i *= 2) { + if ((i & feat_flags) == 0) { + continue; + } + for (uint64_t j = 1; j <= i; j *= 2) { + if ((j & feat_flags) == 0) { + continue; + } + vec.emplace_back(i | j, Combo::xy); + vec.emplace_back(i | j, Combo::x2y2); + if (i != j) { + vec.emplace_back(i | j, Combo::x2y); + vec.emplace_back(i | j, Combo::xy2); + } + } + } +} +template +void Predictor::check() +{ + // if (!is_trained && training.size() >= threshold && !is_training) { + // omp_set_lock(&lock); + // is_training = true; + // train(); + // is_training = false; + // omp_unset_lock(&lock); + // } +} +template +double Predictor::similarity(Point* a, Point* b) +{ + if (!is_trained) { +// double d = Selector::align(a, b); + cerr << "alignment: we don't do that here" << endl; + throw "Bad"; + // return d; + // if (!is_training) { + // omp_set_lock(&lock); + // if (training.size() < testing.size() && training.size() < threshold) { + // training.push_back(pra(a, b, d)); + // } else if (training.size() >= testing.size() && testing.size() < threshold) { + // testing.push_back(pra(a, b, d)); + // } + // omp_unset_lock(&lock); + // } + return 0; + + } else { + return predict(a, b); + } +} + +template +bool Predictor::close(Point *a, Point *b) +{ + if (!is_trained) { +// double d = Selector::align(a, b); + cerr << "alignment shouldn't be used here" << endl; + throw "bad"; + // if (!is_training) { + // omp_set_lock(&lock); + // if (training.size() < testing.size() && training.size() < threshold) { + // 
training.push_back(pra(a, b, d)); + // } else if (training.size() >= testing.size() && testing.size() < threshold) { + // testing.push_back(pra(a, b, d)); + // } + // omp_unset_lock(&lock); + // } +// return d > id; + return false; + } + bool val = p_close(a, b); + if ((mode & PRED_MODE_REGR) && val) { + // val = p_predict(a, b) > id; + // if (!val) { + // cout << "FIXED" << endl; + // } + } + return val; +} + +template +double Predictor::p_predict(Point* a, Point* b) +{ + auto cache = feat_r->compute(*a, *b); + auto weights = r_glm.get_weights(); + double sum = weights.get(0, 0); + for (int col = 0; col < feat_r->size(); col++) { + double val = (*feat_r)(col, cache); + sum += weights.get(col+1, 0) * val; + } + if (sum < 0) { + sum = 0; + } else if (sum > 1) { + sum = 1; + } + return sum; +} +template +double Predictor::predict(Point* a, Point* b) +{ + if ((mode & PRED_MODE_CLASS) && !p_close(a, b)) { + return 0; + } + return p_predict(a, b); +} + +template +bool Predictor::p_close(Point* a, Point* b) +{ + auto weights = c_glm.get_weights(); + double sum = weights.get(0, 0); + auto cache = feat_c->compute(*a, *b); + for (int col = 1; col < weights.getNumRow(); col++) { + double d = (*feat_c)(col-1, cache); + sum += weights.get(col, 0) * d; + } + return sum > 0.0; +} + + +template +std::pair generate_feat_mat(const vector > &data, Feature& feat, double cutoff) +{ + bool classify = (cutoff >= 0); + int nrows = data.size(); + int ncols = feat.size()+1; + matrix::Matrix feat_mat(nrows, ncols); + matrix::Matrix labels(nrows, 1); + #pragma omp parallel for + for (int row = 0; row < data.size(); row++) { + auto kv = data.at(row); + vector cache; + // #pragma omp critical + // { + cache = feat.compute(*kv.first, *kv.second); + // } + feat_mat.set(row, 0, 1); + if (classify) { + labels.set(row, 0, kv.val >= cutoff ? 
1 : -1); + } else { + labels.set(row, 0, kv.val); + } + for (int col = 1; col < ncols; col++) { + double val = feat(col-1, cache); + feat_mat.set(row, col, val); + } + } + return std::make_pair(feat_mat, labels); +} + +template +void Predictor::train(const vector *> &points, const vector* > &queries, uintmax_t &_id, size_t num_sample) +{ + if (is_trained) { return; } + + num_sample = min(num_sample, points.size()); + + vector*> f_points_tr, f_points_test; + size_t total_size = points.size();// + queries.size(); + for (int i = 0; i < num_sample; i++) { + int i1 = floor((double)i * total_size / (2 * num_sample)); + int i2 = floor((i + 1) * (double)total_size / (2 * num_sample)); + f_points_tr.push_back(points.at(i1)); + f_points_test.push_back(points.at(i2)); + } + // size_t q_sample = min(num_sample / 10, queries.size()); + // while (10 * f_points_tr.size() <= 11 * num_sample) { + // for (int i = 0; i < q_sample; i++) { + // int i1 = floor((double)i * queries.size() / (2 * q_sample)); + // int i2 = floor((i + 1) * (double)queries.size() / (2 * q_sample)); + // f_points_tr.push_back(queries.at(i1)); + // f_points_test.push_back(queries.at(i2)); + // } + // } + training.clear(); + testing.clear(); + if (mode & PRED_MODE_CLASS) { + + std::vector > pos_buf, neg_buf; + cout << "mutating sequences" << endl; + size_t counter = 0; + // struct timespec start, stop; + // clock_gettime(CLOCK_MONOTONIC, &start); + #pragma omp parallel for + for (size_t i = 0; i < f_points_tr.size(); i++) { + auto p = f_points_tr[i]; + mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); + mutate_seqs(p, 5, pos_buf, neg_buf, min_id, 100 * id, _id); + #pragma omp critical + cout << "Generated " << ++counter << " / " << f_points_tr.size() << endl; + } + // clock_gettime(CLOCK_MONOTONIC, &stop); + // printf("took %lu\n", stop.tv_sec - start.tv_sec); + + counter = 0; + size_t buf_size = std::min(pos_buf.size(), neg_buf.size()); + cout << "training +: " << pos_buf.size() << endl; + cout << 
"training -: " << neg_buf.size() << endl; + std::random_shuffle(pos_buf.begin(), pos_buf.end()); + std::random_shuffle(neg_buf.begin(), neg_buf.end()); + for (size_t i = 0; i < buf_size; i++) { + training.push_back(pos_buf[i].deep_clone()); + training.push_back(neg_buf[i].deep_clone()); + } + for (auto p : pos_buf) { + delete p.first; + delete p.second; + } + for (auto p : neg_buf) { + delete p.first; + delete p.second; + } + pos_buf.clear(); + neg_buf.clear(); + #pragma omp parallel for + for (size_t i = 0; i < f_points_test.size(); i++) { + auto p = f_points_test[i]; + mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); + mutate_seqs(p, 5, pos_buf, neg_buf, min_id, 100 * id, _id); +#pragma omp critical + cout << "Generated " << ++counter << " / " << f_points_test.size() << endl; + } + buf_size = std::min(pos_buf.size(), neg_buf.size()); + cout << "testing +: " << pos_buf.size() << endl; + cout << "testing -: " << neg_buf.size() << endl; + std::random_shuffle(pos_buf.begin(), pos_buf.end()); + std::random_shuffle(neg_buf.begin(), neg_buf.end()); + for (size_t i = 0; i < buf_size; i++) { + testing.push_back(pos_buf[i].deep_clone()); + testing.push_back(neg_buf[i].deep_clone()); + } + for (auto p : pos_buf) { + delete p.first; + delete p.second; + } + for (auto p : neg_buf) { + delete p.first; + delete p.second; + } + } else { + for (auto p : f_points_tr) { + mutate_seqs(p, 10, training, training, min_id, 100, _id); + } + for (auto p : f_points_test) { + mutate_seqs(p, 10, testing, testing, min_id, 100, _id); + } + } + + + train(); +} +template +std::pair regression_train(const vector > &data, Feature& feat) +{ + auto pr = generate_feat_mat(data, feat, -1); + matrix::GLM glm; + glm.train(pr.first, pr.second); + auto result1 = pr.first * glm.get_weights(); + auto diff1 = result1 - pr.second; + double sum = 0; + for (int i = 0; i < diff1.getNumRow(); i++) { + sum += fabs(diff1.get(i, 0)); + } + sum /= diff1.getNumRow(); + return {sum, glm}; +} + +template 
+std::pair class_train(vector > &data, Feature& feat, double cutoff) +{ + // vector > above, below; + + // for (auto d : data) { + // if (d.val > cutoff) { + // above.push_back(d); + // } else { + // below.push_back(d); + // } + // } + // size_t sz = std::min(above.size(), below.size()); + // data.clear(); + // for (size_t i = 0; i < sz; i++) { + // data.push_back(above[i]); + // data.push_back(below[i]); + // } + auto pr = generate_feat_mat(data, feat, cutoff); + matrix::GLM glm; + glm.train(pr.first, pr.second); + matrix::Matrix p = glm.predict(pr.first); + for (int row = 0; row < p.getNumRow(); row++) { + if (p.get(row, 0) == 0) { + p.set(row, 0, -1); + } + } + double acc = get<0>(glm.accuracy(pr.second, p)); + return {acc, glm}; +} + +template +double regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm, std::string prefix="") +{ + auto pr = generate_feat_mat(data, feat, -1); + auto result1 = pr.first * glm.get_weights(); + auto diff1 = result1 - pr.second; + double sum = 0; + for (int i = 0; i < diff1.getNumRow(); i++) { + sum += fabs(diff1.get(i, 0)); + } + if (prefix != "") { + for (int row = 0; row < result1.getNumRow(); row++) { + cout << prefix << ";" << data[row].first->get_header() << ";" << data[row].second->get_header() << ";" << result1.get(row, 0) << ";" << pr.second.get(row, 0) << ";" << diff1.get(row, 0) << endl; + } + } + sum /= diff1.getNumRow(); + return sum; +} + +template +void print_wrong(matrix::Matrix oLabels, matrix::Matrix pLabels) +{ + for(int i = 0; i < oLabels.getNumRow(); i++){ + if(oLabels.get(i,0) == pLabels.get(i, 0)){ + cout << ""; + } + } +} + +template +tuple class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff, std::string prefix="") +{ + auto pr = generate_feat_mat(data, feat, cutoff); + matrix::Matrix p = glm.predict(pr.first); + for (int row = 0; row < p.getNumRow(); row++) { + if (p.get(row, 0) == 0) { + p.set(row, 0, -1); + } + if (prefix != "") { + cout << prefix << 
";" << data[row].first->get_header() << ";" << data[row].second->get_header() << ";" << data[row].val << ";" << p.get(row, 0) << ";" << pr.second.get(row, 0) << endl; + } + } +// print_wrong(pr.second, p); + return glm.accuracy(pr.second, p); +} + +template +void Predictor::filter(std::vector > &vec, std::string prefix) +{ + std::vector > > bins; + std::vector limits; + size_t num_bins = 10; + size_t smallest_bin_size = vec.size(); + for (size_t i = 0; i < num_bins; i++) { + limits.push_back(id + i * (1 - id) / num_bins); + bins.push_back(std::vector >()); + } + limits.push_back(1); + for (auto p : vec) { + for (size_t i = 1; i < limits.size(); i++) { + if (p.val <= limits[i] && p.val > limits[i-1]) { + bins[i-1].push_back(p); + if (prefix != "") { + cout << prefix << " bin " << i - 1 << " " << p.val << endl; + } + break; + } + } + } + size_t bin_size = 0; + for (auto &v : bins) { + bin_size += v.size(); + // smallest_bin_size = std::min(smallest_bin_size, v.size()); + std::random_shuffle(v.begin(), v.end()); + } + smallest_bin_size = bin_size / bins.size(); + vec.clear(); + + for (auto &v : bins) { + for (size_t i = 0; i < std::min(v.size(), smallest_bin_size); i++) { + vec.push_back(v[i]); + } + } + cout << "new vector size: " << vec.size() << " divided into " << bins.size() << " equal parts" << endl; +} + +double rand_between(double mute, double rng, double low, double high) +{ + Random r; + double r_d = r.random(); + + double mn = std::max(mute - rng, low); + double mx = std::min(mute + rng, high); + return r_d * (mx - mn) + mn; +} + +template +void Predictor::mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id) +{ + HandleSeq h(mut_type); + ClusterFactory factory(k); + double inc = (id_end - id_begin) / num_seq; + std::string bin_seq = p->get_data_str(); + std::string seq; + for (auto c : bin_seq) { + switch (c) { + case 0: + seq += 'A'; + break; + case 1: + seq += 'C'; + break; + case 2: + 
seq += 'G'; + break; + case 3: + seq += 'T'; + break; + case 'N': + seq += 'C'; + break; + default: + cout << "Invalid character " << c << endl; + cout << "from sequence " << bin_seq << endl; + throw 3; + } + } + for (size_t i = 0; i < num_seq; i++) { + double iter_id = id_begin + inc * (i + 0.5); + double actual_id = rand_between(iter_id, inc, id_begin, id_end); + int mut = round(100 - actual_id); + auto newseq = h.mutate(seq, mut); + std::string chrom; + std::string header = p->get_header(); + Point* new_pt = Loader::get_point(header, newseq.second, _id, k); + pra pr; + pr.first = p->clone(); + pr.second = new_pt; + pr.val = newseq.first; +#pragma omp critical + { + if (pr.val > id) { + pos_buf.push_back(pr); + } else { + neg_buf.push_back(pr); + } + } + } +} +template +void Predictor::train() +{ + Feature feat(k); + feat.set_save(true); + + uint64_t max_feat = 0; + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (possible_feats.at(i).first > max_feat) { + max_feat |= possible_feats.at(i).first; + } + } + for (uint64_t i = 1; i <= max_feat; i *= 2) { + if (i & max_feat) { + feat.add_feature(i, Combo::xy); + } + } + feat.normalize(training); + feat.normalize(testing); + feat.finalize(); + + + + // cout << "Class Training:" << endl; + // for (auto p : training) { + // cout << p.val << " "; + // } + // cout << "Class Testing:" << endl; + // for (auto p : testing) { + // cout << p.val << " "; + // } + if (mode & PRED_MODE_CLASS) { + train_class(&feat); + if (mode & PRED_MODE_REGR) { + // vector*> f_points_tr, f_points_test; + // for (int i = 0; i < 10; i++) { + // f_points_tr.push_back(training[rand()%training.size()].first); + // f_points_test.push_back(training[rand()%training.size()].first); + // } + // training.clear(); + // testing.clear(); + // for (auto p : f_points_tr) { + // mutate_seqs(p, 50, training, 100 * id, 100); + // mutate_seqs(p, 50, training, 60, 100 * id); + // } + // for (auto p : f_points_test) { + // mutate_seqs(p, 50, testing, 100 
* id, 100); + // mutate_seqs(p, 50, testing, 60, 100 * id); + // } + // filter(); + auto func = [&](pra pr) { + return pr.val <= id; + }; + training.erase(std::remove_if(training.begin(), training.end(), func), training.end()); + testing.erase(std::remove_if(testing.begin(), testing.end(), func), testing.end()); + filter(training);//, "training"); + filter(testing);//, "testing"); + + } + } + if (mode & PRED_MODE_REGR) { + train_regr(&feat); + } + cout << "Training size: " << training.size() << endl; + cout << "Testing size: " << testing.size() << endl; + // for (auto p : training) { + // cout << p.val << " "; + // } + cout << endl; + feat.set_save(false); + training.clear(); + testing.clear(); + possible_feats.clear(); + is_trained = true; +} + +template +void Predictor::train_class(Feature* feat) +{ + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + vector used_list; + double abs_best_acc = 0; +// cout << "possible feats at one step: " << possible_feats.size() << endl; + Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); + + std::ostringstream oss; + for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { + double best_class_acc = abs_best_acc; + uintmax_t best_idx = -1, cur_idx = 1; + auto best_class_feat = possible_feats.front(); + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { + continue; + } + auto rfeat = possible_feats[i]; + feat->add_feature(rfeat.first, rfeat.second); + feat->normalize(training); + feat->finalize(); + auto name = feat->feat_names().back(); + auto pr = class_train(training, *feat, id); + auto class_ac = class_test(testing, *feat, pr.second, id); + feat->remove_feature(); + prog++; +// cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " acc: " << get<0>(class_ac) << " sens: 
" << get<1>(class_ac) << " spec: " << get<2>(class_ac) << endl; + if (get<0>(class_ac) > best_class_acc) { + best_class_acc = get<0>(class_ac); + best_class_feat = rfeat; + best_idx = i; + } + } + if (best_class_acc > abs_best_acc || num_feat <= min_num_feat) { + feat->add_feature(best_class_feat.first, best_class_feat.second); + feat->normalize(training); + feat->finalize(); + abs_best_acc = best_class_acc; + used_list.push_back(best_idx); + oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl; + oss << "Accuracy: " << best_class_acc << endl; + possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end()); + } + } + prog.end(); + cout << oss.str(); + feat_c = new Feature(*feat); + feat_c->set_save(false); + auto pr = class_train(training, *feat_c, id); + cout << "Training ACC: " << pr.first << endl; + c_glm = pr.second; + auto train_results = class_test(training, *feat_c, c_glm, id);//, "train"); + cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl; + auto test_results = class_test(testing, *feat_c, c_glm, id);//, "test"); + double class_acc = get<0>(test_results); + cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl; + + cout << "Features: "<< endl; + for (auto line : feat_c->feat_names()) { + cout << "\t" << line << endl; + } +} +template +void Predictor::train_regr(Feature* feat) +{ + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + vector used_list; + double abs_best_regr = 1000000; + for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { + double best_regr_err = abs_best_regr; + uintmax_t best_idx = -1, cur_idx = 1; + auto best_regr_feat = possible_feats.front(); + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (std::find(used_list.begin(), 
used_list.end(), i) != used_list.end()) { + continue; + } + auto rfeat = possible_feats[i]; + feat->add_feature(rfeat.first, rfeat.second); + feat->normalize(training); + feat->finalize(); + auto pr = regression_train(training, *feat); + auto name = feat->feat_names().back(); + double regr_mse = regression_test(testing, *feat, pr.second); + feat->remove_feature(); + + cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " err: " << regr_mse << endl; + if (regr_mse < best_regr_err) { + best_regr_err = regr_mse; + best_regr_feat = rfeat; + best_idx = i; + } + } + if (best_regr_err < abs_best_regr) { + feat->add_feature(best_regr_feat.first, best_regr_feat.second); + feat->normalize(training); + feat->finalize(); + abs_best_regr = best_regr_err; + used_list.push_back(best_idx); + //possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_regr_feat), possible_feats.end()); + } + } + feat_r = new Feature(*feat); + feat_r->set_save(false); + auto pr = regression_train(training, *feat_r); + r_glm = pr.second; + double tr_regr_mse = regression_test(testing, *feat_r, r_glm); // "training" + cout << "Training Mean Error: " << pr.first << endl; + double regr_mse = regression_test(testing, *feat_r, r_glm);//, "testing"); + cout << "Testing Mean Error: " << regr_mse << endl; + cout << "Features: "<< endl; + for (auto line : feat_r->feat_names()) { + cout << "\t" << line << endl; + } + // auto w = r_glm.get_weights(); + // for (int r = 0; r < w.getNumRow(); r++) { + // for (int c = 0; c < w.getNumCol(); c++) { + // cout << w.get(r, c) << " "; + // } + // cout << endl; + // } + // for (auto combo : feat.get_combos()) { + // cout << combo.first << " " << + // } + +} + +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; diff --git 
a/src/cluster/src/Predictor.h b/src/cluster/src/Predictor.h new file mode 100644 index 0000000..bf35036 --- /dev/null +++ b/src/cluster/src/Predictor.h @@ -0,0 +1,78 @@ +/* -*- C++ -*- + * + * Predictor.h + * + * Author: Benjamin T James + * + * Main class for training and prediction + * Does bulk training, but can be adapted for on-line training + */ + +#ifndef PREDICTOR_H +#define PREDICTOR_H + +#include "GLM.h" +#include "Point.h" +#include "Feature.h" +#include +#include +#define PRED_MODE_CLASS 1 +#define PRED_MODE_REGR 2 + +#define PRED_FEAT_FAST (FEAT_EUCLIDEAN | FEAT_MANHATTAN | FEAT_INTERSECTION | FEAT_KULCZYNSKI2 | FEAT_SIMRATIO | FEAT_NORMALIZED_VECTORS | FEAT_PEARSON_COEFF | FEAT_EMD | FEAT_LENGTHD ) +#define PRED_FEAT_DIV (FEAT_JEFFEREY_DIV | FEAT_JENSEN_SHANNON) +#define PRED_FEAT_ALL (FEAT_HELLINGER|FEAT_MANHATTAN|FEAT_EUCLIDEAN|FEAT_CHI_SQUARED|FEAT_NORMALIZED_VECTORS|FEAT_HARMONIC_MEAN|FEAT_JEFFEREY_DIV|FEAT_K_DIV|FEAT_PEARSON_COEFF|FEAT_SQCHORD|FEAT_KL_COND|FEAT_MARKOV|FEAT_INTERSECTION|FEAT_RRE_K_R|FEAT_D2z|FEAT_SIM_MM|FEAT_EUCLIDEAN_Z|FEAT_EMD|FEAT_SPEARMAN|FEAT_JACCARD|FEAT_LENGTHD|FEAT_D2s|FEAT_AFD|FEAT_MISMATCH|FEAT_CANBERRA|FEAT_KULCZYNSKI1|FEAT_KULCZYNSKI2|FEAT_SIMRATIO|FEAT_JENSEN_SHANNON|FEAT_D2_star|FEAT_N2R|FEAT_N2RC|FEAT_N2RRC) + +template +class Predictor { +public: + Predictor(int k_, double id_, uint8_t mode_, uint64_t feats, int mut_type_, int min_num_feat_=3, int max_num_feat_=5, double min_id_=0.35) : k(k_), id(id_), is_trained(false), is_training(false), mode(mode_), max_num_feat(max_num_feat_), mut_type(mut_type_), min_num_feat(min_num_feat_), min_id(min_id_ * 100) { + add_feats(possible_feats, feats); + feat_c = NULL; + feat_r = NULL; + omp_init_lock(&lock); + }; + Predictor(const std::string filename); + ~Predictor() { + possible_feats.clear(); + omp_destroy_lock(&lock); + if (feat_c) { + delete feat_c; + } + if (feat_r) { + delete feat_r; + } + training.clear(); + testing.clear(); + } + void train(const std::vector* >& vec, 
const std::vector* >& vecq, uintmax_t& _id, size_t num_sample); + double similarity(Point* a, Point* b); + bool close(Point* a, Point* b); + void save(std::string file); + void check(); + uint8_t get_mode() const { return mode; } + pair*, matrix::GLM> get_class() { return std::make_pair(new Feature(*feat_c), c_glm); } +private: + static void add_feats(std::vector >& vec, uint64_t flags); + static pair*> read_from(std::ifstream &in, int k_); + static void write_to(std::ofstream &out, Feature* f, matrix::GLM glm); + void filter(std::vector > &s, std::string prefix=""); + void train(); + void train_class(Feature* feat); + void train_regr(Feature* feat); + void train_class_regr(Feature* feat); + double predict(Point* a, Point* b); + bool p_close(Point* a, Point* b); + double p_predict(Point* a, Point* b); + void mutate_seqs(Point* p, size_t num_seq, vector > &,vector > & , double id_begin, double id_end, uintmax_t& _id); + Feature *feat_c, *feat_r; + matrix::GLM c_glm, r_glm; + vector > training, testing; + bool is_trained, is_training; + int min_num_feat, max_num_feat, k, mut_type; + uint8_t mode; + double id, min_id; + vector > possible_feats; + omp_lock_t lock; +}; +#endif diff --git a/src/cluster/src/Progress.cpp b/src/cluster/src/Progress.cpp new file mode 100644 index 0000000..e16ef06 --- /dev/null +++ b/src/cluster/src/Progress.cpp @@ -0,0 +1,65 @@ +#include "Progress.h" +#include +#include + +Progress::Progress(long num, std::string prefix_) +{ + pmax = num; + ended = 0; + pcur = 0; + prefix = prefix_; + last = ""; + barWidth = 70 - (prefix.size()+1); + print(); +} + +void Progress::print() +{ + std::ostringstream oss; + double prog = (double)pcur / pmax; + oss << prefix << " ["; + int pos = barWidth * prog; + for (int i = 0; i < barWidth; i++) { + if (i < pos) { + oss << "="; + } else if (i == pos) { + oss << ">"; + } else { + oss << " "; + } + } + oss << "] " << int(prog * 100.0) << " %\r"; + if (oss.str() != last) { + last = oss.str(); + std::cout << last; + 
std::cout.flush(); + } +} + +void Progress::end() +{ + if (!ended) { + pcur = pmax; + print(); + std::cout << std::endl; + } + ended = true; +} + +void Progress::operator++() +{ + pcur++; + print(); +} +void Progress::operator++(int) +{ + print(); + pcur++; +} + + +void Progress::operator+=(size_t num) +{ + pcur += num; + print(); +} diff --git a/src/cluster/src/Progress.h b/src/cluster/src/Progress.h new file mode 100644 index 0000000..f59d948 --- /dev/null +++ b/src/cluster/src/Progress.h @@ -0,0 +1,29 @@ +/* -*- C++ -*- + * + * Progress.h + * + * Author: Benjamin T James + */ +#include +#ifndef PROGRESS_H +#define PROGRESS_H + +class Progress { +public: + Progress(long num, std::string prefix_); + ~Progress() { end(); } + void end(); + void operator++(); + void operator++(int); + void operator+=(size_t); +private: + void print(); + long pmax; + long pcur; + bool ended; + std::string prefix; + int barWidth; + + std::string last; +}; +#endif diff --git a/src/cluster/src/Random.h b/src/cluster/src/Random.h new file mode 100644 index 0000000..3131b34 --- /dev/null +++ b/src/cluster/src/Random.h @@ -0,0 +1,22 @@ +#ifndef RANDOM_H // -*- C++ -*- +#define RANDOM_H +#include + +class Random { + std::mt19937 rng; +public: + Random() : rng(std::random_device()()) {} + + template + T randMod(T max) { + std::uniform_int_distribution distribution(0, max-1); + return distribution(rng); + } + + double random() { + std::uniform_real_distribution distribution(0.0, 1.0); + return distribution(rng); + } +}; + +#endif diff --git a/src/cluster/src/Runner.cpp b/src/cluster/src/Runner.cpp new file mode 100644 index 0000000..b53449d --- /dev/null +++ b/src/cluster/src/Runner.cpp @@ -0,0 +1,397 @@ +/* -*- C++ -*- + * + * Runner.cpp + * + * Author: Benjamin T James + */ +#include +#include +#include +#include +#include +#include "../../nonltr/ChromListMaker.h" +#include "../../utility/AffineId.h" +#include "Runner.h" +#include "Trainer.h" +#include "ClusterFactory.h" +#include "bvec.h" 
+#include "Progress.h" +#ifdef _OPENMP +#include +#endif +Runner::Runner(int argc, char **argv) +{ + get_opts(argc, argv); + if (k == -1) { + auto pr = find_k(); + k = pr.first; + } + // if (similarity < 0.6) { + // align = true; + // } + if (sample_size == 0) { + sample_size = 300; + } + srand(10); +} + +int Runner::run() +{ + largest_count = 0; + Progress progress(files.size(), "Reading in sequences"); + for (auto i = 0; i < files.size(); i++) { + auto f = files.at(i); + ChromListMaker maker(f); + auto chromList = maker.makeChromOneDigitList(); + + progress++; +// cout << "Reading in sequences from " << f << "..." << endl; + uint64_t local_largest_count = 0; +#pragma omp parallel for reduction(max:local_largest_count) + for (int i = 0; i < chromList->size(); i++) { + std::vector values; + KmerHashTable table(k, 1); + ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(i)); + fill_table(table, chrom, values); + uint64_t l_count = *std::max_element(values.begin(), values.end()); + if (l_count > local_largest_count) { + local_largest_count = l_count; + } + } + if (local_largest_count > largest_count) { + largest_count = local_largest_count; + } + } + progress.end(); + + + if (largest_count <= std::numeric_limits::max()) { + cout << "Using 8 bit histograms" << endl; + return do_run(); + } else if (largest_count <= std::numeric_limits::max()) { + cout << "Using 16 bit histograms" << endl; + return do_run(); + } else if (largest_count <= std::numeric_limits::max()){ + cout << "Using 32 bit histograms" << endl; + return do_run(); + } else if (largest_count <= std::numeric_limits::max()) { + cout << "Using 64 bit histograms" << endl; + return do_run(); + } else { + throw "Too big sequence"; + } +} + +void usage(std::string progname) +{ + std::cout << "Usage: " << progname << " --id 0.x [OPTIONS] *.fasta" << std::endl << std::endl; + #ifndef VERSION + #define VERSION "(undefined)" + #endif + std::cout << "Version " << VERSION << " compiled on " << __DATE__ << " " << 
__TIME__; + #ifdef _OPENMP + std::cout << " with OpenMP " << _OPENMP; + #else + std::cout << " without OpenMP"; + #endif + std::cout << std::endl; + std::cout << "See README for detailed options" << std::endl << std::endl; +} + + +void Runner::get_opts(int argc, char **argv) +{ + for (int i = 1; i < argc; i++) { + string arg = argv[i]; + if (arg == "--id" && i + 1 < argc) { + try { + std::string opt = argv[i+1]; + similarity = std::stod(opt); + if (similarity <= 0 || similarity >= 1) { + throw std::invalid_argument(""); + } + } catch(std::exception e) { + cerr << "Similarity must be between 0 and 1" << endl; + exit(EXIT_FAILURE); + } + i++; + } else if (arg == "--min-id" && i + 1 < argc) { + try { + std::string opt = argv[i+1]; + min_id = std::stod(opt); + if (min_id <= 0 || min_id >= 1) { + throw std::invalid_argument(""); + } + } catch(std::exception e) { + cerr << "Similarity must be between 0 and 1" << endl; + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "-k" || arg == "--kmer") && i + 1 < argc) { + k = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (k <= 0) { + fprintf(stderr, "K must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "-o" || arg == "--output") && i + 1 < argc) { + output = string(argv[i+1]); + i++; + } else if ((arg == "-s" || arg == "--sample") && i + 1 < argc) { + sample_size = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (sample_size <= 0) { + fprintf(stderr, "Sample size must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; + // } else if ((arg == "-p" || arg == "--pivot") && i + 1 < argc) { + // pivots = strtol(argv[i+1], NULL, 10); + // if (errno) { + // perror(argv[i+1]); + // exit(EXIT_FAILURE); + // } else if (sample_size <= 0) { + // fprintf(stderr, "Points per pivot must be greater than 0.\n"); + // exit(EXIT_FAILURE); + // } + // i++; + } else if ((arg == "--mut-type") && i + 1 < argc) 
{
	// Select which mutation kinds the synthetic training pairs may use.
	std::string opt = argv[i+1];
	if (opt == "all") {
		mut_type = HandleSeq::BOTH | HandleSeq::ATYPICAL;
	} else if (opt == "both") {
		mut_type = HandleSeq::BOTH;
	} else if (opt == "snp" || opt == "single") {
		mut_type = HandleSeq::SINGLE;
	} else if (opt == "nonsingle-typical") {
		mut_type = HandleSeq::NON_SINGLE;
	} else if (opt == "nonsingle-all") {
		mut_type = HandleSeq::NON_SINGLE | HandleSeq::ATYPICAL;
	} else if (opt == "all-but-reversion") {
		mut_type = HandleSeq::BOTH | HandleSeq::TRANSLOCATION;
	} else if (opt == "all-but-translocation") {
		mut_type = HandleSeq::BOTH | HandleSeq::REVERSION;
	} else {
		cerr << "Options for mutation type are \"single\", \"nonsingle-typical\", \"both\" (for single and nonsingle-typical), \"nonsingle-all\", and \"all\" (single, nonsingle, and atypical nonsingle)." << endl;
		exit(1);
	}
	i++;
} else if ((arg == "--feat" || arg == "-f") && i + 1 < argc) {
	// Select which distance/similarity feature set the predictor draws from.
	std::string opt = argv[i+1];
	if (opt == "fast") {
		feat_type = PRED_FEAT_FAST;
	} else if (opt == "slow") {
		feat_type = PRED_FEAT_FAST | PRED_FEAT_DIV;
	} else if (opt == "extraslow") {
		feat_type = PRED_FEAT_ALL;
	} else {
		cerr << "Options for feature sets are \"fast\", \"slow\", and \"extraslow\"." << endl;
		exit(1);
	}
	i++;
} else if ((arg == "--min" || arg == "--min-feat") && i + 1 < argc) {
	// Minimum number of features the trainer must keep (must be > 0).
	try {
		std::string opt = argv[i+1];
		int xx = std::stoi(opt);
		if (xx <= 0) {
			throw std::invalid_argument("");
		}
		min_n_feat = xx;
	// fix: catch by const reference instead of by value -- catching
	// std::exception by value slices derived exception types and copies
	// needlessly (C++ Core Guidelines E.15).
	} catch (const std::exception &) {
		cerr << "Minimum number of features must be greater than 0." << endl;
		exit(1);
	}

	i++;
} else if ((arg == "--max" || arg == "--max-feat") && i + 1 < argc) {
	// Maximum number of features the trainer may select (must be > 0).
	try {
		std::string opt = argv[i+1];
		int xx = std::stoi(opt);
		if (xx <= 0) {
			throw std::invalid_argument("");
		}
		max_n_feat = xx;
	// fix: catch by const reference (see note on the --min branch above).
	} catch (const std::exception &) {
		cerr << "Maximum number of features must be greater than 0."
<< endl; + exit(1); + } + + i++; + } else if ((arg == "-t" || arg == "--threads") && i + 1 < argc) { + try { + std::string opt = argv[i+1]; + int threads = std::stoi(opt); + if (threads <= 0) { + throw std::invalid_argument(""); + } + #ifdef _OPENMP + omp_set_num_threads(threads); + #endif + } catch (std::exception e) { + cerr << "Number of threads must be greater than 0." << endl; + exit(1); + } + + i++; + + } else if ((arg == "-d" || arg == "--delta") && i + 1 < argc) { + delta = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (delta <= 0) { + fprintf(stderr, "Delta must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "-i" || arg == "--iter" || arg == "--iterations") && i + 1 < argc) { + iterations = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (iterations <= 0) { + fprintf(stderr, "Iterations must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; + } else { + struct stat st; + stat(argv[i], &st); + if (S_ISREG(st.st_mode)) { + files.push_back(argv[i]); + } else { + usage(*argv); + exit(EXIT_FAILURE); + } + } + } + if (files.empty()) { + usage(*argv); + exit(EXIT_FAILURE); + } + if (min_n_feat > max_n_feat) { + cerr << "Minimum number of features (" << min_n_feat << ") cannot be greater than maximum number of features (" << max_n_feat << ")" << endl; + exit(1); + } +} + +pair Runner::find_k() +{ + unsigned long long count = 0, length = 0, largest_count = 0; + uint64_t longest_seq = 0; + uintmax_t num_sequences = 0; + for (auto f : files) { + ChromListMaker maker(f); + auto chromList = maker.makeChromOneDigitList(); + unsigned long long l = 0; + for (int i = 0; i < chromList->size(); i++) { + ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(i)); + auto sz = chrom->size(); + l += sz; + if (sz > longest_seq) { + longest_seq = sz; + } + num_sequences++; + + } + l /= chromList->size(); + length += l; + } + length /= 
files.size(); + int newk = ceil(log(length) / log(4)) - 1; + cout << "avg length: " << length << endl; + cout << "Recommended K: " << newk << endl; + return make_pair(newk, longest_seq); +} + + +double global_mat[4][4] = {{1, -1, -1, -1}, + {-1, 1, -1, -1}, + {-1, -1, 1, -1}, + {-1, -1, -1, 1}}; +double global_sigma = -2; +double global_epsilon = -1; + +template +int Runner::do_run() +{ + using pvec = vector *>; + using pmap = map*, pvec*>; + + ClusterFactory factory(k); + auto points = factory.build_points(files, [&](nonltr::ChromosomeOneDigit *p){ return factory.get_divergence_point(p); }); + Trainer tr(points, sample_size, largest_count, similarity, pivots, global_mat, global_sigma, global_epsilon, align ? 0 : k); + tr.train(min_n_feat, max_n_feat, feat_type, mut_type, min_id); + vector lengths; + for (Point* p : points) { + if (!align) { + p->set_data_str(""); + } + lengths.push_back(p->get_length()); + } + // Initializing BVec + bvec bv(lengths, 1000); + lengths.clear(); + // Inserting points into BVec + uint64_t idx = 0; + for (Point* p : points) { + p->set_id(idx++); + bv.insert(p); + } + bv.insert_finalize(); +// cout << "bv size: " << bv.report() << endl; + // Point* mid = points[points.size()/2]; + // auto rng = bv.get_range(mid->get_length() * 0.99, + // mid->get_length() / 0.99); + // auto begin = bv.iter(rng.first); + // auto end = bv.iter(rng.second); + // size_t before = bv.report(); + // for (int i = 0; i < 1; i++) { + // bool is_min = false; + // Point* p = tr.get_close(mid, begin, end, is_min); + // size_t after = bv.report(); + // if (is_min) { + // string expr = (after + 1 == before) ? 
"true" : "false"; + // if (expr == "false") { + // throw expr; + // } + // cout << expr << endl; + // cout << "is min" << endl; + // } else { + // cout << "is not min" << endl; + // } + // } + factory.MS(bv, bandwidth, similarity, tr, output, iterations, delta); + return 0; +} + + +template +void Runner::print_output(const map*, vector*>*> &partition) const +{ + cout << "Printing output" << endl; + std::ofstream ofs; + ofs.open(output, std::ofstream::out); + int counter = 0; + for (auto const& kv : partition) { + if (kv.second->size() == 0) { + continue; + } + ofs << ">Cluster " << counter << endl; + int pt = 0; + for (auto p : *kv.second) { + string s = p->get_header(); + ofs << pt << "\t" << p->get_length() << "nt, " << s << "... " << endl; +// string fa = am.get(p->get_id()); +// ofs << writefa(fa) << endl; + pt++; + } + counter++; + } + ofs.close(); +} diff --git a/src/cluster/src/Runner.h b/src/cluster/src/Runner.h new file mode 100644 index 0000000..6e04ebf --- /dev/null +++ b/src/cluster/src/Runner.h @@ -0,0 +1,44 @@ +/* -*- C++ -*- + * + * Runner.h + * + * Author: Benjamin T James + */ +#ifndef RUNNER_H +#define RUNNER_H + +#include +#include +#include "Point.h" +#include "HandleSeq.h" +#include "Predictor.h" +using namespace std; + +class Runner { +public: + Runner(int argc, char** argv); + ~Runner() {}; + int run(); +private: + template int do_run(); + template void print_output(const map*, vector*>*> &m) const; + int k = -1; + int bandwidth; + double similarity = 0.90; + long largest_count = 0; + int iterations = 15; + int delta = 5; + bool align = false; + int sample_size = 0; + int pivots = 40; + int min_n_feat = 3; + int max_n_feat = 5; + int mut_type = HandleSeq::SINGLE; + uint64_t feat_type = PRED_FEAT_FAST; + double min_id = 0.35; + std::vector files; + string output = "output.clstr"; + void get_opts(int argc, char** argv); + pair find_k(); +}; +#endif diff --git a/src/cluster/src/SingMute.cpp b/src/cluster/src/SingMute.cpp new file mode 100644 
index 0000000..45f1610 --- /dev/null +++ b/src/cluster/src/SingMute.cpp @@ -0,0 +1,116 @@ +#include "SingMute.h" +#include +#include "Random.h" + + + + +void generate_unique_set(size_t cmd_size, std::set& ret, int num_elts, const std::set& bad_set_1, const std::set& bad_set_2, const std::vector &valid, Random& rng) +{ + while (ret.size() <= num_elts) { + long idx = rng.randMod(cmd_size); + if (valid[idx] && + ret.find(idx) == ret.end() && + bad_set_1.find(idx) == bad_set_1.end() && + bad_set_2.find(idx) == bad_set_2.end()) { + + ret.insert(idx); + } + } +} +char SingMute::randNucl() +{ + char character; + int value = rng.randMod(percAs + percCs + percGs + percTs); + if (value < percAs) { + character = 'A'; + } else if (value < percAs + percCs) { + character = 'C'; + } else if (value < percAs + percCs + percGs) { + character = 'G'; + } else { + character = 'T'; + } + return character; +} +void SingMute::init(const std::vector &valid) +{ + maxInsert = 0; + maxDel = 0; + maxSwitch = 0; + if (num_mut == 1) { + maxInsert = 1; + maxDel = 0; + maxSwitch = 0; + } else if (num_mut == 0) { + out_seq = *seq; + return; + } else { + maxSwitch = rng.randMod(num_mut); + num_mut -= maxSwitch; + + if (maxSwitch % 2 == 1 && num_mut >= 1) { + maxSwitch++; + num_mut--; + } else if (num_mut == 0) { + maxSwitch--; + num_mut++; + } + if (num_mut > 1) { + maxInsert = rng.randMod(num_mut); + num_mut -= maxInsert; + } else { + maxInsert = num_mut; + num_mut -= maxInsert; + } + maxDel = num_mut; + } + size_t seq_len = seq->length(); + + maxDel *= seq_len / 100.0; + maxInsert *= seq_len / 100.0; + maxSwitch *= seq_len / 100.0; + alignmentLength = maxInsert; + IBP = maxDel + maxSwitch; + + + std::vector command_str(seq_len, 'S'); + + std::set s_ins, s_del, s_switch; + generate_unique_set(command_str.size(), s_ins, maxInsert, s_del, s_switch, valid, rng); + generate_unique_set(command_str.size(), s_del, maxDel, s_ins, s_switch, valid, rng); + generate_unique_set(command_str.size(), s_switch, 
maxSwitch, s_ins, s_del, valid, rng); + for (auto idx : s_ins) { + command_str[idx] = 'I'; + } + for (auto idx : s_del) { + command_str[idx] = 'D'; + } + for (auto idx : s_switch) { + command_str[idx] = 'W'; + } + out_seq = ""; + out_seq.reserve(maxInsert + seq_len - maxDel + 1); + + for (long i = 0; i < seq_len; i++) { + auto cmd = command_str.at(i); + switch (cmd) { + case 'I': { + out_seq += randNucl(); + out_seq += seq->at(i); + break; + } + case 'S': { + out_seq += seq->at(i); + break; + } + case 'D': { + break; + } + case 'W': { + out_seq += randNucl(); + break; + } + } + } +} diff --git a/src/cluster/src/SingMute.h b/src/cluster/src/SingMute.h new file mode 100644 index 0000000..c659afd --- /dev/null +++ b/src/cluster/src/SingMute.h @@ -0,0 +1,48 @@ + +#ifndef SINGMUTE_H +#define SINGMUTE_H + +#include +#include +#include "Random.h" + +class SingMute { +public: + /* + Constructor, creates values + and assignes allocations based on inputted data + + @param: + int: percentage of A's + int: percentage of C's + int: percentage of G's + int: percentage of T's + int: The total allocation for non-single mutations + */ + SingMute(int pa, int pc, int pg, int pt, uintmax_t tt, const std::string* s, const std::vector &valid_) : percAs(pa), + percCs(pc), percGs(pg), percTs(pt), num_mut(tt), seq(s) { + init(valid_); + } + long getAlignmentLength() { return alignmentLength; } + long getIBP() { return IBP; } + void init(const std::vector &valid); + std::string& getSeq() { return out_seq; }; + private: + uintmax_t num_mut; + int percAs; + int percCs; + int percGs; + int percTs; + + long maxDel; + long maxInsert; + long maxSwitch; + + long alignmentLength; + long IBP; + const std::string * seq; + std::string out_seq; + char randNucl(); + Random rng; +}; +#endif diff --git a/src/cluster/src/SingleFeature.cpp b/src/cluster/src/SingleFeature.cpp new file mode 100644 index 0000000..bdc441c --- /dev/null +++ b/src/cluster/src/SingleFeature.cpp @@ -0,0 +1,50 @@ +#include 
"SingleFeature.h" + +template +void SingleFeature::normalize(const vector*,Point*> > &pairs) +{ + for (auto p : pairs) { + double d; + if (rc.empty()) { + d = raw(p.first, p.second); + } else { + d = rraw(p.first, p.second, rc, rv); + } + if (!min_set || d < min) { + min = d; + min_set = true; + } + if (!max_set || d > max) { + max = d; + max_set = true; + } + } +} + +template +double SingleFeature::operator()(Point *a, Point *b) const +{ + double d; + if (rc.empty()) { + d = raw(a, b); + } else { + d = rraw(a, b, rc, rv); + } +// std::cout << "Raw: " << d << std::endl; + double f = (d - min) / (max - min); +// std::cout << "Normalized: " << f << std::endl; + f = std::min(1.0, std::max(0.0, f)); + if (is_sim) { + return f; + } else { + return 1.0 - f; + } +} + + +template class SingleFeature; +template class SingleFeature; +template class SingleFeature; +template class SingleFeature; +template class SingleFeature; +template class SingleFeature; diff --git a/src/cluster/src/SingleFeature.h b/src/cluster/src/SingleFeature.h new file mode 100644 index 0000000..efa882c --- /dev/null +++ b/src/cluster/src/SingleFeature.h @@ -0,0 +1,26 @@ +#ifndef SINGLEFEATURE_H +#define SINGLEFEATURE_H + +#include "Point.h" +#include + +template +class SingleFeature { +public: + SingleFeature(std::function*, Point*)> f, bool is_sim_=true) + : raw(f), is_sim(is_sim_), min_set(false), max_set(false) {} + SingleFeature(std::function*, Point*, const vector&, const vector&)> f, vector rrv, vector rrc, bool is_sim_=true) + : rraw(f), is_sim(is_sim_), min_set(false), max_set(false), rv(rrv), rc(rrc) {} + void normalize(const vector*,Point*> > &pairs); + double operator()(Point*, Point*) const; + double min, max; +private: + std::function*, Point*)> raw; + std::function*, Point*, const vector&, const vector&)> rraw; + vector rv, rc; + const bool is_sim; + bool max_set, min_set; + +}; + +#endif diff --git a/src/cluster/src/SingleFileLoader.cpp b/src/cluster/src/SingleFileLoader.cpp new file 
mode 100644 index 0000000..e62715f --- /dev/null +++ b/src/cluster/src/SingleFileLoader.cpp @@ -0,0 +1,84 @@ +/* -*- C++ -*- + * + * SingleFileLoader.cpp + * + * Author: Benjamin T James + * + * Reads sequences one by one from a file + */ +#include "SingleFileLoader.h" +#include +#include + +std::istream& safe_getline(std::istream& is, std::string& t) +{ + t.clear(); + std::istream::sentry se(is, true); + std::streambuf* sb = is.rdbuf(); + for(;;) { + int c = sb->sbumpc(); + switch (c) { + case '\n': + return is; + case '\r': + if (sb->sgetc() == '\n') { + sb->sbumpc(); + } + return is; + case std::streambuf::traits_type::eof(): + if (t.empty()) { + is.setstate(std::ios::eofbit); + } + return is; + default: + t += (char)c; + } + } +} + + +SingleFileLoader::SingleFileLoader(std::string filename) +{ + in = new std::ifstream(filename); + is_first = true; +} +std::pair SingleFileLoader::next() +{ + std::pair ret = std::make_pair("", (std::string*)NULL); + if (!in->good()) { + return ret; + } + clock_t begin = clock(); + ret.second = new std::string(""); + if (is_first) { + safe_getline(*in, buffer); + is_first = false; + } + do { + if (buffer[0] == '>') { + if (ret.first != "") { + return ret; + } + ret.first = buffer; + } else if (buffer[0] == ' ' || buffer[0] == '\t') { + bool all_spaces = true; + for (auto c : buffer) { + if (c != ' ' && c != '\t') { + all_spaces = false; + } + } + if (!all_spaces) { + std::ostringstream oss; + oss << ret.first << buffer; + std::string new_header = oss.str(); + ret.first = new_header; + } + } else { + ret.second->append(buffer); + } + safe_getline(*in, buffer); + } while (in->good()); + double diff = clock() - begin; +// std::cout << "next(): " << diff / CLOCKS_PER_SEC << std::endl; + return ret; +} diff --git a/src/cluster/src/SingleFileLoader.h b/src/cluster/src/SingleFileLoader.h new file mode 100644 index 0000000..d6b3c5d --- /dev/null +++ b/src/cluster/src/SingleFileLoader.h @@ -0,0 +1,29 @@ +/* -*- C++ -*- + * + * 
SingleFileLoader.h + * + * Author: Benjamin T James + * + * A way of reading in 1 sequence at a time + * from FASTA, sequence is heap allocated + */ +#ifndef SINGLEFILELOADER_H +#define SINGLEFILELOADER_H + +#include + +class SingleFileLoader { +public: + SingleFileLoader(std::string file); + ~SingleFileLoader() { + if (in != NULL) { + delete in; + } + } + std::pair next(); +private: + std::ifstream *in; + std::string buffer; + bool is_first; +}; +#endif diff --git a/src/cluster/src/SingleMute.cpp b/src/cluster/src/SingleMute.cpp new file mode 100644 index 0000000..1f435f7 --- /dev/null +++ b/src/cluster/src/SingleMute.cpp @@ -0,0 +1,221 @@ +/** + * Author: Alex Baumgartner + * The Bioinformatics Toolsmith Laboratory, the University of Tulsa + * 5/15/2018 + * + * Purpose: + * The pupose of this module is to perform single mutations on sequences + */ + +#include "SingleMute.h" +#include + +int intRandMod_(int max) { + static thread_local std::mt19937 generator; + std::uniform_int_distribution distribution(0, max-1); + return distribution(generator); +} + +SingleMute::SingleMute(int a, int c, int g, int t, int alloc) { + percAs = a; + percCs = c; + percGs = g; + percTs = t; + //If allocation is 0, all sub allocations are 0 + if (alloc == 0) { + maxDel = 0; + maxInsert = 0; + maxSwitch = 0; + } + //Arbitrary, if only 1 percent is allocated, then only insert gets an allocation + else if (alloc == 1) { + maxSwitch = 0; + maxDel = 0; + maxInsert = 1; + } + //Otherwise, allocations are assigned randomly + else { + //Max switch gets a random allocation, + //but allocation has to be even + //(don't want to switch something with itself) + maxSwitch = intRandMod_(alloc);//rand() % alloc; + alloc -= maxSwitch; + //If alloc is odd, + //and there is still percent that can be allocated + if (maxSwitch % 2 == 1 && alloc >= 1) { + //Make allocation 1 less, + //and switch allocation one more (now even) + maxSwitch++; + alloc--; + } + //Otherwise, make allocation one larger, + 
//switch allocation one less (even) + else if (alloc == 0) { + maxSwitch--; + alloc++; + } + //If alloc is greater than 1 (must be for % purposes), + //calculate random value for inerst allocation + if (alloc > 1) { + maxInsert = intRandMod_(alloc);//rand() % alloc; + alloc -= maxInsert; + } else { + maxInsert = alloc; + alloc -= maxInsert; + } + //Max delete is assigned whatever is left + maxDel = alloc; + } +} + +int SingleMute::getAlignmentLength(){ + return alignmentLength; +} + +int SingleMute::getIBP(){ + return IBP; +} + +void SingleMute::genSing(string * sequence, vector mutes) { + seq = sequence; + //Assign vector of mutes to inputted vector + validIndexes = new vector(); + validIndexes->reserve(mutes.size()); +// n_valid_indices = mutes.size(); + //Adds all valid indexes to the validIndexes vector + for(int i = 0; i < mutes.size(); i++){ + if(mutes.at(i)){ + validIndexes->push_back(i); + } + } + n_valid_indices = validIndexes->size(); + float tempFloat; + //Calculate number of characters each mutation can mutate + tempFloat = maxDel / 100.0; + maxDel = (int) (tempFloat * seq->length()); + tempFloat = maxInsert / 100.0; + maxInsert = (tempFloat * seq->length()); + tempFloat = maxSwitch / 100.0; + maxSwitch = (tempFloat * seq->length()); + //Calculates Alignment length and identical base pairs + alignmentLength = maxInsert; + IBP = maxDel + maxSwitch; + //Vectors to keep track of where insertions and deletions need to be made + insertions = new vector(); + insertions->reserve(maxInsert); + deletions = new vector(); + deletions->reserve(maxDel); + //Since switch makes 2 invalid, + //switchNucl is run maxSwitch/2 times + for (int i = 0; i < maxSwitch; i++) { + switchNucl(); + } + //Insert maxInsert times + for (int i = 0; i < maxInsert; i++) { + insert(); + } + //Delete maxDel nucleotides + for (int i = 0; i < maxDel; i++) { + deleteNucl(); + } + //perfroms deletions and insertions + performInsertAndDelete(); +} + +void SingleMute::insert() { + //Calculate 
the index to insert at + int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); + insertions->push_back(validIndexes->at(index)); + std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); + n_valid_indices--; + //Remove that as a valid index +// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); +} + +void SingleMute::deleteNucl() { + //Choose a valid index to delete + int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); + deletions->push_back(validIndexes->at(index)); + std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); + n_valid_indices--; + //Remove from the +// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); +} + +void SingleMute::switchNucl() { + //Pick a random valid index + int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); + char character = seq->at(validIndexes->at(index)); + int value; + //Keep generating characters until one different than the one we are trying to switch is found + while(character == seq->at(validIndexes->at(index))){ + value = intRandMod_(percAs + percCs + percGs + percTs); + if (value < percAs) { + character = 'A'; + } else if (value < percAs + percCs) { + character = 'C'; + } else if (value < percAs + percCs + percGs) { + character = 'G'; + } else { + character = 'T'; + } + } + //Switch that character + seq->at(validIndexes->at(index)) = character; + std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); + n_valid_indices--; + //Remove the chosen index as a valid index +// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); +} + +void SingleMute::performInsertAndDelete(){ + //sorts the vectors based + std::sort(insertions->begin(), insertions->end()); + std::sort(deletions->begin(), deletions->end()); + //Goes through both vectors untill all have been processed + for(int i = insertions->size() - 1, j = 
deletions->size() - 1; i >= 0 && j >= 0;){ + //If i is -1, all insertions have been processed + if(i == -1){ + removeNucl(deletions->at(j)); + j--; + } + //If i is -1, all deletions have been processed + else if(j == -1){ + insertNucl(insertions->at(i)); + i--; + } + else{ + //If the index of the current next insertion is higher than the next deletion, insert, else delete + if(insertions->at(i) > deletions->at(j)){ + insertNucl(insertions->at(i)); + i--; + } + else{ + removeNucl(deletions->at(j)); + j--; + } + } + } +} + +void SingleMute::removeNucl(int index){ + seq->erase(index, 1); +} + +void SingleMute::insertNucl(int index){ + string character; + //Use a weighted die to + //calculate which character to insert + int value = intRandMod_(percAs + percCs + percGs + percTs); + if (value < percAs) { + character = "A"; + } else if (value < percAs + percCs) { + character = "C"; + } else if (value < percAs + percCs + percGs) { + character = "G"; + } else { + character = "T"; + } + //insert at that index + seq->insert(index, character); +} diff --git a/src/cluster/src/SingleMute.h b/src/cluster/src/SingleMute.h new file mode 100644 index 0000000..b0bf93d --- /dev/null +++ b/src/cluster/src/SingleMute.h @@ -0,0 +1,89 @@ +/** + * Author: Alex Baumgartner + * The Bioinformatics Toolsmith Laboratory, the University of Tulsa + * 5/15/2018 + * + * Purpose: + * The pupose of this module is to perform single mutations on sequences + */ + +#ifndef SINGLEMUTE_H +#define SINGLEMUTE_H + +#include +#include +#include +#include + +using namespace std; + +class SingleMute { +public: + /* + Constructor, creates values + and assignes allocations based on inputted data + + @param: + int: percentage of A's + int: percentage of C's + int: percentage of G's + int: percentage of T's + int: The total allocation for non-single mutations + */ + SingleMute(int, int, int, int, int); + /* + Takes a string and mutates it based + on the allocation given in the constructor + + @param: + std::string 
*: pointer to the sequence to be mutated + std::vector : boolean vector of valid and invalid indexes + */ + void genSing(std::string *, std::vector); + + int getAlignmentLength(); + + int getIBP(); + + ~SingleMute(){delete validIndexes; delete insertions; delete deletions;}; + + private: + int percAs; + int percCs; + int percGs; + int percTs; + int maxDel; + int maxInsert; + int maxSwitch; + int alignmentLength; + int IBP; + std::vector * validIndexes; + size_t n_valid_indices = 0; + std::vector * deletions; + std::vector * insertions; + std::string * seq; + /* + Inserts a sequence randomly in the list + at a valid index + */ + void insert(); + /* + Deletes a random nucleotide + that has not been previously mutated + */ + void deleteNucl(); + /* + Switches two random nucleotides + that have not been mutated previously + */ + void switchNucl(); + /* + Performs necessary insertions and deletions in the string based on the insertion and deletion vectors + */ + void performInsertAndDelete(); + + void removeNucl(int); + + void insertNucl(int); +}; +#endif diff --git a/src/cluster/src/Trainer.cpp b/src/cluster/src/Trainer.cpp new file mode 100644 index 0000000..432d624 --- /dev/null +++ b/src/cluster/src/Trainer.cpp @@ -0,0 +1,930 @@ +#include "Trainer.h" +#include "HandleSeq.h" +#include "Loader.h" +#include "ClusterFactory.h" +#include +#include +#include +#include +#include "../../utility/GlobAlignE.h" +#include "../../utility/AffineId.h" +#include "needleman_wunsch.h" +#include "Predictor.h" +#include "GLM.h" +#include "Feature.h" +#include "Progress.h" +#include + +template +double Trainer::align(Point *a, Point* b) const +{ + auto sa = a->get_data_str(); + auto sb = b->get_data_str(); + int la = sa.length(); + int lb = sb.length(); + + // needleman_wunsch nw(sa, sb, 2, -3, 5, 2); + // return nw.identity(nw.align()); + GlobAlignE galign(sa.c_str(), 0, la-1, + sb.c_str(), 0, lb-1, + 1, -1, 2, 1); + + return galign.getIdentity(); + +} + + +template 
+std::tuple*,double,size_t,size_t> Trainer::get_close(Point *p, bvec_iterator istart, bvec_iterator iend, bool &is_min_r) const +{ + int ncols = weights.getNumRow(); +#pragma omp declare reduction(pmax:std::tuple*,double,size_t,size_t>: \ + omp_out = get<1>(omp_in) > get<1>(omp_out) ? omp_in : omp_out ) \ + initializer (omp_priv=std::make_tuple((Point*)NULL,-1,0,0)) + + std::tuple*, + double, + size_t, + size_t> result = std::tuple*, double, size_t, size_t>(NULL, + -1, + 0, + 0); + bool has_found = false; + + #ifdef DEBUG + cout << "begin " << istart.r << " " << istart.c << " end " << iend.r << " " << iend.c << endl; + for (auto data : *istart.col) { + cout << "\t" << data.size() << endl; + } + #endif +// #pragma omp parallel for reduction(pmin:result), reduction(||:has_found) +// for (bvec_iterator i = istart; i <= iend; i++) { +// if (i <= iend) { +// Point* pt = (*i).first; +// double sum = weights.get(0, 0); +// double dist = 0; +// for (int col = 1; col < ncols; col++) { +// if (col == 1) { +// dist = ff.at(col-1)(pt, p); +// sum += weights.get(col, 0) * dist; +// } else { +// sum += weights.get(col, 0) * ff.at(col-1)(pt, p); +// } +// } +// double res = round(1.0 / (1 + exp(-sum))); + +// // set second to true if result is not 1.0 +// // which means it will be removed +// result = std::make_pair(pt, dist); +// has_found = (res != 1.0); +// (*i).second = (res != 1.0); +// } +// } + bool is_min = true; +#pragma omp parallel for reduction(pmax:result), reduction(&&:is_min) + for (bvec_iterator i = istart; i <= iend; ++i) { + Point* pt = (*i).first; + double sum = weights.get(0, 0); + double dist = 0; + auto cache = feat->compute(*pt, *p); + for (int col = 1; col < ncols; col++) { + if (col == 1) { + dist = (*feat)(col-1, cache); + sum += weights.get(col, 0) * dist; + } else { + sum += weights.get(col, 0) * (*feat)(col-1, cache); + } + } + double res = round(1.0 / (1 + exp(-sum))); + //cout << "res: " << res << " " << dist << endl; +// set second to true if 
result is not 1.0 + // which means it will be removed + result = (dist > std::get<1>(result)) ? std::make_tuple(pt, dist, i.r, i.c) : result; + is_min = is_min && (res != 1.0); +// has_found = has_found || (res != 1.0); + if (res == 1.0) { + *i = std::make_pair(pt, true); +// (*i).second = true; + } + } + +// is_min = !has_found; + is_min_r = is_min; +// return get<0>(result); + return result; + +} + +template +long Trainer::merge(vector > ¢ers, long current, long begin, long last) const +{ +#pragma omp declare reduction(ldpmax:std::pair: \ + omp_out = omp_in.second > omp_out.second ? omp_in : omp_out ) \ + initializer (omp_priv=std::make_pair(0, std::numeric_limits::min())) + std::pair best = std::make_pair(0, std::numeric_limits::min()); + Point* p = centers[current].getCenter(); +#pragma omp parallel for reduction(ldpmax:best) + for (long i = begin; i <= last; i++) { + double sum = weights.get(0, 0); + double dist = 0; + Point* cen = centers[i].getCenter(); + auto cache = feat->compute(*cen, *p); + for (int col = 1; col < weights.getNumRow(); col++) { + double d = (*feat)(col-1, cache); + if (col == 1) { + dist = d; + } + sum += weights.get(col, 0) * d; + } + double res = round(1.0 / (1 + exp(-sum))); + + if (res == 1) { + best = best.second > dist ? 
// Deterministic RNG functor for std::random_shuffle: seeding with a
// fixed value in the constructor makes every shuffle reproducible
// run-to-run.
struct rng {
	rng() {
		srand(0);
	}
	// Uniform draw in [0, upper), as required by random_shuffle.
	int operator()(int upper) const {
		return rand() % upper;
	}
};
+ auto scmp = [](const pair*,Point*>,double> a, const pair*,Point*>, double> b) { + return a.first.first->get_header().compare(b.first.first->get_header()) < 0 + || + (a.first.first->get_header() == b.first.first->get_header() && a.first.second->get_header().compare(b.first.second->get_header()) < 0); + }; + + // todo: convert to std::map + std::set*,Point*>, double>, decltype(scmp)> buf_pos(scmp), buf_neg(scmp); + std::vector*,Point*>, double> > buf_vpos, buf_vneg; +// std::sort(vec.begin(), vec.end(), cmp); + // cout << "Before Pair: " << vec[0].first->get_header() << ", " << vec[0].second->get_header() << endl; + // cout << "Before Pair: " << vec[vec.size()-1].first->get_header() << ", " << vec[vec.size()-1].second->get_header() << endl; + + rng gen; + random_shuffle(vec.begin(), vec.end(), gen); + // cout << "Pair: " << vec[0].first->get_header() << ", " << vec[0].second->get_header() << endl; + // cout << "Pair: " << vec[vec.size()-1].first->get_header() << ", " << vec[vec.size()-1].second->get_header() << endl; + vector scores(vec.size()); + Progress p(vec.size(), "Alignment"); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < vec.size(); i++) { + double algn = align(vec[i].first, vec[i].second); + bool is_pos = algn >= cutoff; +#pragma omp critical + { + scores[i] = algn; + p++; + if (is_pos) { + buf_pos.insert(make_pair(vec[i], algn)); + //cout << vec[i].first->get_header() << " " << vec[i].second->get_header() << " " << algn << endl; + } else { + buf_neg.insert(make_pair(vec[i], algn)); + } + +#ifdef DEBUG + cout << vec[i].first->get_header() << " WITH " << vec[i].second->get_header() << " " << algn << endl; + #endif + + } + } + p.end(); + std::sort(scores.begin(), scores.end()); + std::cout << "positive=" << buf_pos.size() << " negative=" << buf_neg.size() << endl; + if (buf_pos.empty() || buf_neg.empty()) { + std::cout << "Identity value does not match sampled data: "; + if (buf_pos.empty()) { + std::cout << "Too many sequences below 
identity"; + } else { + std::cout << "Too many sequences above identity"; + } + std::cout << std::endl; + exit(0); + } + size_t m_size = std::min(buf_pos.size(), buf_neg.size()); + + std::cout << "resizing positive" << std::endl; + for (auto p : buf_pos) { + buf_vpos.push_back(p); + } + for (auto p : buf_neg) { + buf_vneg.push_back(p); + } + auto bp = resize_vec(buf_vpos, m_size); + std::cout << "resizing negative" << std::endl; + auto bn = resize_vec(buf_vneg, m_size); + auto ret = make_pair(bp, bn); + std::cout << "positive=" << ret.first.size() << " negative=" << ret.second.size() << endl; + return ret; + +} +template +void Trainer::filter(Point *p, vector *, bool> > &vec) const +{ + for (auto& pt : vec) { + double sum = weights.get(0, 0); + auto cache = feat->compute(*pt.first, *p); + for (int col = 1; col < weights.getNumRow(); col++) { + sum += weights.get(col, 0) * (*feat)(col-1, cache); + } + double res = round(1.0 / (1 + exp(-sum))); + pt.second = (res != 1); + } + vec.erase(std::remove_if(vec.begin(), vec.end(), [](pair*, bool> p) { + return p.second; + }), vec.end()); +} + +template +Point* Trainer::closest(Point *p, vector *, bool> > &vec) const +{ + Point* best_pt = NULL; + double best_dist = 0; + for (auto& pt : vec) { + double sum = weights.get(0, 0); + double dist = pt.first->distance_d(*p); + if (best_pt == NULL || dist < best_dist) { + best_dist = dist; + best_pt = pt.first; + } + } + return best_pt; +} + +template +std::pair Trainer::generate_feat_mat(pair *, Point *> >, vector *, Point *> > > &data, int ncols) +{ + int nrows = data.first.size() + data.second.size(); + matrix::Matrix feat_mat(nrows, ncols); + matrix::Matrix labels(nrows, 1); +#pragma omp parallel for + for (int i = 0; i < data.first.size(); i++) { + auto kv = data.first[i]; + int row = i; + auto cache = feat->compute(*kv.first, *kv.second); + for (int col = 0; col < ncols; col++) { + + if (col == 0) { + feat_mat.set(row, col, 1); + } else { +// double val = ff[col-1](kv.first, 
kv.second); + ////#pragma omp critical + double val = (*feat)(col-1, cache); + feat_mat.set(row, col, val); + } + + } + ////#pragma omp critical + labels.set(row, 0, 1); + } +#pragma omp parallel for + for (int i = 0; i < data.second.size(); i++) { + auto kv = data.second[i]; + int row = data.first.size() + i; + auto cache = feat->compute(*kv.first, *kv.second); + for (int col = 0; col < ncols; col++) { + + if (col == 0) { + feat_mat.set(row, col, 1); + } else { +// double val = ff[col-1](kv.first, kv.second); + ////#pragma omp critical + double val = (*feat)(col-1, cache); + feat_mat.set(row, col, val); + } + + } + ////#pragma omp critical + labels.set(row, 0, -1); + } + return std::make_pair(feat_mat, labels); +} +template +double Trainer::train_n(pair *, Point *> >, vector *, Point *> > > &data, int ncols) +{ + std::cout << "done" << endl; + cout << "Training on " << ncols << " columns" << endl; + int nrows = data.first.size() + data.second.size(); + + matrix::Matrix feat_mat(nrows, ncols); + matrix::Matrix labels(nrows, 1); + double avg_label = 0; +#pragma omp parallel for + for (int i = 0; i < data.first.size(); i++) { + auto kv = data.first[i]; + int row = i; + auto cache = feat->compute(*kv.first, *kv.second); + for (int col = 0; col < ncols; col++) { + + if (col == 0) { + feat_mat.set(row, col, 1); + } else { +// double val = ff[col-1](kv.first, kv.second); + ////#pragma omp critical + double val = (*feat)(col-1, cache); + feat_mat.set(row, col, val); + } + + } + ////#pragma omp critical + labels.set(row, 0, 1); + } +#pragma omp parallel for + for (int i = 0; i < data.second.size(); i++) { + auto kv = data.second[i]; + int row = data.first.size() + i; + auto cache = feat->compute(*kv.first, *kv.second); + for (int col = 0; col < ncols; col++) { + + if (col == 0) { + feat_mat.set(row, col, 1); + } else { +// double val = ff[col-1](kv.first, kv.second); + ////#pragma omp critical + double val = (*feat)(col-1, cache); + feat_mat.set(row, col, val); + } + + } + 
// Draw a uniform random value from the window [mute - rng, mute + rng],
// clamped to the absolute bounds [low, high].  When the clamped window
// collapses to a single value, that value is returned exactly.
double random_between(double mute, double rng, double low, double high)
{
	const double lo = std::max(low, mute - rng);
	const double hi = std::min(high, mute + rng);
	const double t = (double)rand() / RAND_MAX; // t in [0, 1]
	return lo + t * (hi - lo);
}
pos_buf.push_back(pr); + } else { + neg_buf.push_back(pr); + } + } +} + +template +std::pair*,Point*> >, + vector*,Point*> > >, + std::pair*,Point*> >, + vector*,Point*> > > > +Trainer::new_get_labels(std::vector*> &points, size_t num_sample, double id, uintmax_t &_id) +{ + std::sort(points.begin(), points.end(), [](const Point* a, + const Point* b) -> bool { + return a->get_length() < b->get_length(); + }); + std::pair*,Point*> >, + vector*,Point*> > > training, testing; + num_sample = min(num_sample, points.size()); + vector*> f_points_tr, f_points_test; + size_t total_size = points.size(); + for (int i = 0; i < num_sample; i++) { + int i1 = floor((double)i * total_size / (2 * num_sample)); + int i2 = floor((i + 1) * (double)total_size / (2 * num_sample)); + f_points_tr.push_back(points.at(i1)); + f_points_test.push_back(points.at(i2)); + } + std::vector > pos_buf, neg_buf; + cout << "mutating sequences" << endl; + for (auto p : f_points_tr) { + mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); + mutate_seqs(p, 5, pos_buf, neg_buf, 40, 100 * id, _id); + } + size_t buf_size = std::min(pos_buf.size(), neg_buf.size()); + cout << "training +: " << pos_buf.size() << endl; + cout << "training -: " << neg_buf.size() << endl; + std::vector > > bins; + size_t num_bins; + for (int i = 0; i < 10; i++) { + double max_identity = id * 100 + (100 - 100.0 * id) * (i+1) / 10.0; + double min_identity = id * 100 + (100 - 100.0 * id) * i / 10.0; + cout << "I = " << i << " " << min_identity << " -> " << max_identity << endl; + bins.push_back(std::vector >()); + for (auto p : pos_buf) { + if (p.val > min_identity && p.val < max_identity) { + bins[i].push_back(p); + } + } + for (auto p : neg_buf) { + if (p.val > min_identity && p.val < max_identity) { + bins[i].push_back(p); + } + } + } + std::random_shuffle(pos_buf.begin(), pos_buf.end()); + std::random_shuffle(neg_buf.begin(), neg_buf.end()); + for (size_t i = 0; i < buf_size; i++) { + cout << "TR: P " << pos_buf[i].val << 
endl; + cout << "TR: N " << neg_buf[i].val << endl; + if (pos_buf[i].val > id) { + training.first.emplace_back(pos_buf[i].first, pos_buf[i].second); + } else { + training.second.emplace_back(pos_buf[i].first, pos_buf[i].second); + } + if (neg_buf[i].val > id) { + training.first.emplace_back(neg_buf[i].first, neg_buf[i].second); + } else { + training.second.emplace_back(neg_buf[i].first, neg_buf[i].second); + } + } + pos_buf.clear(); + neg_buf.clear(); + for (auto p : f_points_test) { + mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); + mutate_seqs(p, 5, pos_buf, neg_buf, 40, 100 * id, _id); + } + buf_size = std::min(pos_buf.size(), neg_buf.size()); + cout << "testing +: " << pos_buf.size() << endl; + cout << "testing -: " << neg_buf.size() << endl; + std::random_shuffle(pos_buf.begin(), pos_buf.end()); + std::random_shuffle(neg_buf.begin(), neg_buf.end()); + for (size_t i = 0; i < buf_size; i++) { + cout << "TE: P " << pos_buf[i].val << endl; + cout << "TE: N " << neg_buf[i].val << endl; + if (pos_buf[i].val > id) { + testing.first.emplace_back(pos_buf[i].first, pos_buf[i].second); + } else { + testing.second.emplace_back(pos_buf[i].first, pos_buf[i].second); + } + if (neg_buf[i].val > id) { + testing.first.emplace_back(neg_buf[i].first, neg_buf[i].second); + } else { + testing.second.emplace_back(neg_buf[i].first, neg_buf[i].second); + } + } + return make_pair(training, testing); +} +template +void Trainer::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, double acc_cutoff) +{ + + if (k != 0) { + std::cout << "Splitting data" << endl; + uintmax_t _id = points.size(); + Predictor pred(k, cutoff, PRED_MODE_CLASS, feat_type, + mut_type, min_n_feat, max_n_feat, min_id); + pred.train(points, points, _id, n_points); + delete feat; + auto pr = pred.get_class(); + feat = pr.first; + glm = pr.second; + weights = glm.get_weights(); + return; + } else { + feat->add_feature(FEAT_ALIGN, Combo::xy); +// 
feat->normalize(training.first); + feat->finalize(); + weights = matrix::Matrix(2, 1); + weights.set(0, 0, -1 * cutoff); + weights.set(1, 0, 1); + return; + } +} + +template +vector*, Point*> > Trainer::split() +{ + // n_points total per side + // max_pts_from_one on each side + auto cmp = [](const pair*,Point*> a, const pair*,Point*> b) { + return a.first->get_header().compare(b.first->get_header()) < 0 +|| + (a.first->get_header() == b.first->get_header() && a.second->get_header().compare(b.second->get_header()) < 0); + }; + set*, Point*>, decltype(cmp)> pairs(cmp); +// vector*, Point*> > pairs; + const size_t total_num_pairs = n_points * 2; + int aerr = 0; + int bandwidth = (1.0 - cutoff) * 10000; + vector*> indices; + std::sort(points.begin(), points.end(), [](const Point* a, + const Point* b) -> bool { + return a->get_length() < b->get_length(); + }); + Point *begin_pt = points[points.size()/2]; + + std::sort(points.begin(), points.end(), [&](const Point* a, + const Point* b) -> bool { + return a->distance(*begin_pt) < b->distance(*begin_pt); + }); + int num_iterations = ceil(((double)n_points) / max_pts_from_one) - 1; + for (int i = 0; i <= num_iterations; i++) { + int idx = i * (points.size()-1) / num_iterations; + indices.push_back(points[idx]); + } + cout << "Point pairs: " << indices.size() << endl; + size_t to_add_each = max_pts_from_one / 2; + Progress prog(indices.size(), "Sorting data"); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < indices.size(); i++) { + vector*> pts = points; + Point* p = indices[i]; + std::sort(pts.begin(), pts.end(), [&](const Point* a, + const Point* b) { + return a->distance(*p) < b->distance(*p); + }); + // do binary search with alignment + size_t offset = pts.size() / 4; + size_t pivot = offset; + double closest_algn = 20000; + size_t best_pivot = 2 * offset; + for (pivot = 2 * offset; offset > 0; offset /= 2) { + double algn = align(p, pts[pivot]); + // cout << "Pivot: " << pivot << " point: " << 
pts[pivot]->get_header() << " sim: " << align(p, pts[pivot]) << endl; + if (fabs(algn - cutoff) < closest_algn) { + closest_algn = fabs(algn - cutoff); + best_pivot = pivot; + } + if (algn < cutoff) { + pivot -= offset; + } else if (algn > cutoff) { + pivot += offset; + } else { + break; + } + } +// cout << "Pivot: " << pivot << " point: " << pts[pivot]->get_header() << " sim: " << align(p, pts[pivot]) << endl; + // before: [0, pivot) size: to_add_each + // after: [pivot, size) size: to_add_each + double before_inc = (double)pivot / to_add_each; + double after_inc = ((double)(pts.size() - pivot)) / to_add_each; +#pragma omp critical + { + prog++; + if (before_inc < 1) { + aerr = 1; + } else if (after_inc < 1) { + aerr = -1; + } + } + double before_start = 0; + double after_start = pivot; + double top_start = 0; + size_t size_before = pairs.size(); + vector*,Point*> > buf; + // Adds points above cutoff by adding before_inc + for (int i = 0; i < to_add_each; i++) { + int idx = round(before_start); + int dist = pts[idx]->distance(*p); + // cout << p->get_header() << " " << pts[idx]->get_header() << " " << dist << endl; + auto pr = p->get_header().compare(pts[idx]->get_header()) < 0 ? make_pair(p, pts[idx]) : make_pair(pts[idx], p); + buf.push_back(pr); + before_start += before_inc; + } + // Adds points before cutoff by adding after_inc + for (int i = 0; i < to_add_each && round(after_start) < pts.size(); i++) { + int idx = round(after_start); + int dist = pts[idx]->distance(*p); + // cout << p->get_header() << " " << pts[idx]->get_header() << " " << dist << endl; + auto pr = p->get_header().compare(pts[idx]->get_header()) < 0 ? 
make_pair(p, pts[idx]) : make_pair(pts[idx], p); + buf.push_back(pr); + after_start += after_inc; + } +#pragma omp critical + { + // Adds buffer to total pairs + // for (auto p : buf) { +// pairs.push_back(p); +// } + pairs.insert(std::begin(buf), std::end(buf)); + } +// cout << "added " << pairs.size() - size_before << " pairs" << endl; + } + prog.end(); + if (aerr < 0) { + cerr << "Warning: Alignment may be too small for sampling" << endl; + } else if (aerr > 0) { + cerr << "Warning: Alignment may be too large for sampling" << endl; + } + int i = 0; + for (auto a : pairs) { + cout << "Before Pair: " << a.first->get_header() << ", " << a.second->get_header() << endl; + if (++i == 4) { + break; + } + } + return std::vector*,Point*> >(pairs.begin(), pairs.end()); +} +template +std::pair*, Point*>, double>, + std::map*, Point*>, double> > +Trainer::split_old() { + using train_map = std::map*, Point*>, double>; + std::pair split; + int bandwidth = (1.0 - cutoff) * 10000; + size_t last_cutoff = points.size() / 2; + while (split.first.size() < n_points) { + Point *p = points[last_cutoff]; + std::sort(points.begin(), points.end(), [&](const Point* a, + const Point* b) -> bool { + return a->distance(*p) < b->distance(*p); + }); + int b_cutoff = points.size() / 2; + for (int offset = b_cutoff; offset >= 1; offset /= 2) { + int dist = p->distance(*points[b_cutoff]); + if (dist < bandwidth) { + b_cutoff += offset; + } else if (dist > bandwidth) { + b_cutoff -= offset; + } else { + break; + } + } + size_t cutoff_index = points.size(); + const size_t count = split.first.size(); + + if (b_cutoff >= max_pts_from_one) { + double ratio = (double)b_cutoff / max_pts_from_one; + double sum = 0; + for (size_t q = 0; q < max_pts_from_one; q++) { + size_t i = round(sum); + if (i >= points.size()) { + cerr << "this shouldn't happen" << endl; + throw "this shouldn't happen"; + } + double alignment = align(p, points[i]); + if (alignment < cutoff) { + cutoff_index = i + 10; + break; + } + 
/*
 * Greatest common divisor of two non-negative integers via Euclid's
 * algorithm.  For b <= 0 the first argument is returned unchanged, so
 * callers may safely pass zeros (gcd(x, 0) == x).
 */
int gcd(int a, int b)
{
	if (b <= 0) {
		return a;
	}
	return gcd(b, a % b);
}

/*
 * GCD of the entries of v, skipping zero entries after the first so
 * they do not collapse the divisor.
 *
 * Fix: the original indexed v[0] unconditionally, which is undefined
 * behavior on an empty vector; an empty input now returns 1 (a safe
 * common divisor for the callers that scale by it).
 */
int gcd_vec(std::vector<int> v)
{
	if (v.empty()) {
		return 1;
	}
	int ret = v[0];
	for (size_t i = 1; i < v.size(); i++) {
		if (v[i] == 0) {
			continue;
		}
		ret = gcd(ret, v[i]);
	}
	return ret;
}
+ scaled.push_back(round(scale_factor * fabs(mat[i][j]))); + } + } + double common_div = gcd_vec(scaled); + sigma = signs[0] * scaled[0] / common_div; + epsilon = signs[1] * scaled[1] / common_div; + int count = 2; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + mat[i][j] = signs[count] * scaled[count] / common_div; + count++; + } + } +} + +template +void Trainer::init(double (&matrix)[4][4], double sig, double eps) +{ + scale(matrix, sig, eps); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + mat[i][j] = (int)matrix[i][j]; + } + } + sigma = (int)sig; + eps = (int)eps; + // sf.emplace_back([](Point* a, Point *b) { + // return Feature::manhattan(*a, *b); + // }, false); + // sf.emplace_back([](Point* a, Point *b) { + // return Feature::length_difference(*a, *b); + // }, false); + // sf.emplace_back([](Point* a, Point *b) { + // return Feature::rree_k_r(*a, *b); + // }, false); + // sf.emplace_back([](Point* a, Point* b) { + // return Feature::length_difference(*a, *b); + // }, false); + // sf.emplace_back([](Point* a, Point* b) { + // return Feature::intersection(*a, *b); + // }, true); + // sf.emplace_back([](Point* a, Point* b) { + // return Feature::jenson_shannon(*a, *b); + // }, false); + // sf.emplace_back([](Point* a, Point* b) { + // return Feature::simratio(*a, *b); + // }, true); + // sf.emplace_back([](Point* a, Point* b) { + // return Feature::squaredchord(*a, *b); + // }, false); + // sf.emplace_back([](Point* a, Point* b) { + // return Feature::manhattan(*a, *b); + // }, false); + // sf.emplace_back([](Point* a, Point* b) { + // return Feature::pearson(*a, *b); + // }, true); + +} +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; diff --git a/src/cluster/src/Trainer.h b/src/cluster/src/Trainer.h new file mode 100644 index 0000000..8801172 --- /dev/null +++ b/src/cluster/src/Trainer.h @@ -0,0 +1,67 @@ +/* -*- C++ -*- */ 
+#ifndef TRAINER_H +#define TRAINER_H + +#include "Point.h" +#include "GLM.h" +#include "Feature.h" +#include "bvec.h" +#include "Center.h" +#include "LogTable.h" +#include + +template +class Trainer { +public: + Trainer(std::vector*> v, size_t num_points, size_t largest_count, double cutoff_, size_t max_pts_from_one_, double (&matrix)[4][4], double sig, double eps, int ksize) : points(v), n_points(num_points), cutoff(cutoff_), max_pts_from_one(max_pts_from_one_), k(ksize) { + init(matrix, sig, eps); + uintmax_t size = 1000 * 1000 * 10; + feat = new Feature(k); + }; + ~Trainer() { delete feat_mat; delete feat; } + std::pair*, Point*>, double>, + std::map*, Point*>, double> > split_old(); + vector*,Point*> > split(); + double train_n(pair*, + Point* + > >, + vector*, + Point*> > > &data, int ncols); + void train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, double acc_cutoff=97.5); + void mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id); + std::tuple*,double,size_t,size_t> get_close(Point*, bvec_iterator istart, bvec_iterator iend, bool& is_min) const; +// vector > get_close(Point*, const vector*,int> > &, bool& is_min) const; + std::pair*,Point*> >, + vector*,Point*> > >, + std::pair*,Point*> >, + vector*,Point*> > > > + new_get_labels(std::vector*> &points, size_t num_sample, double id, uintmax_t &_id); + void filter(Point*, vector*,bool> >&) const; + Point* closest(Point*, vector*,bool> >&) const; + long merge(vector > ¢ers, long current, long begin, long end) const; +// Point* merge(Point*, vector*,double> >&) const; +private: + matrix::GLM glm; + matrix::Matrix weights; + double align(Point* a, Point* b) const; + std::pair generate_feat_mat(pair*, + Point* + > >, + vector*, + Point*> > > &data, int ncols); + void init(double (&matrix)[4][4], double sig, double eps); + pair*, + Point* + > >, + vector*, + Point*> > > get_labels(vector*,Point*> >&, double 
cutoff) const; + Feature *feat; + int mat[4][4]; + int sigma, epsilon; + std::vector*> points; + matrix::Matrix *feat_mat = NULL; + size_t n_points, max_pts_from_one; + double cutoff; + int k; +}; +#endif diff --git a/src/cluster/src/bvec.cpp b/src/cluster/src/bvec.cpp new file mode 100644 index 0000000..2efed1e --- /dev/null +++ b/src/cluster/src/bvec.cpp @@ -0,0 +1,332 @@ +/* -*- C++ -*- + * + * bvec.cpp + * + * Author: Benjamin T James + */ +#include "bvec.h" +#include +template +bvec::bvec(vector& lengths, uint64_t bin_size) +{ + uint64_t num_points = lengths.size(); + std::sort(std::begin(lengths), std::end(lengths)); + for (uint64_t i = 0; i < lengths.size(); i += bin_size) { + begin_bounds.push_back(lengths[i]); + // uint64_t last_index = std::min((uint64_t)lengths.size() - 1, + // i + bin_size - 1); + //std::cout << "[" << i << " " << last_index << "]" << std::endl; + } + data.reserve(begin_bounds.size()); + for (uint64_t i = 0; i < begin_bounds.size(); i++) { + data.push_back({}); + } +} + +template +Point* bvec::pop() +{ + for (auto& bin : data) { + if (!bin.empty()) { + Point* p = bin[0].first; + bin.erase(std::begin(bin)); + return p; + } + } + return NULL; +} + +template +Point* bvec::peek() const +{ + for (auto& bin : data) { + if (!bin.empty()) { + Point* p = bin[0].first; + return p; + } + } + return NULL; +} + +template +bool bvec::inner_index_of(uint64_t length, size_t &idx, size_t *pfront, size_t *pback) const +{ + + if (data.at(idx).empty() || idx == data.size()) { + if (pfront) { + for (size_t i = 0; i < data.size(); i++) { + if (!data.at(i).empty()) { + idx = i; + *pfront = 0; + break; + } + } + } + if (pback) { + for (int i = data.size()-1; i >= 0; i--) { + if (!data.at(i).empty()) { + idx = i; + *pback = 0; + break; + } + } + } + return true; + } + size_t front = 0, back = 0; + size_t low = 0, high = data.at(idx).size() - 1; + bool found = false; + if (length < data[idx][low].first->get_length() && pfront != NULL) { + *pfront = low; + } + if 
(length > data[idx][high].first->get_length() && pback != NULL) { + *pback = high; + } + for (;low <= high;) { + size_t mid = (low + high) / 2; + uint64_t d = data[idx][mid].first->get_length(); + if (d == length) { + front = mid; + back = mid; + found = true; + break; + } else if (length < d) { + high = mid; + } else if (length > d) { + low = mid + 1; + } + if (low == high) { + found = true; + front = low; + back = high; + break; + } + } + if (pfront) { + for (long i = front; i >= 0 + && data[idx][i].first->get_length() == length; i--) { + front = i; + } + *pfront = front; + } + if (pback) { + for (long i = back; i < data[idx].size() + && data[idx][i].first->get_length() == length; i++) { + back = i; + } + *pback = back; + } + return true; +} + +template +bool bvec::index_of(uint64_t point, size_t* pfront, size_t* pback) const +{ + size_t low = begin_bounds.size()-1, high = 0; + + for (size_t i = 0; i < begin_bounds.size(); i++) { + size_t prev = 0; + size_t prev_index = 0; + if (i > 0) { + prev_index = i - 1; + prev = begin_bounds[i-1]; + } + if (point >= prev && point <= begin_bounds[i]) { + low = std::min(low, prev_index); + high = std::max(high, prev_index); + } + } + if (point >= begin_bounds[begin_bounds.size()-1]) { + high = std::max(high, begin_bounds.size()-1); + } + if (pfront) { + *pfront = low; + } + if (pback) { + *pback = high; + } + return true; +} + +template +void bvec::insert(Point *p) +{ + uint64_t len = p->get_length(); + size_t front = 0, back = 0; + bool good = index_of(len, &front, &back); + if (!good || front > back) { + std::cerr << "error: list is not sorted" << std::endl; + } + std::vector min_sizes; + size_t minimum = std::numeric_limits::max(); + for (size_t i = front; i <= back; i++) { + size_t sz = data[i].size(); + if (sz < minimum) { + minimum = sz; + min_sizes.clear(); + min_sizes.push_back(i); + } else if (sz == minimum) { + min_sizes.push_back(i); + } + } + if (min_sizes.empty()) { + std::cerr << "error: no bins to insert into, 
item not inserted" << std::endl; + } + auto mid_min = min_sizes[min_sizes.size() / 2]; + data.at(mid_min).push_back(std::make_pair(p, false)); +} + +template +size_t bvec::size() const +{ + size_t num_bins = data.size(); + size_t total_size = 0; + for (size_t i = 0; i < num_bins; i++) { + total_size += data[i].size(); + } + return total_size; +} + +template +size_t bvec::report() const +{ + cout << "BVec: "; + size_t num_bins = data.size(); + cout << "num_bins=" << num_bins << endl; + size_t total_size = 0; + for (size_t i = 0; i < num_bins; i++) { + uint64_t next_bound = std::numeric_limits::max(); + if (i + 1 < num_bins) { + next_bound = begin_bounds[i+1]; + } + cout << "Bin " << i << ": [" << begin_bounds[i] << " " << next_bound << "] size=" << data[i].size() << endl; + total_size += data[i].size(); + } + cout << "total_size=" << total_size << endl; + return total_size; +} +template +void bvec::insert_finalize() +{ + auto sorter = [](const std::pair*,bool> a, const std::pair*,bool> b) { + return a.first->get_length() < b.first->get_length(); + }; + for (size_t i = 0; i < data.size(); i++) { + std::sort(std::begin(data[i]), std::end(data[i]), sorter); + data[i].shrink_to_fit(); + } +} + +template +bool bvec::empty() const +{ + bool is_empty = true; + for (auto bin : data) { + if (!bin.empty()) { + is_empty = false; + break; + } + } + return is_empty; +} + + +template +uint64_t bvec::absolute_idx(bvec_idx_t idx) const +{ + uint64_t ptr = 0; + for (int i = 0; i < idx.first; i++) { + ptr += data[i].size(); + } + ptr += idx.second; + return ptr; +} + +template +std::pair +bvec::get_range(uint64_t begin_len, uint64_t end_len) const +{ + /* perform binary search to find bin */ + bvec_idx_t front, back; + front.first = 0; + front.second = 0; + back.first = data.size()-1; + back.second = data[back.first].size() - 1; + if (!index_of(begin_len, &front.first, NULL)) { + throw 100; + } + if (!index_of(end_len, NULL, &back.first)) { + throw 100; + } + if 
(!inner_index_of(begin_len, front.first, &front.second, NULL)) { + throw 100; + } + if (!inner_index_of(end_len, back.first, NULL, &back.second)) { + throw 100; + } + // if (back.first != data.size()) { // ++ to make it an end iterator + // if (back.second != data[back.first].size()) { + // back.second++; + // } else { + // back.first++; + // back.second = 0; + // } + // } else { + // throw 101; + // } + return std::make_pair(front, back); +} + +template +void bvec::erase(size_t r, size_t c) +{ + data.at(r).erase(data.at(r).begin() + c); +} + +/* + * TODO: change available to Center class so no intermediate copying is done + */ +template +void bvec::remove_available(bvec_idx_t begin, bvec_idx_t end, std::vector*> &available) +{ + size_t a = begin.first; + size_t b = end.first; + int num = 0, new_num = 0; + auto func = [](const bv_data_type d) { return d.second; }; + auto inserter = [&](const std::pair*,bool> p) { + if (p.second) { +#pragma omp critical + available.push_back(p.first); + } + }; + #pragma omp parallel for + for (size_t i = a; i <= b; i++) { + /* move marked points to end of vector, then copy, then erase */ + //const auto last = std::remove_if(std::begin(data[i]), std::end(data[i]), func); + for (int j = 0; j < data[i].size(); j++) { + auto kv = data[i][j]; + if (kv.second) { +#pragma omp critical + { + available.push_back(kv.first); + } + } + } + data[i].erase(std::remove_if(std::begin(data[i]), std::end(data[i]), func), std::end(data[i])); + } +} + + +template +bvec_iterator bvec::iter(bvec_idx_t idx) +{ + return bvec_iterator(idx.first, idx.second, &data); +} + + +template class bvec; +template class bvec; +template class bvec; +template class bvec; +template class bvec; +template class bvec; diff --git a/src/cluster/src/bvec.h b/src/cluster/src/bvec.h new file mode 100644 index 0000000..43384e9 --- /dev/null +++ b/src/cluster/src/bvec.h @@ -0,0 +1,69 @@ +/* -*- C++ -*- + * + * bvec.h + * + * Author: Benjamin T James + */ +#ifndef BVEC_H +#define 
BVEC_H + +#include "Point.h" +#include "bvec_iterator.h" + +typedef struct bvec_idx { + size_t first, second; +} bvec_idx_t; + +/* + * operations needed: + * + * find bounds (range) + * get available or min and remove + * + */ +template +using bv_data_type = std::pair*, bool>; + +template +using bv_row_type = vector >; + +template +using bv_col_type = vector >; + +template +class bvec { +public: + bvec(vector& lengths, uint64_t bin_size=1000); + + Point* pop(); + Point* peek() const; + void insert(Point* data); + void insert_finalize(); /* sorts bins */ + + + bool index_of(uint64_t length, size_t* front, size_t* back) const; + bool inner_index_of(uint64_t length, size_t& idx, size_t *front, size_t *back) const; + bool empty() const; + + std::pair + get_range(uint64_t begin_len, uint64_t end_len) const; + + void remove_available(bvec_idx_t begin, bvec_idx_t end, std::vector*> &); + + uint64_t absolute_idx(bvec_idx_t idx) const; + + bvec_iterator iter(bvec_idx_t idx); + typedef bvec_iterator iterator; + typedef bvec_iterator const_iterator; + + size_t report() const; + size_t size() const; + + void erase(size_t r, size_t c); +private: + bv_col_type data; + vector begin_bounds; +}; + + +#endif diff --git a/src/cluster/src/bvec_iterator.cpp b/src/cluster/src/bvec_iterator.cpp new file mode 100644 index 0000000..f8d1c76 --- /dev/null +++ b/src/cluster/src/bvec_iterator.cpp @@ -0,0 +1,28 @@ +#include "bvec_iterator.h" + +template +bvec_iterator bvec_iterator::operator++() +{ + if (r != col->size()) { + if (c + 1 < col->at(r).size()) { + c++; + } else { + r++; + c = 0; + while (r < col->size() && col->at(r).empty()) { + r++; + } + } + } else { + cerr << "tried incrementing null iterator" << endl; + throw 10; + } + return *this; +} + +template class bvec_iterator; +template class bvec_iterator; +template class bvec_iterator; +template class bvec_iterator; +template class bvec_iterator; +template class bvec_iterator; diff --git a/src/cluster/src/bvec_iterator.h 
b/src/cluster/src/bvec_iterator.h new file mode 100644 index 0000000..6be6ba8 --- /dev/null +++ b/src/cluster/src/bvec_iterator.h @@ -0,0 +1,84 @@ +/* -*- C++ -*- + * + * bvec_iterator.h + * + * Author: Benjamin T James + */ +#include "bvec.h" +#ifndef BVEC_ITERATOR_H +#define BVEC_ITERATOR_H + + +template +class bvec_iterator { +public: + // iterator: split ALL possible points into chunks by indices + using dtype = std::pair*,bool>; + using vtype = vector >; + bvec_iterator(size_t _r, + size_t _c, + vtype* col_) : r(_r), c(_c), col(col_) {} + + bvec_iterator operator++(); + bvec_iterator operator++(int x) { + return ++(*this); + } + dtype& operator*() { + return col->at(r).at(c); + } + void operator+=(int64_t n) { + if (n < 0) { + throw "oops"; + } + for (int i = 0; i < n; i++) { + operator++(); + } + } + bool operator==(const bvec_iterator& rhs) const { + return rhs.c == c && rhs.r == r; + } + bool operator<(const bvec_iterator& rhs) const { + if (r < rhs.r) { + return true; + } else if (r == rhs.r) { + return c < rhs.c; + } else { + return false; + } + } + bool operator<=(const bvec_iterator& rhs) const { + if (r < rhs.r) { + return true; + } else if (r == rhs.r) { + return c <= rhs.c; + } else { + return false; + } + } + bool operator!=(const bvec_iterator& rhs) const { + return r != rhs.r || c != rhs.c; + } + int64_t operator-(const bvec_iterator& rhs) const { + int64_t sum = 0; + if (*this < rhs) { + return -1 * (rhs - *this); + } + // subtract cols until last row is reached + if (r == rhs.r) { + return c - rhs.c; + } + sum += c; + sum += col->at(rhs.r).size() - rhs.c; + for (size_t i = rhs.r + 1; i < r; i++) { + sum += col->at(i).size(); + } + return sum; + } + // bvec_iterator operator[](uint64_t idx) { + + // } +//private: + size_t r,c; + vtype* col; +}; +#endif diff --git a/src/cluster/src/main.cpp b/src/cluster/src/main.cpp new file mode 100644 index 0000000..562fd96 --- /dev/null +++ b/src/cluster/src/main.cpp @@ -0,0 +1,12 @@ +/* -*- C++ -*- + * + * 
main.cpp + * + * Author: Benjamin T James + */ +#include "Runner.h" +int main(int argc, char **argv) +{ + Runner runner(argc, argv); + return runner.run(); +} diff --git a/src/cluster/src/needleman_wunsch.cpp b/src/cluster/src/needleman_wunsch.cpp new file mode 100644 index 0000000..46d0b5b --- /dev/null +++ b/src/cluster/src/needleman_wunsch.cpp @@ -0,0 +1,153 @@ +/* -*- C++ -*- + * + * needleman_wunsch.cpp + * + * Author: Benjamin T James + */ +#include "needleman_wunsch.h" + + +//flags that can be combined +#define HORIZ 1 +#define VERT 2 +#define DIAG 4 +void needleman_wunsch::fill(int i, int j) +{ + if (i == 0 || j == 0) { + if (i == j) { + int offset = at(i, j); + score[offset] = 0; + direction[offset] = DIAG; // for backtracking + horiz_gap_len[offset] = 0; + vert_gap_len[offset] = 0; + } else if (i == 0) { + int offset = at(0, j); + int last_offset = at(0, j-1); + score[offset] = score[last_offset] + gap(j); + horiz_gap_len[offset] = 0; + vert_gap_len[offset] = j; + direction[offset] = VERT; + } else { // j == 0 + int offset = at(i, 0); + int last_offset = at(i-1, 0); + score[offset] = score[last_offset] + gap(i); + horiz_gap_len[offset] = i; + vert_gap_len[offset] = 0; + direction[offset] = HORIZ; + } + return; + } + int i_diag = at(i-1, j-1); + int i_horiz = at(i-1, j); + int i_vert = at(i, j-1); + int i_cur = at(i, j); + + int hlen = horiz_gap_len[i_horiz] + 1; + int vlen = vert_gap_len[i_vert] + 1; + + int diag_score = score[i_diag] + match_score(s1[i], s2[j]); + int horiz_score = score[i_horiz] + gap(hlen); + int vert_score = score[i_vert] + gap(vlen); + score[i_cur] = std::max(std::max(diag_score, horiz_score), vert_score); + direction[i_cur] = 0; + + // we could match multiple high scores + if (score[i_cur] == diag_score) { + direction[i_cur] |= DIAG; + } + if (score[i_cur] == vert_score) { + direction[i_cur] |= VERT; + vert_gap_len[i_cur] = vlen; + } else { + vert_gap_len[i_cur] = 0; + } + if (score[i_cur] == horiz_score) { + direction[i_cur] |= 
HORIZ; + horiz_gap_len[i_cur] = hlen; + } else { + horiz_gap_len[i_cur] = 0; + } +} + +std::pair +needleman_wunsch::backtrack() +{ + std::string a1 = "", a2 = ""; + int cur_i = l1 - 1; + int cur_j = l2 - 1; + while (cur_i >= 0 && cur_j >= 0) { + uint8_t dir = direction[at(cur_i, cur_j)]; + if (dir & DIAG) { + a1 += s1[cur_i--]; + a2 += s2[cur_j--]; + } else if (dir & HORIZ) { + a1 += s1[cur_i--]; + a2 += '-'; + } else if (dir & VERT) { + a1 += '-'; + a2 += s2[cur_j--]; + } + } + std::string r1(a1.rbegin(), a1.rend()); + std::string r2(a2.rbegin(), a2.rend()); + return std::make_pair(r1, r2); +} + + +std::pair +needleman_wunsch::align() +{ + for (int i = 0; i < l1; i++) { + for (int j = 0; j < l2; j++) { + fill(i, j); + } + } + return backtrack(); +} +double needleman_wunsch::identity(std::pair alignment) const +{ + int len = alignment.first.length(); + double count = 0; + for (int i = 0; i < len; i++) { + if (alignment.first[i] == alignment.second[i]) { + count++; + } + } + return 1.0 * count / len; +} + +int needleman_wunsch::gap(int gaplen) const +{ + return sigma + (gaplen - 1) * epsilon; +} + +int needleman_wunsch::match_score(char a, char b) const +{ + return a == b ? 
match : mismatch; +} + +needleman_wunsch::needleman_wunsch(const std::string &s1_, const std::string& s2_, int match_, int mismatch_, int sigma_, int epsilon_) +{ + int l1_ = s1_.length(); + int l2_ = s2_.length(); + if (l1_ >= l2_) { + l1 = l1_; + l2 = l2_; + s1 = s1_; + s2 = s2_; + } else { + l1 = l2_; + l2 = l1_; + s1 = s2_; + s2 = s1_; + } + sigma = -sigma_; + epsilon = -epsilon_; + match = match_; + mismatch = mismatch_; + int matlen = l1 * l2; + score = new int[matlen]; + direction = new uint8_t[matlen]; + horiz_gap_len = new int[matlen]; + vert_gap_len = new int[matlen]; +} diff --git a/src/cluster/src/needleman_wunsch.h b/src/cluster/src/needleman_wunsch.h new file mode 100644 index 0000000..031ea10 --- /dev/null +++ b/src/cluster/src/needleman_wunsch.h @@ -0,0 +1,43 @@ +/* -*- C++ -*- + * + * needleman_wunsch.h + * + * Author: Benjamin T James + */ + +#ifndef NEEDLEMAN_WUNSCH_H +#define NEEDLEMAN_WUNSCH_H + +#include + +class needleman_wunsch { +public: + needleman_wunsch(const std::string& s1, const std::string& s2, int match_, int mismatch_, int sigma_, int epsilon_); + ~needleman_wunsch() { + delete[] score; + delete[] direction; + delete[] horiz_gap_len; + delete[] vert_gap_len; + } + double identity(std::pair p) const; + std::pair + align(); +private: + int gap(int gap_len) const; + int match_score(char a, char b) const; + inline int at(int a, int b) const { return a * l2 + b; }; + void fill(int,int); + std::pair backtrack(); + int match, mismatch; + int sigma, epsilon; + std::string s1, s2; + int l1, l2; + + int *score; + uint8_t *direction; + int *horiz_gap_len; + int *vert_gap_len; +}; + + +#endif diff --git a/src/exception/FileDoesNotExistException.cpp b/src/exception/FileDoesNotExistException.cpp new file mode 100644 index 0000000..9093f5a --- /dev/null +++ b/src/exception/FileDoesNotExistException.cpp @@ -0,0 +1,25 @@ +/* + * FileDoesNotExistException.cpp + * + * Created on: Apr 30, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include 
"FileDoesNotExistException.h" + +#include +#include + +using namespace std; + +namespace exception{ + +FileDoesNotExistException::FileDoesNotExistException(string massage) { + cerr << "File Does Not Exist Exception" << endl; + cerr << massage << endl; +} + +FileDoesNotExistException::~FileDoesNotExistException() { + // TODO Auto-generated destructor stub +} +} diff --git a/src/exception/FileDoesNotExistException.h b/src/exception/FileDoesNotExistException.h new file mode 100644 index 0000000..c8ec3ae --- /dev/null +++ b/src/exception/FileDoesNotExistException.h @@ -0,0 +1,23 @@ +/* + * FileDoesNotExistException.h + * + * Created on: Apr 30, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef FILEDOESNOTEXISTEXCEPTION_H_ +#define FILEDOESNOTEXISTEXCEPTION_H_ + +#include + +using namespace std; + +namespace exception { + class FileDoesNotExistException { + public: + FileDoesNotExistException(string); + ~FileDoesNotExistException(); + }; +} + +#endif /* FILEDOESNOTEXISTEXCEPTION_H_ */ diff --git a/src/exception/InvalidInputException.cpp b/src/exception/InvalidInputException.cpp new file mode 100644 index 0000000..d69f67c --- /dev/null +++ b/src/exception/InvalidInputException.cpp @@ -0,0 +1,24 @@ +/* + * InvalidInputException.cpp + * + * Created on: May 1, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "InvalidInputException.h" + +#include +#include + +using namespace std; +namespace exception{ + +InvalidInputException::InvalidInputException(string msg) { + cerr << "Invalid Input Exception" << endl; + cerr << msg << endl; +} + +InvalidInputException::~InvalidInputException() { + // TODO Auto-generated destructor stub +} +} diff --git a/src/exception/InvalidInputException.h b/src/exception/InvalidInputException.h new file mode 100644 index 0000000..9db2534 --- /dev/null +++ b/src/exception/InvalidInputException.h @@ -0,0 +1,23 @@ +/* + * InvalidInputException.h + * + * Created on: May 1, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef 
INVALIDINPUTEXCEPTION_H_ +#define INVALIDINPUTEXCEPTION_H_ + +#include + +using namespace std; + +namespace exception { + class InvalidInputException { + public: + InvalidInputException(string); + ~InvalidInputException(); + }; +} + +#endif /* INVALIDINPUTEXCEPTION_H_ */ diff --git a/src/exception/InvalidOperationException.cpp b/src/exception/InvalidOperationException.cpp new file mode 100644 index 0000000..8d1a6f6 --- /dev/null +++ b/src/exception/InvalidOperationException.cpp @@ -0,0 +1,19 @@ +/* + * InvalidOperationException.cpp + * + * Created on: Dec 20, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include +#include "InvalidOperationException.h" + + +namespace exception { + +InvalidOperationException::InvalidOperationException(string msg) : std::runtime_error(msg) { + cerr << "Invalid Operation Exception." << endl; + cerr << what() << endl; +} + +} diff --git a/src/exception/InvalidOperationException.h b/src/exception/InvalidOperationException.h new file mode 100644 index 0000000..74eb1e7 --- /dev/null +++ b/src/exception/InvalidOperationException.h @@ -0,0 +1,26 @@ +/* + * InvalidOperationException.h + * + * Created on: Dec 20, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef INVALIDOPERATIONEXCEPTION_H_ +#define INVALIDOPERATIONEXCEPTION_H_ + +#include +#include + +using namespace std; + +namespace exception { + +class InvalidOperationException : public std::runtime_error{ +public: + InvalidOperationException(string msg); + //virtual ~InvalidOperationException(); +}; + +} + +#endif /* INVALIDOPERATIONEXCEPTION_H_ */ diff --git a/src/exception/InvalidOrderOfOperationsException.cpp b/src/exception/InvalidOrderOfOperationsException.cpp new file mode 100644 index 0000000..cb51650 --- /dev/null +++ b/src/exception/InvalidOrderOfOperationsException.cpp @@ -0,0 +1,24 @@ +/* + * InvalidOrderOfOperationsException.cpp + * + * Created on: Apr 26, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "InvalidOrderOfOperationsException.h" + +#include 
+#include + +using namespace std; +namespace exception{ + +InvalidOrderOfOperationsException::InvalidOrderOfOperationsException(string massage) { + cerr << "Invalid Order Of Operations Exception" << endl; + cerr << massage << endl; +} + +InvalidOrderOfOperationsException::~InvalidOrderOfOperationsException() { + // TODO Auto-generated destructor stub +} +} diff --git a/src/exception/InvalidOrderOfOperationsException.h b/src/exception/InvalidOrderOfOperationsException.h new file mode 100644 index 0000000..b813d1a --- /dev/null +++ b/src/exception/InvalidOrderOfOperationsException.h @@ -0,0 +1,23 @@ +/* + * InvalidOrderOfOperationsException.h + * + * Created on: Apr 26, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef INVALIDORDEROFOPERATIONSEXCEPTION_H_ +#define INVALIDORDEROFOPERATIONSEXCEPTION_H_ + +#include + +using namespace std; + +namespace exception{ + class InvalidOrderOfOperationsException { + public: + InvalidOrderOfOperationsException(string); + ~InvalidOrderOfOperationsException(); + }; +} + +#endif /* INVALIDORDEROFOPERATIONSEXCEPTION_H_ */ diff --git a/src/exception/InvalidScoreException.cpp b/src/exception/InvalidScoreException.cpp new file mode 100644 index 0000000..2e2829f --- /dev/null +++ b/src/exception/InvalidScoreException.cpp @@ -0,0 +1,24 @@ +/* + * InvalidScoreException.cpp + * + * Created on: Apr 27, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "InvalidScoreException.h" + +#include +#include + +using namespace std; +namespace exception{ + +InvalidScoreException::InvalidScoreException(string massage) { + cerr << "Invalid Score Exception." 
<< endl; + cerr << massage << endl; +} + +InvalidScoreException::~InvalidScoreException() { + // TODO Auto-generated destructor stub +} +} diff --git a/src/exception/InvalidScoreException.h b/src/exception/InvalidScoreException.h new file mode 100644 index 0000000..89bdd34 --- /dev/null +++ b/src/exception/InvalidScoreException.h @@ -0,0 +1,23 @@ +/* + * InvalidScoreException.h + * + * Created on: Apr 27, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef INVALIDSCOREEXCEPTION_H_ +#define INVALIDSCOREEXCEPTION_H_ + +#include + +using namespace std; + +namespace exception{ + class InvalidScoreException { + public: + InvalidScoreException(string); + virtual ~InvalidScoreException(); + }; +} + +#endif /* INVALIDSCOREEXCEPTION_H_ */ diff --git a/src/exception/InvalidStateException.cpp b/src/exception/InvalidStateException.cpp new file mode 100644 index 0000000..d39f985 --- /dev/null +++ b/src/exception/InvalidStateException.cpp @@ -0,0 +1,25 @@ +/* + * InvalidStateException.cpp + * + * Created on: Aug 9, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include +#include +#include "InvalidStateException.h" + +using namespace std; + + +namespace exception { +InvalidStateException::InvalidStateException(string msg) : + std::runtime_error(msg) { + cerr << "Invalid State Exception." 
<< endl; + cerr << what() << endl; +} +} + +//InvalidStateException::~InvalidStateException() { +// TODO Auto-generated destructor stub +//} diff --git a/src/exception/InvalidStateException.h b/src/exception/InvalidStateException.h new file mode 100644 index 0000000..826e59f --- /dev/null +++ b/src/exception/InvalidStateException.h @@ -0,0 +1,23 @@ +/* + * InvalidStateException.h + * + * Created on: Aug 9, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef INVALIDSTATEEXCEPTION_H_ +#define INVALIDSTATEEXCEPTION_H_ + +#include +#include + +using namespace std; + +namespace exception { + class InvalidStateException : public std::runtime_error{ + public: + InvalidStateException(string); + }; +} + +#endif /* INVALIDSTATEEXCEPTION_H_ */ diff --git a/src/nonltr/ChromDetector.cpp b/src/nonltr/ChromDetector.cpp new file mode 100644 index 0000000..58d3a7a --- /dev/null +++ b/src/nonltr/ChromDetector.cpp @@ -0,0 +1,41 @@ +/* + * ChromDetector.cpp + * + * Created on: Nov 8, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include + +#include "ChromDetector.h" +#include "Detector.h" +#include "../utility/Util.h" + +using namespace std; +using namespace nonltr; +using namespace utility; + +ChromDetector::ChromDetector(double s, double w, double pDelta, double b, + double mDelta, vector * scores, + const vector *> * segmentList) { + + regions = new vector *>(); + + for (int i = 0; i < segmentList->size(); i++) { + Detector * detector = new Detector(segmentList->at(i)->at(0), + segmentList->at(i)->at(1), s, w, pDelta, b, mDelta, scores); + vector *> * segRegions = detector->getRegions(); + regions->insert(regions->end(), segRegions->begin(), segRegions->end()); + delete detector; + } +} + +ChromDetector::~ChromDetector() { + Util::deleteInVector(regions); + regions->clear(); + delete regions; +} + +vector *> * ChromDetector::getRegions() { + return regions; +} diff --git a/src/nonltr/ChromDetector.h b/src/nonltr/ChromDetector.h new file mode 100644 index 0000000..e745295 
--- /dev/null +++ b/src/nonltr/ChromDetector.h @@ -0,0 +1,29 @@ +/* + * ChromDetector.h + * + * Created on: Nov 8, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef CHROMDETECTOR_H_ +#define CHROMDETECTOR_H_ + +#include + +using namespace std; + +namespace nonltr{ +class ChromDetector { + +private: + vector *> * regions; + +public: + ChromDetector(double, double, double, double, double, vector *, + const vector *> *); + virtual ~ChromDetector(); + vector *> * getRegions(); +}; +} + +#endif /* CHROMDETECTOR_H_ */ diff --git a/src/nonltr/ChromDetectorMaxima.cpp b/src/nonltr/ChromDetectorMaxima.cpp new file mode 100644 index 0000000..51f7900 --- /dev/null +++ b/src/nonltr/ChromDetectorMaxima.cpp @@ -0,0 +1,94 @@ +/* + * ChromDetectorMaxima.cpp + * + * Created on: Jun 6, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "ChromDetectorMaxima.h" + +namespace nonltr { + +ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m, + double t, double p, int e, vector * oScores, + ChromosomeOneDigit * chrom) { + header = chrom->getHeader(); + start(s, w, m, t, p, e, oScores, chrom->getSegment()); + +} + +ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m, + double t, double p, int e, vector * oScores, const vector *> * segmentList) { + header = string("chrUnknown"); + start(s, w, m, t, p, e, oScores, segmentList); +} + +void ChromDetectorMaxima::start(double s, double w, double m, double t, + double p, int e, vector * oScores, + const vector *> * segmentList) { + + regionList = new vector (); + + int segmentCount = segmentList->size(); + for (int i = 0; i < segmentCount; i++) { + int segStart = segmentList->at(i)->at(0); + int segEnd = segmentList->at(i)->at(1); + + // The effective length is shorter than the actual length by 2w + int effLen = 2 * w + 10; + int segLen = segEnd - segStart + 1; + + if (segLen > effLen) { + DetectorMaxima * detector = new DetectorMaxima(segStart, segEnd, s, + w, m, t, p, e, oScores); + + const 
vector * segRegions = detector->getRegionList(); + int segRegionCount = segRegions->size(); + for (int h = 0; h < segRegionCount; h++) { + regionList->push_back(new Location(*(segRegions->at(h)))); + } + + delete detector; + } else { + cout << "\tSkipping a short segment: "; + cout << segStart << "-" << segEnd << endl; + } + } +} + +ChromDetectorMaxima::~ChromDetectorMaxima() { + Util::deleteInVector(regionList); + regionList->clear(); + delete regionList; +} + +void ChromDetectorMaxima::printIndex(string outputFile) { + printIndex(outputFile, false); +} + +void ChromDetectorMaxima::printIndex(string outputFile, bool canAppend) { + ofstream outIndex; + + if (canAppend) { + outIndex.open(outputFile.c_str(), ios::out | ios::app); + } else { + outIndex.open(outputFile.c_str(), ios::out); + } + + // Write the index of the repeat segment [x,y[ + for (int j = 0; j < regionList->size(); j++) { + outIndex << header << ":"; + outIndex << ((int) (regionList->at(j)->getStart())) << "-"; + outIndex << ((int) (regionList->at(j)->getEnd() + 1)) << " "; + outIndex << endl; + } + + outIndex.close(); +} + +const vector* ChromDetectorMaxima::getRegionList() const { + return regionList; +} + +} /* namespace nonltr */ diff --git a/src/nonltr/ChromDetectorMaxima.h b/src/nonltr/ChromDetectorMaxima.h new file mode 100644 index 0000000..c3c58df --- /dev/null +++ b/src/nonltr/ChromDetectorMaxima.h @@ -0,0 +1,47 @@ +/* + * ChromDetectorMaxima.h + * + * Created on: Jun 6, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef CHROMDETECTORMAXIMA_H_ +#define CHROMDETECTORMAXIMA_H_ + +#include +#include + +#include "ChromosomeOneDigit.h" +#include "DetectorMaxima.h" + +#include "../utility/Util.h" +#include "../utility/ILocation.h" +#include "../utility/Location.h" + +using namespace std; +using namespace utility; + +namespace nonltr { + +class ChromDetectorMaxima { +private: + vector * regionList; + string header; + + void start(double, double, double, double, double, int, vector *, + 
const vector *> *); + +public: + ChromDetectorMaxima(double, double, double, double, double, int, + vector *, ChromosomeOneDigit *); + ChromDetectorMaxima(double, double, double, double, double, int, + vector *, const vector *> *); + virtual ~ChromDetectorMaxima(); + const vector* getRegionList() const; + void printIndex(string); + void printIndex(string, bool); + +}; + +} /* namespace nonltr */ +#endif /* CHROMDETECTORMAXIMA_H_ */ diff --git a/src/nonltr/ChromListMaker.cpp b/src/nonltr/ChromListMaker.cpp new file mode 100644 index 0000000..e684c3a --- /dev/null +++ b/src/nonltr/ChromListMaker.cpp @@ -0,0 +1,123 @@ +/* + * ChromListMaker.cpp + * + * Created on: Mar 13, 2014 + * Author: Hani Zakaira Girgis + */ + +#include "ChromListMaker.h" + +namespace nonltr { + +ChromListMaker::ChromListMaker(string seqFileIn) { + seqFile = seqFileIn; + chromList = new vector(); +} + +ChromListMaker::~ChromListMaker() { + Util::deleteInVector(chromList); + delete chromList; +} + + +std::istream& safe_getline(std::istream& is, std::string& t) +{ + t.clear(); + std::istream::sentry se(is, true); + std::streambuf* sb = is.rdbuf(); + for(;;) { + int c = sb->sbumpc(); + switch (c) { + case '\n': + return is; + case '\r': + if (sb->sgetc() == '\n') { + sb->sbumpc(); + } + return is; + case std::streambuf::traits_type::eof(): + if (t.empty()) { + is.setstate(std::ios::eofbit); + } + return is; + default: + t += (char)c; + } + } +} + +const vector * ChromListMaker::makeChromList() { + ifstream in(seqFile.c_str()); + bool isFirst = true; + Chromosome * chrom; + + while (in.good()) { + string line; + safe_getline(in, line); + if (line[0] == '>') { + if (!isFirst) { + chrom->finalize(); + chromList->push_back(chrom); + } else { + isFirst = false; + } + + chrom = new Chromosome(); + chrom->setHeader(line); + } else if (line[0] == ' ' || line[0] == '\t') { + bool all_spaces = true; + for (auto c : line) { + if (c != ' ' && c != '\t') { + all_spaces = false; + } + } + if (all_spaces) { + 
continue; + } + std::ostringstream oss; + oss << chrom->getHeader() << line; + std::string new_header = oss.str(); + chrom->setHeader(new_header); + } else { + chrom->appendToSequence(line); + } + } + chrom->finalize(); + chromList->push_back(chrom); + in.close(); + + return chromList; +} + +const vector * ChromListMaker::makeChromOneDigitList() { + ifstream in(seqFile.c_str()); + bool isFirst = true; + ChromosomeOneDigit * chrom; + + while (in.good()) { + string line; + safe_getline(in, line); + if (line[0] == '>') { + if (!isFirst) { + chrom->finalize(); + chromList->push_back(chrom); + } else { + isFirst = false; + } + + chrom = new ChromosomeOneDigit(); + chrom->setHeader(line); + } else { + chrom->appendToSequence(line); + } + } + + chrom->finalize(); + chromList->push_back(chrom); + in.close(); + + return chromList; +} + +} +/* namespace nonltr */ diff --git a/src/nonltr/ChromListMaker.h b/src/nonltr/ChromListMaker.h new file mode 100644 index 0000000..a60fe2f --- /dev/null +++ b/src/nonltr/ChromListMaker.h @@ -0,0 +1,38 @@ +/* + * ChromListMaker.h + * + * Created on: Mar 13, 2014 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef CHROMLISTMAKER_H_ +#define CHROMLISTMAKER_H_ + +#include +#include + +#include "Chromosome.h" +#include "ChromosomeOneDigit.h" + +#include "../utility/Util.h" + +using namespace std; +using namespace utility; + +namespace nonltr { + +class ChromListMaker { +private: + vector * chromList; + string seqFile; + +public: + ChromListMaker(string); + virtual ~ChromListMaker(); + const vector * makeChromList(); + const vector * makeChromOneDigitList(); + +}; + +} /* namespace nonltr */ +#endif /* CHROMLISTMAKER_H_ */ diff --git a/src/nonltr/Chromosome.cpp b/src/nonltr/Chromosome.cpp new file mode 100644 index 0000000..2bea802 --- /dev/null +++ b/src/nonltr/Chromosome.cpp @@ -0,0 +1,308 @@ +/* + * Chromosome.cpp + * + * Created on: Mar 26, 2012 + * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH + */ +#include "Chromosome.h" + 
+Chromosome::Chromosome() { + header = string(""); + base = string(""); + isHeaderReady = false; + isBaseReady = false; + isFinalized = false; +} + +Chromosome::Chromosome(string fileName) { + chromFile = fileName; + readFasta(); + help(1000000, true); +} + +Chromosome::Chromosome(string fileName, bool canMerge) { + chromFile = fileName; + readFasta(); + help(1000000, canMerge); +} + +Chromosome::Chromosome(string fileName, int len) { + chromFile = fileName; + readFasta(); + help(len, true); +} + +Chromosome::Chromosome(string &seq, string &info) { + header = info; + base = seq; + help(1000000, true); +} + +Chromosome::Chromosome(string &seq, string &info, int len) { + header = info; + base = seq; + help(len, true); +} + +void Chromosome::setHeader(string& info) { + if (isFinalized) { + string msg("This chromosome has been finalized. "); + msg.append("The header cannot be modified."); + throw InvalidOperationException(msg); + } else { + header = info; + isHeaderReady = true; + } +} + +/** + * This method can waste memory if the sequence is large. + * Consider using the method appendToSequence instead + */ +void Chromosome::setSequence(string& seq) { + if (isFinalized) { + string msg("This chromosome has been finalized. "); + msg.append("The sequence cannot be modified."); + throw InvalidOperationException(msg); + } else { + base = seq; + isBaseReady = true; + } +} + +void Chromosome::appendToSequence(const string& line) { + if (isFinalized) { + string msg("This chromosome has been finalized. "); + msg.append("The sequence cannot be modified."); + throw InvalidOperationException(msg); + } else { + base.append(line); + isBaseReady = true; + } +} + +void Chromosome::finalize() { + if (isFinalized) { + string msg("This chromosome has been already finalized. 
"); + msg.append("Finalize can be only called once."); + throw InvalidOperationException(msg); + } else if (!(isHeaderReady && isBaseReady)) { + string msg( + "The header and the sequence must be set before calling finalize"); + throw InvalidOperationException(msg); + } else { + help(1000000, true); + isFinalized = true; + } +} + +void Chromosome::help(int len, bool canMerge) { + effectiveSize = 0; + segLength = len; + segment = new vector *>(); +// segment->reserve(100); + + toUpperCase(); + removeN(); + if (canMerge) { + mergeSegments(); + } + makeSegmentList(); + calculateEffectiveSize(); +} + +Chromosome::~Chromosome() { + base.clear(); + + Util::deleteInVector(segment); + segment->clear(); + delete segment; +} + +void Chromosome::readFasta() { + bool isFirst = true; + header = string(""); + base = string(""); + + ifstream in(chromFile.c_str()); + while (in.good()) { + string line; + getline(in, line); + if (line[0] == '>') { + if (!isFirst) { + string msg = "Chromosome file: "; + msg = msg + chromFile; + msg = + msg + + " must have one sequence only. 
But it has more than one."; + throw InvalidInputException(msg); + } else { + header = line; + isFirst = false; + } + } else { + base.append(line); + } + } + in.close(); +} + +/** + * Convert alphabet to upper case if it has not been done before + **/ +void Chromosome::toUpperCase() { + for (int i = 0; i < base.length(); i++) { + base[i] = toupper(base[i]); + } +} + +/** + * Segment coordinates are inclusive [s,e] + **/ +void Chromosome::removeN() { + // Store non-N index + int start = -1; + for (int i = 0; i < base.size(); i++) { + if (base[i] != 'N' && start == -1) { + start = i; + } else if (base[i] == 'N' && start != -1) { + vector * v = new vector(); + v->push_back(start); + v->push_back(i - 1); + segment->push_back(v); + + start = -1; + } else if (i == base.size() - 1 && base[i] != 'N' && start != -1) { + vector * v = new vector(); + v->push_back(start); + v->push_back(i); + + segment->push_back(v); + start = -1; + } + } +} + +/** + * If the gap between two consecutive segments is less than 10 bp. + * Segments that are shorter than 20 bp are not added. 
+ */ +void Chromosome::mergeSegments() { + vector *> * mSegment = new vector *>(); + + int s = segment->at(0)->at(0); + int e = segment->at(0)->at(1); + + for (int i = 1; i < segment->size(); i++) { + int s1 = segment->at(i)->at(0); + int e1 = segment->at(i)->at(1); + + if (s1 - e < 10) { + e = e1; + } else { + if (e - s + 1 >= 20) { + vector * seg = new vector(); + seg->push_back(s); + seg->push_back(e); + mSegment->push_back(seg); + } + + s = s1; + e = e1; + } + } + + // Handle the last index + if (e - s + 1 >= 20) { + vector * seg = new vector(); + seg->push_back(s); + seg->push_back(e); + mSegment->push_back(seg); + } + + Util::deleteInVector(segment); + segment->clear(); + segment = mSegment; +} + +void Chromosome::makeSegmentList() { + vector *> * segmentList = new vector *>(); + int segmentCount = segment->size(); + for (int oo = 0; oo < segmentCount; oo++) { + int s = segment->at(oo)->at(0); + int e = segment->at(oo)->at(1); + + if (e - s + 1 > segLength) { + int fragNum = (int) (e - s + 1) / segLength; + + for (int h = 0; h < fragNum; h++) { + int fragStart = s + (h * segLength); + int fragEnd = + (h == fragNum - 1) ? 
e : fragStart + segLength - 1; + vector * v = new vector(); + v->push_back(fragStart); + v->push_back(fragEnd); + segmentList->push_back(v); + } + } else { + vector * v = new vector(); + v->push_back(segment->at(oo)->at(0)); + v->push_back(segment->at(oo)->at(1)); + segmentList->push_back(v); + } + } + + Util::deleteInVector(segment); + delete segment; + segment = segmentList; +} + +const string* Chromosome::getBase() { + return &base; +} + +const vector *> * Chromosome::getSegment() { + return segment; +} + +void Chromosome::printSegmentList(){ + int l = segment->size(); + cout << "Segment list size = " << l << endl; + for(int i = 0; i < l; i++){ + cout << segment->at(i)->at(0) << "\t"; + cout << segment->at(i)->at(1) << endl; + } +} + +string Chromosome::getHeader() { + return header; +} + +int Chromosome::size() { + return base.size(); +} + +void Chromosome::calculateEffectiveSize() { + int segmentCount = segment->size(); + for (int oo = 0; oo < segmentCount; oo++) { + int s = segment->at(oo)->at(0); + int e = segment->at(oo)->at(1); + effectiveSize += (e - s + 1); + } +} + +int Chromosome::getEffectiveSize() { + return effectiveSize; +} + +int Chromosome::getGcContent() { + int gc = 0; + int size = base.size(); + for (int i = 0; i < size; i++) { + char n = base.at(i); + if (n == 'C' || n == 'G') { + gc++; + } + } + return gc; +} diff --git a/src/nonltr/Chromosome.h b/src/nonltr/Chromosome.h new file mode 100644 index 0000000..0632458 --- /dev/null +++ b/src/nonltr/Chromosome.h @@ -0,0 +1,78 @@ +/* + * Chromosome.h + * + * Created on: Mar 26, 2012 + * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH + */ +#ifndef CHROMOSOME_H_ +#define CHROMOSOME_H_ + +#include +#include +#include +#include +#include + +#include "IChromosome.h" +#include "../exception/InvalidOperationException.h" +#include "../exception/InvalidInputException.h" +#include "../utility/Util.h" + +using namespace std; +using namespace nonltr; +using namespace utility; +using namespace exception; + 
+namespace nonltr { +class Chromosome: public IChromosome { +public: + Chromosome(); + Chromosome(string); + Chromosome(string, bool); + Chromosome(string, int); + Chromosome(string &, string&); + Chromosome(string &, string&, int); + + int getGcContent(); + + virtual ~Chromosome(); + + virtual const string* getBase(); + virtual const vector *> * getSegment(); + virtual void printSegmentList(); + virtual string getHeader(); + virtual int size(); + virtual int getEffectiveSize(); + virtual void setHeader(string&); + virtual void setSequence(string&); + virtual void appendToSequence(const string&); + virtual void finalize(); + + +protected: + string chromFile; + string header; + string base; + int effectiveSize; + int segLength; + + vector *> * segment; + void readFasta(); + void toUpperCase(); + void removeN(); + void mergeSegments(); + virtual void help(int, bool); + void makeSegmentList(); + void calculateEffectiveSize(); + +private: + bool isHeaderReady; + bool isBaseReady; + bool isFinalized; + + void reverseSegments(); + +}; +} + +#endif /* CHROMOSOME_H_ */ diff --git a/src/nonltr/ChromosomeOneDigit.cpp b/src/nonltr/ChromosomeOneDigit.cpp new file mode 100644 index 0000000..9af2c51 --- /dev/null +++ b/src/nonltr/ChromosomeOneDigit.cpp @@ -0,0 +1,246 @@ +/* + * ChromosomeOneDigit.cpp + * + * Created on: Jul 31, 2012 + * Author: Hani Zakaria Girgis, PhD at the NCB1/NLM/NIH + * A A + * T T + * G G + * C C + * R G or A + * Y T or C + * M A or C + * K G or T + * S G or C + * W A or T + * H A or C or T + * B G or T or C + * V G or C or A + * D G or T or A + * N G or T or A or C + */ +#include +#include + +#include "Chromosome.h" +#include "ChromosomeOneDigit.h" +#include "../exception/InvalidInputException.h" + +using namespace exception; + +namespace nonltr { + +ChromosomeOneDigit::ChromosomeOneDigit() : + Chromosome() { +} + +ChromosomeOneDigit::ChromosomeOneDigit(string fileName) : + Chromosome(fileName) { + help(); +} + 
+ChromosomeOneDigit::ChromosomeOneDigit(string seq, string info) : + Chromosome(seq, info) { + help(); +} + +void ChromosomeOneDigit::help() { + // Build codes + buildCodes(); + // Modify the sequence in the super class + encodeNucleotides(); +} + +void ChromosomeOneDigit::finalize() { + Chromosome::finalize(); + help(); +} + +void ChromosomeOneDigit::buildCodes() { + // Make map + codes = new map(); + + // Certain nucleotides + codes->insert(map::value_type('A', (char) 0)); + codes->insert(map::value_type('C', (char) 1)); + codes->insert(map::value_type('G', (char) 2)); + codes->insert(map::value_type('T', (char) 3)); + + // Common uncertain nucleotide + // codes->insert(map::value_type('N', (char) 4)); + + // Uncertain nucleotides + codes->insert(map::value_type('R', codes->at('G'))); + codes->insert(map::value_type('Y', codes->at('C'))); + codes->insert(map::value_type('M', codes->at('A'))); + codes->insert(map::value_type('K', codes->at('T'))); + codes->insert(map::value_type('S', codes->at('G'))); + codes->insert(map::value_type('W', codes->at('T'))); + codes->insert(map::value_type('H', codes->at('C'))); + codes->insert(map::value_type('B', codes->at('T'))); + codes->insert(map::value_type('V', codes->at('A'))); + codes->insert(map::value_type('D', codes->at('T'))); + codes->insert(map::value_type('N', codes->at('C'))); + codes->insert(map::value_type('X', codes->at('G'))); +} + +ChromosomeOneDigit::~ChromosomeOneDigit() { + codes->clear(); + delete codes; +} + +/** + * This method converts nucleotides in the segments to single digit codes + */ +void ChromosomeOneDigit::encodeNucleotides() { + + for (int s = 0; s < segment->size(); s++) { + int segStart = segment->at(s)->at(0); + int segEnd = segment->at(s)->at(1); + for (int i = segStart; i <= segEnd; i++) { + if (codes->count(base[i]) > 0) { + base[i] = codes->at(base[i]); + } else { + string msg = "Invalid nucleotide: "; + msg.append(1, base[i]); + throw InvalidInputException(msg); + } + } + } + + // 
Digitize skipped segments + int segNum = segment->size(); + if(segNum > 0){ + // The first interval - before the first segment + int segStart = 0; + int segEnd = segment->at(0)->at(0)-1; + + for (int s = 0; s <= segNum; s++) { + for (int i = segStart; i <= segEnd; i++) { + char c = base[i]; + if(c != 'N'){ + if (codes->count(c) > 0) { + base[i] = codes->at(c); + } else { + string msg = "Invalid nucleotide: "; + msg.append(1, c); + throw InvalidInputException(msg); + } + } + } + + // The regular intervals between two segments + if(s < segNum-1){ + segStart = segment->at(s)->at(1)+1; + segEnd = segment->at(s+1)->at(0)-1; + } + // The last interval - after the last segment + else if(s == segNum - 1){ + segStart = segment->at(s)->at(1)+1; + segEnd = base.size()-1; + } + } + } +} + +/* +void ChromosomeOneDigit::encodeNucleotides() { + int seqLen = base.size(); + + for (int i = 0; i < seqLen; i++) { + if (codes->count(base[i]) > 0) { + base[i] = codes->at(base[i]); + } else { + string msg = "Invalid nucleotide: "; + msg.append(1, base[i]); + throw InvalidInputException(msg); + } + } + +} +*/ + +/** + * Cannot be called on already finalized object. + */ +void ChromosomeOneDigit::makeR() { + //cout << "Making reverse ..." << endl; + makeReverse(); + reverseSegments(); +} + +/** + * Cannot be called on already finalized object. + */ +void ChromosomeOneDigit::makeRC() { + //cout << "Making reverse complement ..." 
<< endl; + makeComplement(); + makeReverse(); + reverseSegments(); +} + +void ChromosomeOneDigit::makeComplement() { + map complement; + + // Certain nucleotides + complement.insert(map::value_type((char) 0, (char) 3)); + complement.insert(map::value_type((char) 1, (char) 2)); + complement.insert(map::value_type((char) 2, (char) 1)); + complement.insert(map::value_type((char) 3, (char) 0)); + + // Unknown nucleotide + complement.insert(map::value_type('N', 'N')); + // complement.insert(map::value_type((char) 4, (char) 4)); + + // Convert a sequence to its complement + int seqLen = base.size(); + for (int i = 0; i < seqLen; i++) { + if (complement.count(base[i]) > 0) { + base[i] = complement.at(base[i]); + } else { + cerr << "Error: The digit " << (char) base[i]; + cerr << " does not represent a base." << endl; + exit(2); + } + } +} + +void ChromosomeOneDigit::makeReverse() { + int last = base.size() - 1; + + // Last index to be switched + int middle = base.size() / 2; + + for (int i = 0; i < middle; i++) { + char temp = base[last - i]; + base[last - i] = base[i]; + base[i] = temp; + } +} + +void ChromosomeOneDigit::reverseSegments() { + int segNum = segment->size(); + int lastBase = size() - 1; + + // Calculate the coordinate on the main strand + for (int i = 0; i < segNum; i++) { + vector * seg = segment->at(i); + + int s = lastBase - seg->at(1); + int e = lastBase - seg->at(0); + seg->clear(); + seg->push_back(s); + seg->push_back(e); + } + + // Reverse the regions within the list + int lastRegion = segNum - 1; + int middle = segNum / 2; + for (int i = 0; i < middle; i++) { + vector * temp = segment->at(lastRegion - i); + (*segment)[lastRegion - i] = segment->at(i); + (*segment)[i] = temp; + } +} + +} diff --git a/src/nonltr/ChromosomeOneDigit.h b/src/nonltr/ChromosomeOneDigit.h new file mode 100644 index 0000000..384698f --- /dev/null +++ b/src/nonltr/ChromosomeOneDigit.h @@ -0,0 +1,43 @@ +/* + * ChromosomeOneDigit.h + * + * Created on: Jul 31, 2012 + * Author: 
Hani Zakaria Girgis, PhD - NCBI/NLM/NIH + */ + +#ifndef CHROMOSOMEONEDIGIT_H_ +#define CHROMOSOMEONEDIGIT_H_ + +#include +#include "Chromosome.h" + +namespace nonltr { +class ChromosomeOneDigit: public Chromosome { + +private: + /* Fields */ + map * codes; + + /* Methods */ + void help(); + void buildCodes(); + void encodeNucleotides(); + + void makeReverse(); + void makeComplement(); + void reverseSegments(); + +public: + /* Methods */ + ChromosomeOneDigit(); + ChromosomeOneDigit(string); + ChromosomeOneDigit(string, string); + virtual ~ChromosomeOneDigit(); + virtual void finalize(); + + void makeR(); + void makeRC(); +}; +} + +#endif /* CHROMOSOMEONEDIGIT_H_ */ diff --git a/src/nonltr/ChromosomeRandom.cpp b/src/nonltr/ChromosomeRandom.cpp new file mode 100644 index 0000000..68ae15b --- /dev/null +++ b/src/nonltr/ChromosomeRandom.cpp @@ -0,0 +1,363 @@ +/* + * ChromosomeRandom.cpp + * + * Created on: Feb 4, 2013 + * Author: Hani Zakaria Girgis, PhD + * + */ + +#include +#include +#include +#include +#include +#include + +#include "ChromosomeRandom.h" +#include "../exception/InvalidInputException.h" +#include "../exception/InvalidStateException.h" +#include "../utility/Util.h" + +using namespace std; +using namespace exception; +using namespace utility; + +namespace nonltr { + +ChromosomeRandom::ChromosomeRandom(int nIn, IChromosome* oChromIn, + char unreadIn, vector* alphaIn) { + // Check the order + if (nIn < 0) { + string msg("The Markov order must be non-negative. "); + msg.append("The order received is: "); + msg.append(Util::int2string(nIn)); + msg.append("."); + throw InvalidInputException(msg); + } + + // n here is the length of the word, i.e. 
the order + 1 + n = nIn + 1; + oChrom = oChromIn; + unread = unreadIn; + alpha = alphaIn; + + // Initialize the random sequence + int size = oChrom->getBase()->size(); + rBase = new string(size, unread); + + // Initialize key list + keyList = new vector(); + + // Initialize the table + table = new map(); + + // Handle unusual characters in the first word of a segment + // Make map + codes = new map(); + codes->insert(map::value_type('A', 'A')); + codes->insert(map::value_type('C', 'C')); + codes->insert(map::value_type('G', 'G')); + codes->insert(map::value_type('T', 'T')); + codes->insert(map::value_type('R', 'G')); + codes->insert(map::value_type('Y', 'C')); + codes->insert(map::value_type('M', 'A')); + codes->insert(map::value_type('K', 'T')); + codes->insert(map::value_type('S', 'G')); + codes->insert(map::value_type('W', 'T')); + codes->insert(map::value_type('H', 'C')); + codes->insert(map::value_type('B', 'T')); + codes->insert(map::value_type('V', 'A')); + codes->insert(map::value_type('D', 'T')); + codes->insert(map::value_type('N', 'C')); + codes->insert(map::value_type('X', 'G')); + + // Start operations + cout << "\tFilling key list ..." << endl; + fillKeyList(); + + cout << "\tInitializing table ..." << endl; + initializeTable(); + + cout << "\tCounting words ..." << endl; + countWords(); + + cout << "\tCalculating probabilities ..." << endl; + convertToProbabilities(); + + //cout << "\tPrinting the table ..." << endl; + //printTable(); + + cout << "\tGenerating the random sequence ..." 
<< endl; + generateRandomSequence(); +} + +ChromosomeRandom::~ChromosomeRandom() { + codes->clear(); + delete codes; + + keyList->clear(); + delete keyList; + + table->clear(); + delete table; + + delete rBase; +} + +void ChromosomeRandom::fillKeyList() { + // Collect keys + int alphaCount = alpha->size(); + + // Order 0 + + for (int h = 0; h < alphaCount; h++) { + string s(""); + s.append(1, alpha->at(h)); + keyList->push_back(s); + } + + // Order 1 and higher + for (int g = 1; g < n; g++) { + vector o; + int keyListSize = keyList->size(); + for (int i = 0; i < keyListSize; i++) { + for (int j = 0; j < alphaCount; j++) { + string s(keyList->at(i)); + s.append(1, alpha->at(j)); + o.push_back(s); + } + } + keyList->clear(); + (*keyList) = o; + } +} + +void ChromosomeRandom::initializeTable() { + int keyListSize = keyList->size(); + for (int i = 0; i < keyListSize; i++) { + table->insert(valType(keyList->at(i), 1)); + } +} + +void ChromosomeRandom::countWords() { + // Get the original sequence + const string* oBase = oChrom->getBase(); + + // Count words + const vector *> * segmentList = oChrom->getSegment(); + int segmentCount = segmentList->size(); + for (int i = 0; i < segmentCount; i++) { + int s = segmentList->at(i)->at(0); + int e = segmentList->at(i)->at(1); + if (e - s + 1 >= n) { + + int limit = e - n + 1; + + for (int h = s; h <= limit; h++) { + // Check if the current base is a standard one. + // Words including non-standard bases are not counted. 
+ + char c = oBase->at(h); + + int alphaCount = alpha->size(); + bool isStandard = false; + for (int a = 0; a < alphaCount; a++) { + if (alpha->at(a) == c) { + isStandard = true; + break; + } + } + + // Increment the count + if (isStandard) { + string word = oBase->substr(h, n); + if (table->count(word) > 0) { + (*table)[word] = table->at(word) + 1; + } else { + cout << "\t\tIgnoring " << word << endl; + } + } + } + } + } +} + +void ChromosomeRandom::convertToProbabilities() { + int alphaCount = alpha->size(); + int keyListSize = keyList->size(); + for (int i = 0; i < keyListSize; i += alphaCount) { + double sum = 0; + for (int j = 0; j < alphaCount; j++) { + string key = keyList->at(i + j); + sum += table->at(key); + } + for (int j = 0; j < alphaCount; j++) { + string key = keyList->at(i + j); + (*table)[key] = ((double) table->at(key)) / sum; + } + } +} + +void ChromosomeRandom::generateRandomSequence() { + // Get the original sequence + const string* oBase = oChrom->getBase(); + + // Alphabet count + int alphaCount = alpha->size(); + + // Get the original segments + const vector *> * segmentList = oChrom->getSegment(); + int segmentCount = segmentList->size(); + + // Generate random segments + for (int i = 0; i < segmentCount; i++) { + int s = segmentList->at(i)->at(0); + int e = segmentList->at(i)->at(1); + + if (e - s + 1 > n) { + //string order = oBase->substr(s, n - 1); + string order(""); + // The first order is based on the original sequence. + for (int w = s; w < s + n - 1; w++) { + (*rBase)[w] = codes->at(oBase->at(w)); + order.append(1, codes->at(oBase->at(w))); + } + + for (int h = s + n - 1; h <= e; h++) { + // Subsequent orders are based on the random sequence. 
+ order = rBase->substr(h - n + 1, n - 1); + vector > lottery; + int chanceSoFar = 0; + for (int k = 0; k < alphaCount; k++) { + string temp = order; + temp.append(1, alpha->at(k)); + if (table->count(temp) > 0) { + int periodStart = chanceSoFar; + int periodEnd = periodStart + (100 * table->at(temp)); + chanceSoFar = periodEnd + 1; + vector entry; + entry.push_back(alpha->at(k)); + entry.push_back(periodStart); + entry.push_back(periodEnd); + lottery.push_back(entry); + } else { + string msg("This word must exist in the table: "); + msg.append(temp); + msg.append("."); + throw InvalidStateException(msg); + } + } + + if (lottery.size() > 0) { + int randInt = rand() % chanceSoFar; + + for (int tt = 0; tt < alphaCount; tt++) { + vector entry = lottery.at(tt); + if (randInt >= entry.at(1) && randInt <= entry.at(2)) { + (*rBase)[h] = entry.at(0); + break; + } + } + lottery.clear(); + } else { + string msg("The lottery vector cannot be empty."); + throw InvalidStateException(msg); + } + } + } + } + + // Make sure that the generated sequence has the same length as the original sequence + if (oBase->size() != rBase->size()) { + cerr << "The original sequence and the random sequence "; + cerr << "do not have the same size." 
<< endl; + cerr << "Original sequence size is: " << oBase->size() << endl; + cerr << "Generated sequence size is: " << rBase->size() << endl; + } +} + +void ChromosomeRandom::printTable() { + map::iterator iterStart = table->begin(); + map::iterator iterEnd = table->end(); + while (iterStart != iterEnd) { + cout << (*iterStart).first << " -> " << (*iterStart).second << endl; + iterStart++; + } +} + +/** + * Returns the segments of the original chromosome + */ +const vector *> * ChromosomeRandom::getSegment() { + return oChrom->getSegment(); +} + +/** + * Returns the random sequence + */ +const string* ChromosomeRandom::getBase() { + return rBase; +} + +/** + * Returns the header indicating the order of the Markov chain + */ +string ChromosomeRandom::getHeader() { + string header = oChrom->getHeader(); +//header.append(" - Random based on "); +//header.append(Util::int2string(n - 1)); +//header.append("-order Markov chain."); + return header; +} + +void ChromosomeRandom::printEffectiveSequence(string outputFile) { + int totalSize = rBase->size(); + string * effectiveRBase = new string(""); + for (int i = 0; i < totalSize; i++) { + char b = rBase->at(i); + if (b != unread) { + effectiveRBase->append(1, b); + } + } + + // Make sure that the effective sequence is shorter than the original + // length + if (effectiveRBase->size() > totalSize) { + cerr << "The effective length must be <= the original length." << endl; + cerr << "Generated sequence size is: " << totalSize << endl; + cerr << "The effective size is: " << effectiveRBase->size() << endl; + + } + + printSequence(outputFile, effectiveRBase); + + delete effectiveRBase; +} + +void ChromosomeRandom::printSequence(string outputFile) { + printSequence(outputFile, rBase); +} + +void ChromosomeRandom::printSequence(string outputFile, string * baseToPrint) { + cout << "Printing chromosome to file ..." 
<< endl; + ofstream outSequence; + outSequence.open(outputFile.c_str(), ios::out); + + int step = 50; + + outSequence << getHeader() << endl; + int len = baseToPrint->size(); + + for (int i = 0; i < len; i = i + step) { + int e = (i + step - 1 > len - 1) ? len - 1 : i + step - 1; + for (int k = i; k <= e; k++) { + outSequence << baseToPrint->at(k); + } + outSequence << endl; + } + outSequence << endl; + + outSequence.close(); +} + +} /* namespace nonltr */ diff --git a/src/nonltr/ChromosomeRandom.h b/src/nonltr/ChromosomeRandom.h new file mode 100644 index 0000000..a837575 --- /dev/null +++ b/src/nonltr/ChromosomeRandom.h @@ -0,0 +1,51 @@ +/* + * ChromosomeRandom.h + * + * Created on: Feb 4, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef CHROMOSOMERANDOM_H_ +#define CHROMOSOMERANDOM_H_ + +#include + +#include "IChromosome.h" + +namespace nonltr { + +class ChromosomeRandom: public nonltr::IChromosome { + // Key-value pair type. + typedef map::value_type valType; + +private: + int n; + char unread; + IChromosome * oChrom; + vector * alpha; + map * table; + string * rBase; + vector * keyList; + map * codes; + + void fillKeyList(); + void initializeTable(); + void countWords(); + void convertToProbabilities(); + void printTable(); + void generateRandomSequence(); + +public: + ChromosomeRandom(int, IChromosome*, char, vector*); + virtual ~ChromosomeRandom(); + + virtual const string* getBase(); + virtual const vector *> * getSegment(); + virtual string getHeader(); + virtual void printSequence(string); + void printSequence(string, string *); + void printEffectiveSequence(string); +}; + +} /* namespace nonltr */ +#endif /* CHROMOSOMERANDOM_H_ */ diff --git a/src/nonltr/DetectorMaxima.cpp b/src/nonltr/DetectorMaxima.cpp new file mode 100644 index 0000000..90043e3 --- /dev/null +++ b/src/nonltr/DetectorMaxima.cpp @@ -0,0 +1,518 @@ +/* + * DetectorMaxima.cpp + * + * Created on: May 31, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "DetectorMaxima.h" 
+#include "../utility/Util.h" +#include "../utility/Location.h" +#include "../exception/InvalidStateException.h" + +#include +// Delete start +#include +using namespace std; +// Delete end + +using namespace exception; + +namespace nonltr { + +DetectorMaxima::DetectorMaxima(int segStartIn, int segEndIn, double sIn, + double wIn, double mIn, double tIn, double pIn, int eIn, + vector * oScoresIn) { + + // ToDo: make sure that segStart and segEnd are within the input scores. + segStart = segStartIn; + segEnd = segEndIn; + s = sIn; + w = wIn; + m = mIn; + t = tIn; + p = pIn; + e = eIn; + oScores = oScoresIn; + + halfS = s; + //s / 2; + + mask = new vector(); + // Complete + scores = new vector(); + + // Trimmed on both sides + first = new vector(); + + // Trimmed on both sides + second = new vector(); + + // Coordinates according to the complete sequence + maxima = new vector(); + + // Coordinates according to the complete sequence + // allMaxima = new vector *>(); + + // Coordinates according to the complete sequence + separatorList = new vector(); + + // Coordinates according to the complete sequence + regionList = new vector(); + + makeMask(); + + smooth(); + + deriveFirst(); + + deriveSecond(); + + // Free memory start + mask->clear(); + delete mask; + scores->clear(); + delete scores; + // Free memory end + + findMaxima(); + + // Free memory start + first->clear(); + delete first; + second->clear(); + delete second; + // Free memory end + + findSeparators(); + + findRegions(); + + // Free memory start + maxima->clear(); + delete maxima; + Util::deleteInVector(separatorList); + separatorList->clear(); + delete separatorList; + // Free memory end + + extendRegions(); +} + +/* + const vector *>* DetectorMaxima::getAllMaxima() const { + return allMaxima; + } + */ + +const vector* DetectorMaxima::getFirst() const { + return first; +} + +const vector* DetectorMaxima::getSecond() const { + return second; +} + +const vector * DetectorMaxima::getRegionList() const { + 
return regionList; +} + +DetectorMaxima::~DetectorMaxima() { + /* + Util::deleteInVector (allMaxima); + allMaxima->clear(); + delete allMaxima; + */ + + Util::deleteInVector(regionList); + regionList->clear(); + delete regionList; +} + +void DetectorMaxima::makeMask() { + const double PI = 3.14159265358979323846; + double sigma = (double) s / 3.5; + const double PART_1 = 1 / sqrt(2 * PI * pow(sigma, 2)); + + int l = 2 * s + 1; + for (int i = 0; i < l; i++) { + double g = PART_1 * exp(-1 * pow(i - s, 2) / (2 * pow(sigma, 2))); + mask->push_back(g); + } + + // For testing only + /* + for (int i = 0; i < l; i++) { + cout << i << "\t" << mask->at(i) << endl; + } + cout << endl; + cout << endl; + */ + // End testing +} + +void DetectorMaxima::smooth() { + for (int i = segStart; i <= segEnd; i++) { + int winS = i - s; + int maskS = 0; + if (winS < segStart) { + maskS = -1 * (winS - segStart); + winS = segStart; + } + + int winE = (i + s > segEnd) ? segEnd : i + s; + // int winL = winE - winS + 1; + + double sum = 0.0; + double maskSum = 0.0; + + int j = winS; + int h = maskS; + + while (j <= winE) { + double weight = mask->at(h); + sum += oScores->at(j) * weight; + maskSum += weight; + + j++; + h++; + } + + if (maskSum <= 0.0) { + string msg("The sum of the weights in the mask must be > 0"); + throw InvalidStateException(msg); + } + + scores->push_back(sum / maskSum); + // scores->push_back(sum / winL); + } + + // Testing - start + /* + cout << "The smoothed scores ... 
" << endl; + for (int k = 0; k < scores->size(); k++) { + if (k % 25 == 0) { + cout << endl; + } + cout << scores->at(k) << " "; + } + cout << endl; + cout << endl; + */ + // Testing - end +} + +void DetectorMaxima::deriveFirst() { + double l = 0.0; + double r = 0.0; + + for (int i = 0; i < w; i++) { + l += scores->at(i); + } + + for (int i = w + 1; i <= 2 * w; i++) { + r += scores->at(i); + } + + first->push_back(round(-1 * l + r)); + + for (int i = w + 1; i < scores->size() - w; i++) { + l -= scores->at(i - w - 1); + l += scores->at(i - 1); + r -= scores->at(i); + r += scores->at(i + w); + first->push_back(round(-1 * l + r)); + } + + // For testing only + /* + for (int i = 0; i < first->size(); i++) { + cout << first->at(i) << " "; + } + cout << endl; + */ +} + +void DetectorMaxima::deriveSecond() { + double l = 0.0; + double r = 0.0; + double d = 2 * w; + + for (int i = 0; i < w; i++) { + l += scores->at(i); + } + + for (int i = w + 1; i <= 2 * w; i++) { + r += scores->at(i); + } + + second->push_back(round(l + r - d * scores->at(w))); + + for (int i = w + 1; i < scores->size() - w; i++) { + l -= scores->at(i - w - 1); + l += scores->at(i - 1); + r -= scores->at(i); + r += scores->at(i + w); + second->push_back(round(l + r - d * scores->at(i))); + } + + // For testing only + /* + for (int i = 0; i < second->size(); i++) { + cout << second->at(i) << " "; + } + cout << endl; + */ +} + +void DetectorMaxima::findMaxima() { + int firstSize = first->size(); + + for (int i = 1; i < firstSize; i++) { + double magnitude = abs(first->at(i - 1) - first->at(i)); + + if (first->at(i) == 0 || (first->at(i - 1) < 0 & first->at(i) > 0) + || (first->at(i - 1) > 0 && first->at(i) < 0)) { + if (second->at(i) < 0) { + // Adjust index + int peakIndex = i + w + segStart; + + // Record the index of the peak and its magnitude + /* + vector * pair = new vector(); + pair->push_back(peakIndex); + pair->push_back(magnitude); + allMaxima->push_back(pair); + */ + + // Make sure that the peak 
is in a high-scoring region of width s centered on the peak + if (magnitude > m) { + // Make sure that the peak is in a high-scoring region of width s centered on the peak + int peakStart = peakIndex - halfS; + if (peakStart < segStart) { + peakStart = segStart; + } + int peakEnd = peakIndex + halfS; + if (peakEnd > segEnd) { + peakEnd = segEnd; + } + + double count = countLessThan(oScores, peakStart, peakEnd, + t); + double v = (100.00 * count) + / ((double) peakEnd - peakStart + 1); + if (v < p) { + maxima->push_back(peakIndex); + } + } + } + } + } + + // Testing - start + /* + cout << "Maxima: " << endl; + for (int i = 0; i < maxima->size(); i++) { + cout << maxima->at(i) << " "; + } + cout << endl << endl; + */ + // Testing - end +} + +int DetectorMaxima::countLessThan(vector * list, int s, int e, double t) { + int count = 0; + for (int u = s; u <= e; u++) { + if (list->at(u) < t) { + count++; + } + } + return count; +} + +void DetectorMaxima::findSeparators() { + int n = maxima->size(); + + if (n > 0) { + for (int i = 0; i < n - 1; i++) { + int j = i + 1; + int s = maxima->at(i); + int e = maxima->at(j); + + double count = countLessThan(oScores, s, e, t); + double v = (100.00 * count) / ((double) e - s + 1); + if (v >= p) { + separatorList->push_back(new Location(s, e)); + } + } + } + + // For testing only + /* + cout << "Separators: " << endl; + for (int h = 0; h < separatorList->size(); h++) { + cout << separatorList->at(h)->toString() << endl; + } + cout << endl; + */ +} + +void DetectorMaxima::findRegions() { + // Determine regions + int maximaCount = maxima->size(); + if (maximaCount > 0) { + int segStart = maxima->at(0); + int separatorCount = separatorList->size(); + for (int k = 0; k < separatorCount; k++) { + int segEnd = separatorList->at(k)->getStart(); + regionList->push_back(new Location(segStart, segEnd)); + segStart = separatorList->at(k)->getEnd(); + } + regionList->push_back( + new Location(segStart, maxima->at(maximaCount - 1))); + } + + // 
For testing only + /* + cout << "Regions: " << endl; + for (int r = 0; r < regionList->size(); r++) { + cout << regionList->at(r)->toString() << endl; + } + cout << endl; + */ + // End testing +} + +/* + * + */ +void DetectorMaxima::extendRegions() { + int regionCount = regionList->size(); + int gg = 0; + while (gg < regionCount) { + ILocation * region = regionList->at(gg); + + int regionStart = region->getStart(); + int regionEnd = region->getEnd(); + + // Handle the case where the region is made of one nucleotide + if (regionStart == regionEnd) { + regionStart = regionStart - halfS; + if (regionStart < segStart) { + regionStart = segStart; + } + region->setStart(regionStart); + + regionEnd = regionEnd + halfS; + if (regionEnd > segEnd) { + regionEnd = segEnd; + } + region->setEnd(regionEnd); + } + + // Left end: Extend step by step + int lEnd = (gg == 0) ? segStart : regionList->at(gg - 1)->getEnd(); + for (int u = regionStart; u >= lEnd; u = u - e) { + int d = u - e + 1; + if (d < lEnd) { + d = lEnd; + } + double v = (100.0 * countLessThan(oScores, d, u, t)) / ((double) e); + if (v >= p) { + break; + } else { + regionStart = d; + } + } + + // Left end: Extend or erode base by base + if (oScores->at(regionStart) < t) { + for (int a = regionStart; a < regionEnd; a++) { + if (oScores->at(a) >= t) { + regionStart = a; + break; + } + } + } else { + for (int a = regionStart; a >= lEnd; a--) { + if (oScores->at(a) >= t) { + regionStart = a; + } else { + break; + } + } + } + + // Set new start to check for validity + region->setStart(regionStart); + + // Right end: extend to the right step by step + int rEnd = + (gg == regionCount - 1) ? 
+ segEnd : regionList->at(gg + 1)->getStart(); + for (int u = regionEnd; u <= rEnd; u = u + e) { + int d = u + e - 1; + if (d > rEnd) { + d = rEnd; + } + double v = (100.0 * countLessThan(oScores, u, d, t)) / ((double) e); + if (v >= p) { + break; + } else { + regionEnd = d; + } + } + + // Right end: extend or erod base by base + if (oScores->at(regionEnd) < t) { + for (int a = regionEnd; a > regionStart; a--) { + if (oScores->at(a) >= t) { + regionEnd = a; + break; + } + } + } else { + for (int a = regionEnd; a <= rEnd; a++) { + if (oScores->at(a) >= t) { + regionEnd = a; + } else { + break; + } + } + } + + // Set new end to check for validity + region->setEnd(regionEnd); + + // Merge overlapping regions + if (gg > 0) { + ILocation * pRegion = regionList->at(gg - 1); + int pStart = pRegion->getStart(); + int pEnd = pRegion->getEnd(); + + if (Util::isOverlapping(pStart, pEnd, regionStart, regionEnd)) { + pRegion->setEnd(regionEnd); + regionList->erase(regionList->begin() + gg); + regionCount = regionList->size(); + } else { + gg++; + } + } + + if (gg == 0) { + gg++; + } + } + + // Testing - Start + /* + cout << "Extended regions: " << endl; + for (int r = 0; r < regionList->size(); r++) { + cout << regionList->at(r)->toString() << endl; + } + cout << endl; + */ + // Testing - End +} + +} /* namespace nonltr */ diff --git a/src/nonltr/DetectorMaxima.h b/src/nonltr/DetectorMaxima.h new file mode 100644 index 0000000..7aca5d5 --- /dev/null +++ b/src/nonltr/DetectorMaxima.h @@ -0,0 +1,77 @@ +/* + * DetectorMaxima.h + * + * Created on: May 31, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef DETECTORMAXIMA_H_ +#define DETECTORMAXIMA_H_ + +#include +#include + +#include "../utility/ILocation.h" + +using namespace std; +using namespace utility; + +namespace nonltr { + +class DetectorMaxima { +private: + + int segStart; + int segEnd; + double s; + double w; + double m; + double t; + double p; + int e; + int halfS; + + vector * oScores; + vector * scores; + vector 
* mask; + vector * first; + vector * second; + vector * maxima; + // vector *> * allMaxima; + + vector * separatorList; + vector * regionList; + + void makeMask(); + void smooth(); + void deriveFirst(); + void deriveSecond(); + void findMaxima(); + + void findSeparators(); + void findRegions(); + + void extendRegions(); + + int countLessThan(vector *, int, int, double); + + /** + * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c + */ + inline double round(double number) { + return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5); + } + +public: + DetectorMaxima(int, int, double, double, double, double, double, int, + vector *); + virtual ~DetectorMaxima(); + const vector* getRegionList() const; + const vector* getFirst() const; + const vector* getSecond() const; + + // const vector *>* getAllMaxima() const; +}; + +} /* namespace nonltr */ +#endif /* DETECTORMAXIMA_H_ */ diff --git a/src/nonltr/EnrichmentMarkovView.cpp b/src/nonltr/EnrichmentMarkovView.cpp new file mode 100644 index 0000000..f886ac8 --- /dev/null +++ b/src/nonltr/EnrichmentMarkovView.cpp @@ -0,0 +1,217 @@ +/* + * EnrichmentMarkovView.cpp + * + * Created on: Apr 17, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +namespace nonltr { + +/** + * The Markov order. It start at 0. + */ +template +EnrichmentMarkovView::EnrichmentMarkovView(int k, int order, int m) : + minObs(m), factor(10000.00), KmerHashTable(k) { + initialize(order); +} + +template +EnrichmentMarkovView::EnrichmentMarkovView(int k, V initValue, int order, + int m) : + minObs(m), factor(10000.00), KmerHashTable(k, initValue) { + initialize(order); +} + +template +void EnrichmentMarkovView::initialize(int order) { + // Test start + // cout << "Testing: " << minObs << endl; + // Test end + + o = order; + if (o < 0) { + string msg("The Markov order must be non-negative integer. 
"); + msg.append("The invalid input is: "); + msg.append(Util::int2string(o)); + msg.append("."); + throw InvalidInputException(msg); + } + + if (o >= KmerHashTable::k) { + string msg("The Markov order cannot be >= k (k-mer)."); + throw InvalidInputException(msg); + } + + l = 0; + modelList = new vector *>(); + + for (int i = 1; i <= o + 1; i++) { + modelList->push_back(new KmerHashTable(i)); + } +} + +template +EnrichmentMarkovView::~EnrichmentMarkovView() { + Util::deleteInVector(modelList); + delete modelList; +} + +/** + * This method count words of size 1 to order+1 in the input sequence. + * In other words, it updates the background tables. In addition, it + * updates the length of the genome. + * + * sequence: is the input sequence. + * start: the start index - inclosing. + * end: the end index - inclosing. + */ +template +void EnrichmentMarkovView::count(const char * sequence, int start, + int end) { + + // Multiple by 2 if scanning the forward strand and its reverse complement + // l = l + (2 * (end - start + 1)); + l = l + (end - start + 1); + + int modelNumber = modelList->size(); + for (int i = 0; i < modelNumber; i++) { + KmerHashTable * t = modelList->at(i); + t->wholesaleIncrement(sequence, start, end - i); + } +} + +/** + * Normalize the count of words in each model. + * Values stored in these models are multiplied by "factor." 
+ */ +template +void EnrichmentMarkovView::generateProbapilities() { + int modelNumber = modelList->size(); + + for (int m = 0; m < modelNumber; m++) { + KmerHashTable * t = modelList->at(m); + int tSize = t->getMaxTableSize(); + + for (int i = 0; i < tSize; i += 4) { + double sum = 0.0; + + for (int j = i; j < i + 4; j++) { + sum += t->valueOf(j); + } + + for (int j = i; j < i + 4; j++) { + t->insert(j, round(factor * ((double) t->valueOf(j) / sum))); + } + } + } +} + +template +void EnrichmentMarkovView::processTable() { + char base = 4; + int modelNumber = modelList->size(); + + // Make a zero in quaternary form as a string of length k. + string q(""); + for (int x = 0; x < KmerHashTable::k; x++) { + q.append(1, 0); + } + + double lowerP; + double upperP; + for (I y = 0; y < KmerHashTable::maxTableSize; y++) { + if (y % 10000000 == 0) { + cout << "Processing " << y << " keys out of " + << KmerHashTable::maxTableSize; + cout << endl; + } + + const char * qc = q.c_str(); + + // Calculate the expected number of occurrences. + + // a. Calculate probability from lower order models. + // Lower probabilities are the same for four consecutive words of length of k-1 + if (y % 4 == 0) { + lowerP = 1.0; + for (int m = 0; m < modelNumber - 1; m++) { + KmerHashTable * oTable = modelList->at(m); + lowerP *= (((double) oTable->valueOf(qc, 0)) / factor); + } + } + + // b. Calculate probability based on the specified order. + KmerHashTable * oTable = modelList->at(modelNumber - 1); + int resultsSize = KmerHashTable::k - o - 1; + + // Upper probabilities are the same for four consecutive words of length of k-1 + // The scanning of words or length corresponding to the highest order + 1 + // This step is not needed if k = o + 1, i.e. resultsSize = 0. 
+ if (y % 4 == 0) { + if (resultsSize > 0) { + //Initialize the elements of the vector invalid index + vector results = vector(resultsSize, -987); + oTable->wholesaleValueOf(qc, 0, resultsSize - 1, &results, 0); + + upperP = 1.0; + for (int i = 0; i < resultsSize; i++) { + upperP *= (((double) results.at(i)) / factor); + } + results.clear(); + + } else { + upperP = 1.0; + } + } + + // The expected number of occurances + double exp = l * lowerP * upperP + * (((double) oTable->valueOf(qc, resultsSize)) / factor); + + // Calculate the enrichment value. + // Log value + // values[y] = round((log((double) values[y] + 1.0) - log(exp + 1.0))); + + // Raw value + // Requirement: if observed is >= 5 && observed > expected then the value is the difference + // otherwise the value is zero + + V observed = KmerHashTable::values[y]; + + if (observed >= minObs && observed > exp) { + + KmerHashTable::values[y] = round(observed - exp); + } else { + KmerHashTable::values[y] = 0; + } + + /* + KmerHashTable::values[y] = + round( + (((double) KmerHashTable::values[y] + 1.0) + / (exp + 1.0))); + */ + + // Increment the quaternary number: + // 1 - guard against overflow. + if (q[0] == base - 1) { + string z(""); + z.append(1, 0); + q = z + q; + } + + // 2 - increment the quaternary number by 1. 
+ int qLen = q.size(); + for (int i = qLen - 1; i >= 0; i--) { + if (q[i] + 1 < base) { + q[i] = q[i] + 1; + break; + } else { + q[i] = 0; + } + } + } +} + +} /* namespace nonltr */ diff --git a/src/nonltr/EnrichmentMarkovView.h b/src/nonltr/EnrichmentMarkovView.h new file mode 100644 index 0000000..a10a02a --- /dev/null +++ b/src/nonltr/EnrichmentMarkovView.h @@ -0,0 +1,69 @@ +/* + * EnrichmentMarkovView.h + * + * Created on: Apr 17, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef ENRICHMENTMARKOVVIEW_H_ +#define ENRICHMENTMARKOVVIEW_H_ + +#include +#include +#include + +#include "KmerHashTable.h" +#include "../utility/Util.h" +#include "../exception/InvalidInputException.h" + +using namespace std; +using namespace utility; +using namespace exception; + +namespace nonltr { + +template +class EnrichmentMarkovView: public KmerHashTable{ + +private: + // The minimum number of the observed k-mers + const int minObs; + + // This template specification should work up to order of 14, + // i.e. word length = 15 + vector *> * modelList; + + // Markov order + int o; + + // Total length + long l; + + // Multiplied the probability of word by this factor + // Equivalent to four decimal points. + const double factor; // = 10000.00; + + // Initialize data members + void initialize(int); + + /** + * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c + */ + inline double round(double number) { + return number < 0.0 ? 
ceil(number - 0.5) : floor(number + 0.5); + } + +public: + EnrichmentMarkovView(int, int, int); + EnrichmentMarkovView(int, V, int, int); + virtual ~EnrichmentMarkovView(); + + void count(const char *, int, int); + void generateProbapilities(); + void processTable(); +}; +} /* namespace nonltr */ + +#include "EnrichmentMarkovView.cpp" + +#endif /* ENRICHMENTMARKOVVIEW_H_ */ diff --git a/src/nonltr/HMM.cpp b/src/nonltr/HMM.cpp new file mode 100644 index 0000000..df4c8b0 --- /dev/null +++ b/src/nonltr/HMM.cpp @@ -0,0 +1,630 @@ +/* + * HMM.cpp + * + * Created on: Jun 21, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "HMM.h" + +#include +#include + +#include "../utility/Util.h" +#include "../exception/InvalidStateException.h" +#include "../exception/InvalidInputException.h" +#include "../exception/FileDoesNotExistException.h" +#include "../exception/InvalidOperationException.h" + +using namespace std; +using namespace utility; +using namespace exception; + +namespace nonltr { + +HMM::HMM(string hmmFile) : + PRECISION(numeric_limits::digits10 + 1) { + // ToDo: Fix this operation + string msg("Reading HMM from file is temporarily disabled."); + throw InvalidOperationException(msg); + + cerr << "Building HMM from: " << hmmFile << endl; + + ifstream in(hmmFile.c_str()); + in.precision(PRECISION); + + if (in) { + string token; + bool isLogBase = false; + bool isStates = false; + bool isPriors = false; + bool isTransition = false; + + while (in >> token) { + if (isLogBase) { + base = atof(token.c_str()); + + checkBase(base); + + logBase = log(base); + isLogBase = false; + } else if (isStates) { + stateNumber = atoi(token.c_str()); + positiveStateNumber = stateNumber / 2; + initializeHelper(); + + isStates = false; + } else if (isPriors) { + //Skip state names + for (int i = 1; i < stateNumber; i++) { + in >> token; + } + for (int i = 0; i < stateNumber; i++) { + in >> token; + (*pList)[i] = atof(token.c_str()); + } + + isPriors = false; + } else if 
(isTransition) { + //Skip state names + for (int i = 1; i < stateNumber; i++) { + in >> token; + } + + for (int i = 0; i < stateNumber; i++) { + //Skip the name of the state at the beginning of the line + for (int j = -1; j < stateNumber; j++) { + in >> token; + if (j > -1) { + (*(tList->at(i)))[j] = atof(token.c_str()); + } + } + } + + isTransition = false; + } + + if (token.compare("Base") == 0) { + isLogBase = true; + } else if (token.compare("States") == 0) { + isStates = true; + } else if (token.compare("Priors") == 0) { + isPriors = true; + } else if (token.compare("Transition") == 0) { + isTransition = true; + } + } + + in.close(); + } else { + string msg(hmmFile); + msg.append(" does not exist."); + throw FileDoesNotExistException(msg); + } + in.close(); + + //print("/Users/zakarota/Data/HgTest/Rep/Test/genome/hmmTest.txt"); +} + +/** + * Use this constructor to train on the entire genome. + * The client has to call train on each chromosome. + * base is the threshold. + + */ +HMM::HMM(double base, int stateNumber) : + PRECISION(numeric_limits::digits10 + 1) { + initialize(base, stateNumber); +} + +void HMM::initialize(double baseIn, int stateNumberIn) { + base = baseIn; + checkBase(base); + + logBase = log(baseIn); + + stateNumber = stateNumberIn; + // Make sure that the number of states is even and > 0 + if (stateNumber % 2 != 0 || stateNumber == 0) { + string msg("The number of states must be even and > zero."); + throw InvalidInputException(msg); + } + + positiveStateNumber = stateNumber / 2; + cout << "The number of states is: " << stateNumber << endl; + + initializeHelper(); +} + +/** + * This method makes sure that the base is not zero. 
+ */ +void HMM::checkBase(double base) { + if (fabs(base - 0.0) < std::numeric_limits::epsilon()) { + string msg("The base cannot be zero because log(base) is not defined."); + throw InvalidInputException(msg); + } +} + +void HMM::initializeHelper() { + // Ensure that the number of the states is positive + if (stateNumber < 1) { + string msg("The number of states must be positive."); + throw InvalidStateException(msg); + } + + pList = new vector(stateNumber, 1); + tList = new vector *>; + for (int i = 0; i < stateNumber; i++) { + tList->push_back(new vector(stateNumber, 1)); + } + oList = new vector(stateNumber, 1); + + // Check if infinity can be handled + if (!std::numeric_limits::has_infinity) { + string msg("This compiler does not handle infinite values. "); + msg.append(string("The decoding algorithm will not function.")); + throw InvalidStateException(msg); + } else { + minusInf = -1.0 * std::numeric_limits::infinity(); + } +} + +HMM::~HMM() { + pList->clear(); + delete pList; + + Util::deleteInVector(tList); + delete tList; + + oList->clear(); + delete oList; +} + +void HMM::train(vector * scoreListIn, + const vector *> * segmentListIn, + const vector * candidateListIn) { + + scoreList = scoreListIn; + segmentList = segmentListIn; + candidateList = candidateListIn; + + int candidateCount = candidateList->size(); + if (candidateCount > 0) { + int firstCandIndex = 0; + int lastCandIndex = 0; + int segmentNumber = segmentList->size(); + for (int i = 0; i < segmentNumber; i++) { + vector * s = segmentList->at(i); + ILocation * c = candidateList->at(firstCandIndex); + // A segment may have no detections + if (Util::isOverlapping(s->at(0), s->at(1), c->getStart(), + c->getEnd())) { + lastCandIndex = trainHelper1(s->at(0), s->at(1), + firstCandIndex); + trainHelper2(s->at(0), s->at(1), firstCandIndex, lastCandIndex); + firstCandIndex = lastCandIndex + 1; + if (firstCandIndex >= candidateCount) { + break; + } + } + } + } +} + +int HMM::trainHelper1(int segStart, int 
segEnd, int firstCandIndex) { + ILocation * cand = candidateList->at(firstCandIndex); + if (!Util::isOverlapping(segStart, segEnd, cand->getStart(), + cand->getEnd())) { + string msg("The first candidate is not overlapping with the segment. "); + msg.append("Candidate location is: "); + msg.append(cand->toString()); + msg.append(" Segment location is: "); + msg.append(Util::int2string(segStart)); + msg.append("-"); + msg.append(Util::int2string(segEnd)); + throw InvalidInputException(msg); + } + + int lastCandIndex = -1; + int candidateNumber = candidateList->size(); + for (int c = firstCandIndex; c < candidateNumber; c++) { + ILocation * cand = candidateList->at(c); + if (Util::isOverlapping(segStart, segEnd, cand->getStart(), + cand->getEnd())) { + lastCandIndex = c; + } else { + break; + } + } + + if (lastCandIndex < 0) { + string msg("The index of the last candidate cannot be negative."); + throw InvalidStateException(msg); + } + + return lastCandIndex; +} + +void HMM::trainHelper2(int segStart, int segEnd, int firstCandIndex, + int lastCandIndex) { + ILocation * f = candidateList->at(firstCandIndex); + + // First negative region if present + int fStart = f->getStart(); + if (fStart > segStart) { + trainNegative(segStart, fStart - 1); + move(getNgtvState(fStart - 1), getPstvState(fStart)); + } + + // Alternating positive and negative regions + for (int i = firstCandIndex; i < lastCandIndex; i++) { + ILocation * c = candidateList->at(i); + int cStart = c->getStart(); + int cEnd = c->getEnd(); + trainPositive(cStart, cEnd); + move(getPstvState(cEnd), getNgtvState(cEnd + 1)); + + int nextStart = candidateList->at(i + 1)->getStart(); + trainNegative(cEnd + 1, nextStart - 1); + move(getNgtvState(nextStart - 1), getPstvState(nextStart)); + } + + // Last positive region + ILocation * l = candidateList->at(lastCandIndex); + int lEnd = l->getEnd(); + trainPositive(l->getStart(), lEnd); + + // Last negative region if present + if (segEnd > lEnd) { + 
move(getPstvState(lEnd), getNgtvState(lEnd + 1)); + trainNegative(lEnd + 1, segEnd); + } +} + +void HMM::trainPositive(int s, int e) { + int pIndex = getPstvState(s); + (*pList)[pIndex] = pList->at(pIndex) + 1; + + for (int i = s; i <= e; i++) { + int index = getPstvState(i); + (*oList)[index] = oList->at(index) + 1; + } + + for (int i = s; i < e; i++) { + move(getPstvState(i), getPstvState(i + 1)); + } +} + +void HMM::trainNegative(int s, int e) { + int pIndex = getNgtvState(s); + (*pList)[pIndex] = pList->at(pIndex) + 1; + + for (int i = s; i <= e; i++) { + int index = getNgtvState(i); + (*oList)[index] = oList->at(index) + 1; + } + + for (int i = s; i < e; i++) { + move(getNgtvState(i), getNgtvState(i + 1)); + } +} + +void HMM::move(int state1, int state2) { + vector * state1Row = tList->at(state1); + (*state1Row)[state2] = state1Row->at(state2) + 1; +} + +void HMM::normalize() { +// Priors + double sum = 0.0; + for (int i = 0; i < stateNumber; i++) { + sum += pList->at(i); + } + for (int i = 0; i < stateNumber; i++) { + (*pList)[i] = log(pList->at(i) / sum); + } + +// Output + for (int i = 0; i < stateNumber; i++) { + (*oList)[i] = log(1.0); + } + +// Transition + for (int i = 0; i < stateNumber; i++) { + vector * row = tList->at(i); + double sum = 0.0; + for (int j = 0; j < stateNumber; j++) { + sum += row->at(j); + } + + for (int j = 0; j < stateNumber; j++) { + (*row)[j] = log(row->at(j) / sum); + } + } +} + +void HMM::print() { + cout.precision(PRECISION); + + // State names + vector v; + for (int j = 0; j < positiveStateNumber; j++) { + v.push_back(Util::int2string(j)); + } + string m("-"); + for (int j = 0; j < positiveStateNumber; j++) { + v.push_back(m + Util::int2string(j)); + } + + cout << "Priors:" << endl; + for (int g = 0; g < 2; g++) { + for (int i = 0; i < positiveStateNumber; i++) { + cout << v.at(i + (g * positiveStateNumber)) << "\t"; + } + + for (int i = 0; i < positiveStateNumber; i++) { + cout << pList->at(i + (g * positiveStateNumber)) << 
"\t"; + } + cout << endl; + } + cout << endl; + + /* + cout << "Output:" << endl; + for (int i = 0; i < v.size(); i++) { + cout << v.at(i) << "\t"; + } + cout << endl; + for (int i = 0; i < stateNumber; i++) { + cout << oCountList->at(i) << "\t"; + } + cout << endl << endl; + */ + + cout << "Transition:" << endl << "\t"; + for (int i = 0; i < v.size(); i++) { + cout << v.at(i) << "\t"; + } + cout << endl; + + for (int i = 0; i < stateNumber; i++) { + vector * row = tList->at(i); + cout << v.at(i) << "\t"; + for (int j = 0; j < stateNumber; j++) { + cout << row->at(j) << "\t"; + } + cout << endl; + } + cout << endl << endl; +} + +void HMM::print(string hmo) { + ofstream out(hmo.c_str()); + out.precision(PRECISION); + + out << "Base" << endl << base << endl; + + out << "States" << endl << stateNumber << endl; + + vector v; + for (int j = 0; j < positiveStateNumber; j++) { + v.push_back(Util::int2string(j)); + } + string m("-"); + for (int j = 0; j < positiveStateNumber; j++) { + v.push_back(m + Util::int2string(j)); + } + + out << "Priors" << endl; + for (int i = 0; i < v.size(); i++) { + out << v.at(i) << " "; + } + out << endl; + + for (int i = 0; i < v.size(); i++) { + out << pList->at(i) << " "; + } + out << endl; + + out << "Transition" << endl << "\t"; + for (int i = 0; i < v.size(); i++) { + out << v.at(i) << "\t"; + } + out << endl; + + for (int i = 0; i < stateNumber; i++) { + vector * row = tList->at(i); + out << v.at(i) << "\t"; + for (int j = 0; j < stateNumber; j++) { + out << row->at(j) << "\t"; + } + out << endl; + } + out << endl << endl; + + out.close(); +} + +/** + * This method will append the state sequence to the end of the input state list + * This method returns the log likelihood + */ +double HMM::decode(int rStart, int rEnd, vector * scoreListIn, + vector& stateList) { + scoreList = scoreListIn; + + // Make sure that the coordinates represent valid location + Location check(rStart, rEnd); + // End check + + vector > v(stateNumber); + int size 
= rEnd - rStart + 1; + for (int i = 0; i < stateNumber; i++) { + v[i] = vector(size, minusInf); + } + + vector > p(stateNumber); + for (int i = 0; i < stateNumber; i++) { + p[i] = vector(size, -1); + } + + // Initialize + int firstPstvState = getPstvState(rStart); + int firstNgtvState = positiveStateNumber + firstPstvState; + v[firstPstvState][0] = pList->at(firstPstvState); + v[firstNgtvState][0] = pList->at(firstNgtvState); + + // Recurs + for (int i = rStart + 1; i <= rEnd; i++) { + int vIndex = i - rStart; + + // Obtain states from scores + int pPstvState = getPstvState(i - 1); + int pNgtvState = positiveStateNumber + pPstvState; + int cPstvState = getPstvState(i); + int cNgtvState = positiveStateNumber + cPstvState; + + // Set positive state + double p1 = v[pPstvState][vIndex - 1] + + (*(*tList)[pPstvState])[cPstvState]; + double p2 = v[pNgtvState][vIndex - 1] + + (*(*tList)[pNgtvState])[cPstvState]; + if (p1 > p2) { + v[cPstvState][vIndex] = p1; + p[cPstvState][vIndex] = pPstvState; + } else { + v[cPstvState][vIndex] = p2; + p[cPstvState][vIndex] = pNgtvState; + } + + // Set negative state + double p3 = v[pPstvState][vIndex - 1] + + (*(*tList)[pPstvState])[cNgtvState]; + double p4 = v[pNgtvState][vIndex - 1] + + (*(*tList)[pNgtvState])[cNgtvState]; + if (p3 > p4) { + v[cNgtvState][vIndex] = p3; + p[cNgtvState][vIndex] = pPstvState; + } else { + v[cNgtvState][vIndex] = p4; + p[cNgtvState][vIndex] = pNgtvState; + } + } + + // Decode + int lastBestState = 0; + double lastBestValue = v[0][size - 1]; + for (int i = 1; i < stateNumber; i++) { + double currentValue = v[i][size - 1]; + if (currentValue > lastBestValue) { + lastBestState = i; + lastBestValue = currentValue; + } + } + + int stateListOriginalSize = stateList.size(); + for (int i = stateListOriginalSize; i < stateListOriginalSize + size; i++) { + stateList.push_back(-1); + } + + stateList[stateListOriginalSize + size - 1] = lastBestState; + for (int i = size - 1; i > 0; i--) { + lastBestState = 
p[lastBestState][i]; + stateList[stateListOriginalSize + i - 1] = lastBestState; + } + + // Make sure that no state in the results has the value of -1 + for (int i = stateListOriginalSize; i < stateListOriginalSize + size; i++) { + if (stateList[i] == -1) { + string msg("At least one state was not determined properly."); + throw InvalidStateException(msg); + } + } + + // Test - start + /* + bool canPrint = false; + for (int i = stateListOriginalSize; i < stateListOriginalSize + size; i++) { + if (stateList.at(i) >= positiveStateNumber) { + canPrint = true; + } + } + if (canPrint) { + for (int i = rStart; i <= rEnd; i++) { + cout << scoreList->at(i) << " "; + } + cout << endl; + + for (int i = stateListOriginalSize; i < stateListOriginalSize + size; + i++) { + if (stateList.at(i) < positiveStateNumber) { + cout << "+"; + } else { + cout << "-"; + //cout << stateList.at(i) << " "; + } + } + cout << endl; + } + */ + + // Test - end + return lastBestValue; +} + +/** + * Append positive regions at the end of regionList + */ +double HMM::decode(int rStart, int rEnd, vector * scoreListIn, + vector& regionList) { + + vector stateList; + double logLikelihood = decode(rStart, rEnd, scoreListIn, stateList); + + int size = stateList.size(); + bool inRpt = false; + bool canFill = false; + int s = -1; + int e = -1; + + for (int i = 0; i < size; i++) { + // Start a new repeat + if (stateList.at(i) < positiveStateNumber && !inRpt) { + inRpt = true; + s = i; + } + // End a the current repeat + else if (stateList.at(i) >= positiveStateNumber && inRpt) { + e = i - 1; + inRpt = false; + canFill = true; + } + // If the current repeat at the end of the segment + else if (i == size - 1 && inRpt) { + e = i; + inRpt = false; + canFill = true; + } + // Extract features of the just recognized repeat + if (canFill) { + regionList.push_back(new Location(s + rStart, e + rStart)); + s = -1; + e = -1; + canFill = false; + } + } + + return logLikelihood; +} + +int HMM::getPositiveStateNumber() { + 
return positiveStateNumber; +} + +double HMM::getBase() { + return base; +} + +} +/* namespace nonltr */ diff --git a/src/nonltr/HMM.h b/src/nonltr/HMM.h new file mode 100644 index 0000000..82c7ec1 --- /dev/null +++ b/src/nonltr/HMM.h @@ -0,0 +1,103 @@ +/* + * HMM.h + * + * Created on: Jun 21, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef HMM_H_ +#define HMM_H_ + +#include +#include +#include +#include + +#include "../utility/ILocation.h" + +using namespace std; +using namespace utility; + +namespace nonltr { + +class HMM { +private: + const int PRECISION; + double minusInf; + vector * pList; + vector *> * tList; + vector * oList; + + void initializeHelper(); + // Returns the index of the last candidate in the segment + int trainHelper1(int, int, int); + void trainHelper2(int, int, int, int); + void trainPositive(int, int); + void trainNegative(int, int); + void move(int, int); + void checkBase(double); + + /* + inline int getPstvState(int score) { + int state = ceil(log(score) / logBase); + if (state < 0) { + state = 0; + } + return state; + } + + inline int getNgtvState(int score) { + int state = ceil(log(score) / logBase); + if (state < 0) { + state = 0; + } + return state + positiveStateNumber; + } + */ + + inline int getPstvState(int index) { + int state = scoreList->at(index); + return state; + } + + inline int getNgtvState(int index) { + int state = scoreList->at(index); + return state + positiveStateNumber; + } + +protected: + double base; + double logBase; + int stateNumber; + int positiveStateNumber; + + vector * scoreList; + const vector *> * segmentList; + const vector * candidateList; + + void initialize(double, int); + /** + * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c + */ + inline double round(double number) { + return number < 0.0 ? 
ceil(number - 0.5) : floor(number + 0.5); + } + +public: + HMM(string); // Build a model from file + HMM(double, int); + // HMM(vector *, const vector *> *, + // const vector *, double); + virtual ~HMM(); + void train(vector *, const vector *> *, const vector *); + void normalize(); + double decode(int, int, vector *, vector&); + double decode(int, int, vector *, vector&); + int getPositiveStateNumber(); + void print(); + void print(string); + double getBase(); +}; + +} /* namespace nonltr */ +#endif /* HMM_H_ */ diff --git a/src/nonltr/IChromosome.h b/src/nonltr/IChromosome.h new file mode 100644 index 0000000..8663163 --- /dev/null +++ b/src/nonltr/IChromosome.h @@ -0,0 +1,28 @@ +/* + * IChromosome.h + * + * Created on: Feb 4, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef ICHROMOSOME_H_ +#define ICHROMOSOME_H_ + +#include +#include + +using namespace std; + +namespace nonltr { + +class IChromosome { +public: + //IChromosome(); + //virtual ~IChromosome(); + virtual const string* getBase() = 0; + virtual const vector *> * getSegment() = 0; + virtual string getHeader() = 0; +}; + +} /* namespace tr */ +#endif /* ICHROMOSOME_H_ */ diff --git a/src/nonltr/ITableView.h b/src/nonltr/ITableView.h new file mode 100644 index 0000000..932b4cc --- /dev/null +++ b/src/nonltr/ITableView.h @@ -0,0 +1,34 @@ +/* + * ITableView.h + * + * Created on: Aug 9, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef ITABLEVIEW_H_ +#define ITABLEVIEW_H_ + +#include + +using namespace std; + +namespace nonltr { + +template +class ITableView { +public: + virtual V valueOf(const char*) = 0 ; + virtual V valueOf(const char*, int) = 0; + virtual V valueOf(I) = 0; + + virtual int getK() = 0; + virtual I getMaxTableSize() = 0; + virtual const V * getValues() const = 0; + + virtual void wholesaleValueOf(const char *, int, int, vector *) = 0; + virtual void wholesaleValueOf(const char *, int, int, vector *, int) = 0; +}; + +} + +#endif /* ITABLEVIEW_H_ */ diff --git 
a/src/nonltr/KmerHashTable.cpp b/src/nonltr/KmerHashTable.cpp new file mode 100644 index 0000000..dc53505 --- /dev/null +++ b/src/nonltr/KmerHashTable.cpp @@ -0,0 +1,445 @@ +/* + * KmerHashTable.cpp + * + * Created on: Jul 25, 2012 + * Author: Hani Zakaria Girgis, PhD + */ +#include +#include +#include +#include + +#include "../utility/Util.h" +#include "../exception/InvalidInputException.h" +#include "../exception/InvalidStateException.h" + +using namespace std; +using namespace exception; +using namespace nonltr; +using namespace utility; + +template +KmerHashTable::KmerHashTable(int keyLength) { + initialize(keyLength, 0); +} + +template +KmerHashTable::KmerHashTable(int keyLength, V initValue) { + initialize(keyLength, initValue); +} + +template +void KmerHashTable::initialize(int keyLength, V initialValueIn) { + /* + if (keyLength > maxKeyLength) { + string msg = "The maximum size (k) of the k-mer is "; + char temp[3]; + sprintf(temp, "%d", maxKeyLength); + msg += temp; + throw InvalidInputException(msg); + } + */ + + k = keyLength; + initialValue = initialValueIn; + + // Initialize bases + bases = new I[k]; + for (int i = k - 1; i >= 0; i--) { + bases[k - 1 - i] = (I) pow(4.0, i); + } + + // Initialize mMinusOne + mMinusOne = new I[4]; + for (int i = 0; i < 4; i++) { + mMinusOne[i] = i * bases[0]; + } + + // Get maximum size of table + char * temp = new char[k]; + for (int i = 0; i < k; i++) { + temp[i] = 3; + } + + maxTableSize = hash(temp) + 1; + delete[] temp; + + // Initialize values + values = new V[maxTableSize]; + for (I i = 0; i < maxTableSize; i++) { + values[i] = initialValue; + } + + // Test + /* + char key[] = { 3, 3, 3, 3, 0, 0, 0, 0 }; + long value = 100; + insert(key, 4, value); + long index = hash(key, 4); + cout << "Index: " << index << " " << values[index] << endl; + cout << "Index: " << index << " " << valueOf(key, 4) << endl; + cout << "Number of filled entries: " << countNonZeroEntries() << endl; + */ +} + +template 
+KmerHashTable::~KmerHashTable() { + delete[] bases; + delete[] mMinusOne; + delete[] values; +} + +/** + * word: an array of characters. + * The maximum integer value is 3 and the minimum is 0 + */ +template +I KmerHashTable::hash(const char * key) { + return hash(key, 0); +} + +/** + * seq: an array of characters e.g. [0,0,1,1,1,3,2]. + * start: the start index of the key. + * This method is designed to process a long sequence. + */ +template +I KmerHashTable::hash(const char * sequence, int keyStart) { + I index = 0; + for (int i = 0; i < k; i++) { + char nucleotide = sequence[keyStart + i]; + if (nucleotide >= 0 && nucleotide <= 3) { + index += bases[i] * sequence[keyStart + i]; + } else { + string msg("The value of the char representing the nucleotide "); + msg.append("must be between 0 and 3."); + msg.append("The int value is "); + msg.append(Util::int2string((int) nucleotide)); + msg.append(" of nucleotide at index "); + msg.append(Util::int2string(keyStart + i)); + + for (int h = 0 + keyStart; h < k + keyStart; h++) { + cerr << (int) sequence[h]; + } + cerr << endl; + + throw InvalidInputException(msg); + } + } + return index; +} + +template +void KmerHashTable::hash(const char * sequence, int start, int end, + vector * hashList) { + + for (int i = start; i <= end; i++) { + char nucleotide = sequence[i]; + if (!(nucleotide >= 0 && nucleotide <= 3)) { + string msg("The value of the char representing the nucleotide "); + msg.append("must be between 0 and 3."); + msg.append("The int value is "); + msg.append(Util::int2string((int) nucleotide)); + msg.append(" of nucleotide at index "); + msg.append(Util::int2string(i)); + + throw InvalidInputException(msg); + } + } + + I lastHash = hash(sequence, start); + hashList->push_back(lastHash); + + for (int i = start + 1; i <= end; i++) { + I s1 = 4 * (lastHash - mMinusOne[(int) sequence[i - 1]]) + + (int) sequence[i + k - 1]; + hashList->push_back(s1); + lastHash = s1; + } +} + +/** + * This method put the key-value 
pair in the table. + * Note: keys are unique, i.e. no duplicate keys. + */ +template +void KmerHashTable::insert(const char* key, V value) { + insert(key, 0, value); +} + +/** + * Similar to the above method. + * The key begins at start in seq. + * The length of the key is k. + */ +template +void KmerHashTable::insert(const char* sequence, int keyStart, V value) { + values[hash(sequence, keyStart)] = value; +} + +template +void KmerHashTable::insert(I keyHash, V value) { + values[keyHash] = value; +} + +/** + * Call wholesaleIncrement on the segment itself. + * Then, call it again on the reverse complement of this segment. + * + * sequence: is a long sequence usually a long segment of a chromosome. + * sFirstKmer: is the start index of the first k-mer. + * sLastKmer: is the start index of the last k-mer. + */ +template +void KmerHashTable::wholesaleIncrement(const char* sequence, + int firstKmerStart, int lastKmerStart) { + // Increment k-mer's in the forward strand + vector hashList = vector(); + hash(sequence, firstKmerStart, lastKmerStart, &hashList); + + int size = hashList.size(); + for (int i = 0; i < size; i++) { + I keyHash = hashList.at(i); + if (keyHash >= maxTableSize) { + cerr << "array out of bounds" << endl; + throw ""; + } + values[keyHash]++; + } + + // Increment k-mer's in the reverse complement + /* + string rc(""); + Util::revCompDig(sequence, firstKmerStart, lastKmerStart + k - 1, &rc); + + hashList.clear(); + hash(rc.c_str(), 0, rc.size() - k, &hashList); + size = hashList.size(); + + for (int i = 0; i < size; i++) { + I keyHash = hashList.at(i); + values[keyHash]++; + }*/ +} + +/** + * Increment the entry associated with the key by one. + */ +template +void KmerHashTable::increment(const char* key) { + increment(key, 0); +} + +/** + * Increment the value associated with the key starting at keyStart in the + * sequence by one. Also, this method increments the count of the reverse complement + * of the kmer by one. 
+ */ +template +void KmerHashTable::increment(const char* sequence, int keyStart) { + // Increment the count of the kmer by one. + I index = hash(sequence, keyStart); + values[index]++; + + // Generate the reverse complement of the kmer. + char * rcKmer = new char[k]; + for (int j = 0; j < k; j++) { + switch (sequence[j + keyStart]) { + case 0: + rcKmer[k - 1 - j] = 3; + break; + case 1: + rcKmer[k - 1 - j] = 2; + break; + case 2: + rcKmer[k - 1 - j] = 1; + break; + case 3: + rcKmer[k - 1 - j] = 0; + break; + default: + string msg = string("Invalid code of a nucleotide: "); + msg.append(1, sequence[j + keyStart]); + msg.append(". Valid codes are 0, 1, 2, and 3."); + throw InvalidInputException(msg); + } + } + + // Update the count of the reverse complement of the kmer by one. + I rcIndex = hash(rcKmer, 0); + values[rcIndex]++; + + // Free memory + delete[] rcKmer; +} + +/** + * Return the value associated with the key + */ +template +V KmerHashTable::valueOf(const char* key) { + return valueOf(key, 0); +} + +/** + * Return the value associated with the key + * The key is a substring of length k starting at keyStart in the sequence + */ +template +V KmerHashTable::valueOf(const char* sequence, int keyStart) { + return values[hash(sequence, keyStart)]; +} + +template +V KmerHashTable::valueOf(I keyHash) { + return values[keyHash]; +} + +template +void KmerHashTable::wholesaleValueOf(const char * sequence, + int firstKmerStart, int lastKmerStart, vector * results) { + wholesaleValueOf(sequence, firstKmerStart, lastKmerStart, results, 0); +} + +/** + * The values are set in the results vector starting at the resultsStart. + * The contents of vector "results" must be initialized. 
+ * Otherwise, the program will crash outputting: "segmentation fault 11" + */ +template +void KmerHashTable::wholesaleValueOf(const char * sequence, + int firstKmerStart, int lastKmerStart, vector * results, + int resultsStart) { + + int index = resultsStart; + vector hashList = vector(); + hash(sequence, firstKmerStart, lastKmerStart, &hashList); + int size = hashList.size(); + + for (int i = 0; i < size; i++) { + (*results)[index] = values[hashList.at(i)]; + index++; + } +} + +/** + * This method returns the number of occupied entries in the table. + * A non-occupied entry has the initial value. + */ +template +I KmerHashTable::countNonInitialEntries() { + I count = 0; + for (I i = 0; i < maxTableSize; i++) { + if (values[i] != initialValue) { + count++; + } + } + return count; +} + +/** + * Make a list of the k-mers. + */ +template +vector* KmerHashTable::getKeys() { + vector * alpha = new vector(); + alpha->push_back((char) 0); + alpha->push_back((char) 1); + alpha->push_back((char) 2); + alpha->push_back((char) 3); + + vector *words = new vector(); + for (int h = 0; h < alpha->size(); h++) { + words->push_back(string(1, alpha->at(h))); + } + + int wLen = k; + for (int i = 1; i < wLen; i++) { + vector *wordsAtItrI = new vector(); + for (I j = 0; j < words->size(); j++) { + for (int h = 0; h < alpha->size(); h++) { + string w = string(words->at(j)); + w.append(1, alpha->at(h)); + wordsAtItrI->push_back(w); + } + } + words->clear(); + delete words; + words = new vector(*wordsAtItrI); + + // Free memory + wordsAtItrI->clear(); + delete wordsAtItrI; + } + + // Free memory + alpha->clear(); + delete alpha; + return words; +} + +/** + * Print the contents of the whole table + */ +template +void KmerHashTable::printTable(string output) { + vector keys; +// getKeys(keys); + + ofstream out(output.c_str()); + + for (I i = 0; i < keys.size(); i++) { + const char * kmer = keys.at(i); + for (int j = 0; j < k; j++) { + out << (int) kmer[j]; + } + cerr << "Hash: " << 
hash(keys.at(i), 0) << endl; + + out << " -> " << values[hash(keys.at(i), 0)] << endl; + } + + out.close(); + keys.clear(); +} + +template +int KmerHashTable::getK() { + return k; +} + +template +I KmerHashTable::getMaxTableSize() { + return maxTableSize; +} + +template +const V * KmerHashTable::getValues() const { + return values; +} + +/** + * Call after building the table. + * A negative value is a likely indication of overflow. + */ +template +void KmerHashTable::checkOverflow() { + for (I y = 0; y < maxTableSize; y++) { + if (values[y] < 0) { + string msg("A negative value is a likely indication of overflow. "); + msg.append( + "To the developer, consider larger data type in KmerHashTable."); + throw InvalidStateException(msg); + } + } +} + +template +V KmerHashTable::getMaxValue() { + V max = 0; + for (I y = 0; y < maxTableSize; y++) { + if (values[y] > max) { + max = values[y]; + } + } + return max; +} diff --git a/src/nonltr/KmerHashTable.h b/src/nonltr/KmerHashTable.h new file mode 100644 index 0000000..7c38e23 --- /dev/null +++ b/src/nonltr/KmerHashTable.h @@ -0,0 +1,83 @@ +/* + * KmerHashTable.h + * + * Created on: Jul 25, 2012 + * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH + */ + +#ifndef KMERHASHTABLE_H_ +#define KMERHASHTABLE_H_ + +#include +#include +#include "ITableView.h" + +using namespace std; +using namespace nonltr; + +namespace nonltr { + +template +class KmerHashTable: public ITableView { + +protected: + /* Fields */ + static const int maxKeyLength = 15; + int k; + + + I maxTableSize; + + // The hashed values, i.e. the values of the hash table. + // The index is the 4ry representation of the key + V * values; + V initialValue; + +private: + // [4^0, 4^1, ... 
, 4^(k-1)] + I * bases; + I * mMinusOne; + void initialize(int, V); + +public: + /* Methods */ + KmerHashTable(int); + KmerHashTable(int, V); + + virtual ~KmerHashTable(); + + I hash(const char *); + I hash(const char *, int); + void hash(const char *, int, int, vector *); + + void insert(const char*, V); + void insert(const char*, int, V); + void insert(I, V); + + void increment(const char*); + void increment(const char*, int); + void wholesaleIncrement(const char*, int, int); + + void addReverseComplement(); + I countNonInitialEntries(); + vector *getKeys(); + void printTable(string); + void checkOverflow(); + + /*Vritual methods from ITableView*/ + virtual V valueOf(const char*); + virtual V valueOf(const char*, int); + virtual V valueOf(I); + virtual void wholesaleValueOf(const char *, int, int, vector *); + virtual void wholesaleValueOf(const char *, int, int, vector *, int); + + virtual int getK(); + virtual I getMaxTableSize(); + virtual V getMaxValue(); + virtual const V * getValues() const; +}; +} + +#include "KmerHashTable.cpp" + +#endif /* KMERHASHTABLE_H_ */ diff --git a/src/nonltr/LocationList.cpp b/src/nonltr/LocationList.cpp new file mode 100644 index 0000000..4b93d36 --- /dev/null +++ b/src/nonltr/LocationList.cpp @@ -0,0 +1,153 @@ +/* + * LocationList.cpp + * + * Created on: Feb 19, 2015 + * Author: Hani Zakaria Girgis, PhD + * + * + * An instance of this class holds a list of merged locations. 
+ */ + +#include "LocationList.h" + +namespace nonltr { + +LocationList::LocationList(string chromNameIn) { + chromName = chromNameIn; + regionList = new vector(); + merge(); +} + +LocationList::~LocationList() { + Util::deleteInVector(regionList); + delete regionList; +} + +void LocationList::add(int start, int end) { + regionList->push_back(new Location(start, end)); +} + +void LocationList::merge() { + int regionCount = regionList->size(); + int gg = 0; + while (gg < regionCount) { + ILocation * region = regionList->at(gg); + + int regionStart = region->getStart(); + int regionEnd = region->getEnd(); + + if (gg > 0) { + ILocation * pRegion = regionList->at(gg - 1); + int pStart = pRegion->getStart(); + int pEnd = pRegion->getEnd(); + + if (Util::isOverlapping(pStart, pEnd, regionStart, regionEnd)) { + pRegion->setEnd(regionEnd > pEnd ? regionEnd : pEnd); + regionList->erase(regionList->begin() + gg); + delete region; + regionCount = regionList->size(); + } else { + gg++; + } + } + + if (gg == 0) { + gg++; + } + } +} + +void LocationList::mergeWithAnotherList( + const vector * const otherList) { + //A pre-condition: Ensure that the other list is sorted + for (int h = 1; h < otherList->size(); h++) { + if (otherList->at(h)->getStart() < otherList->at(h - 1)->getStart()) { + throw InvalidStateException( + string("LocationList - The other list is not sorted.")); + } + } + + // Start + vector * mergedList = new vector(); + + int i = 0; + int j = 0; + int iLimit = regionList->size(); + int jLimit = otherList->size(); + + // Continue until one list is finished + while (i < iLimit && j < jLimit) { + ILocation * iLoc = regionList->at(i); + ILocation * jLoc = otherList->at(j); + + if (iLoc->getStart() < jLoc->getStart()) { + mergedList->push_back(iLoc); + i++; + } else { + mergedList->push_back(new Location(*jLoc)); + j++; + } + } + + // Once one list is finished, copy the rest of the other list + if (i == iLimit) { + for (; j < jLimit; j++) { + mergedList->push_back(new 
Location(*(otherList->at(j)))); + } + } else if (j == jLimit) { + for (; i < iLimit; i++) { + mergedList->push_back(regionList->at(i)); + } + } + + // Once done + // Util::deleteInVector(regionList); + regionList->clear(); // Need to test this line + delete regionList; + regionList = mergedList; + + merge(); + + //A post-condition: Ensure that the list is sorted + for (int h = 1; h < regionList->size(); h++) { + if (regionList->at(h)->getStart() < regionList->at(h - 1)->getStart()) { + throw InvalidStateException(string("This list is not sorted.")); + } + } +} + +void LocationList::print() { + cout << endl << chromName << endl; + for (int i = 0; i < regionList->size(); i++) { + int s = regionList->at(i)->getStart(); + int e = regionList->at(i)->getEnd(); + cout << s << "-" << e << endl; + } +} + +const vector * LocationList::getList() { + return regionList; +} + +void LocationList::convertToRedFormat() { + trim(1); +} + +void LocationList::trim(int x) { + for (int i = regionList->size() - 1; i >= 0; i--) { + ILocation * region = regionList->at(i); + int start = region->getStart(); + int newEnd = region->getEnd() - x; + + if (newEnd < 0 || start > newEnd) { + regionList->erase(regionList->begin() + i); + delete region; + } else { + region->setEnd(newEnd); + } + } +} + +} + +/* namespace nonltr */ diff --git a/src/nonltr/LocationList.h b/src/nonltr/LocationList.h new file mode 100644 index 0000000..1f5202a --- /dev/null +++ b/src/nonltr/LocationList.h @@ -0,0 +1,53 @@ +/* + * LocationList.h + * + * Created on: Feb 19, 2015 + * Author: Hani Z. 
Girgis, PhD + */ + +#ifndef SRC_NONLTR_LOCATIONLIST_H_ +#define SRC_NONLTR_LOCATIONLIST_H_ + +#include +#include "../utility/Util.h" +#include "../utility/ILocation.h" +#include "../utility/Location.h" +#include "../exception/InvalidStateException.h" + +using namespace std; +using namespace utility; +using namespace exception; + +namespace nonltr { + +class LocationList { +private: + string chromName; + vector * regionList; + void merge(); + +public: + LocationList(string); + virtual ~LocationList(); + + void add(int, int); + + /** + * Take a sorted list + */ + void mergeWithAnotherList(const vector * const); + + + /** + * Print locations + */ + void print(); + + const vector * getList(); + void convertToRedFormat(); + void trim(int ); +}; + +} /* namespace nonltr */ + +#endif /* SRC_NONLTR_LOCATIONLIST_H_ */ diff --git a/src/nonltr/LocationListCollection.cpp b/src/nonltr/LocationListCollection.cpp new file mode 100644 index 0000000..14c7a05 --- /dev/null +++ b/src/nonltr/LocationListCollection.cpp @@ -0,0 +1,101 @@ +/* + * LocationListCollection.cpp + * + * Created on: Feb 19, 2015 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "LocationListCollection.h" + +namespace nonltr { + +LocationListCollection::LocationListCollection(string fileNameIn) { + fileName = fileNameIn; + collection = new map(); + readCoordinates(); +} + +LocationListCollection::~LocationListCollection() { + collection->clear(); + delete collection; +} + +void LocationListCollection::readCoordinates() { + Util::checkFile(fileName); + + ifstream in(fileName.c_str()); + LocationList * locList; + string previousChromName(""); + + while (in.good()) { + string line; + getline(in, line); + + if (line.compare(string("")) != 0) { + int colIndex = line.find_last_of(':'); + int dashIndex = line.find_last_of('-'); + + string chromName = line.substr(0, colIndex); + + if (previousChromName.compare(chromName) != 0) { + + cout << "Processing regions of " << chromName << endl; + + locList = new 
LocationList(chromName); + collection->insert( + map::value_type(chromName, + locList)); + + previousChromName = chromName; + } + + int start = + atoi( + line.substr(colIndex + 1, dashIndex - colIndex - 1).c_str()); + int end = atoi(line.substr(dashIndex + 1).c_str()); + locList->add(start, end); + } + } + + in.close(); +} + +void LocationListCollection::print() { + map::iterator itr_s = collection->begin(); + map::iterator itr_e = collection->end(); + while (itr_s != itr_e) { + collection->at(itr_s->first)->print(); + ++itr_s; + } +} + +LocationList * const LocationListCollection::getLocationList(string header) { + if (collection->count(header) == 0) { + string msg("Regions of "); + msg.append(header); + msg.append(" cannot be found.\n"); + throw InvalidStateException(msg); + } + + return collection->at(header); +} + +void LocationListCollection::convertToRedFormat() { + map::iterator itr_s = collection->begin(); + map::iterator itr_e = collection->end(); + while (itr_s != itr_e) { + collection->at(itr_s->first)->convertToRedFormat(); + ++itr_s; + } +} + +void LocationListCollection::trim(int x) { + map::iterator itr_s = collection->begin(); + map::iterator itr_e = collection->end(); + while (itr_s != itr_e) { + collection->at(itr_s->first)->trim(x); + ++itr_s; + } +} + +} /* namespace nonltr */ diff --git a/src/nonltr/LocationListCollection.h b/src/nonltr/LocationListCollection.h new file mode 100644 index 0000000..2461e97 --- /dev/null +++ b/src/nonltr/LocationListCollection.h @@ -0,0 +1,41 @@ +/* + * LocationListCollection.h + * + * Created on: Feb 19, 2015 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ +#define SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ + +#include +#include + +#include "LocationList.h" +#include "../utility/Util.h" +#include "../exception/InvalidStateException.h" + +using namespace std; +using namespace utility; + +namespace nonltr { + +class LocationListCollection { + +private: + string fileName; + map * 
collection; + void readCoordinates(); + +public: + LocationListCollection(string); + virtual ~LocationListCollection(); + LocationList * const getLocationList(string); + void print(); + void convertToRedFormat(); + void trim(int ); +}; + +} /* namespace nonltr */ + +#endif /* SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ */ diff --git a/src/nonltr/Scanner.cpp b/src/nonltr/Scanner.cpp new file mode 100644 index 0000000..8a24070 --- /dev/null +++ b/src/nonltr/Scanner.cpp @@ -0,0 +1,379 @@ +/* + * Scanner.cpp + * + * Created on: Aug 19, 2013 + * Author: Hani Zakaria Girgis, PhD + */ +#include "Scanner.h" + +namespace nonltr { + +Scanner::Scanner(HMM * hmmIn, int kIn, ChromosomeOneDigit * chromIn, + string scoresFile) { + // ToDo: Fix this operation + string msg("Scanning file of scores is temporarily disabled."); + throw InvalidOperationException(msg); + + hmm = hmmIn; + k = kIn; + chrom = chromIn; + segmentList = chrom->getSegment(); + scorer = NULL; + scoreList = new vector(); + ifstream in(scoresFile.c_str()); + if (in) { + string header; + getline(in, header); + + string score; + while (in >> score) { + scoreList->push_back(atoi(score.c_str())); + } + in.close(); + } else { + string msg(scoresFile); + msg.append(" does not exist."); + throw FileDoesNotExistException(msg); + } + + regionList = new vector(); + + // Start scanning + start(); +} + +Scanner::Scanner(HMM * hmmIn, int kIn, ChromosomeOneDigit * chromIn, + ITableView * table) { + hmm = hmmIn; + k = kIn; + + chrom = chromIn; + segmentList = chrom->getSegment(); + scorer = new Scorer(chrom, table); + scorer->takeLog(hmm->getBase()); + scoreList = scorer->getScores(); + regionList = new vector(); + + // Start scanning + start(); +} + +Scanner::~Scanner() { + if (scorer == NULL) { + scoreList->clear(); + delete scoreList; + } else { + delete scorer; + } + + Util::deleteInVector(regionList); + delete regionList; +} + +void Scanner::start() { + check(); + + decode(); + + extendByK(); + + merge(); +} + +void 
Scanner::check() { + if (chrom->size() != scoreList->size()) { + string msg("The size of the sequence is not the same as the size of "); + msg.append("the scores. The size of sequence is: "); + msg.append(Util::int2string(chrom->size())); + msg.append(". The size of the scores is: "); + msg.append(Util::int2string(scoreList->size())); + msg.append("."); + throw InvalidStateException(msg); + } +} + +void Scanner::decode() { + int segmentCount = segmentList->size(); + for (int tt = 0; tt < segmentCount; tt++) { + vector * segment = segmentList->at(tt); + hmm->decode(segment->at(0), segment->at(1), scoreList, *regionList); + } +} + +void Scanner::extendByK() { + int regionCount = regionList->size(); + if (regionCount > 0) { + int firstCandIndex = 0; + int lastCandIndex = 0; + int segmentNumber = segmentList->size(); + for (int i = 0; i < segmentNumber; i++) { + vector * s = segmentList->at(i); + ILocation * c = regionList->at(firstCandIndex); + // Sometimes a segment have no repeats + if (Util::isOverlapping(s->at(0), s->at(1), c->getStart(), + c->getEnd())) { + lastCandIndex = extendByKHelper(s->at(0), s->at(1), + firstCandIndex); + firstCandIndex = lastCandIndex + 1; + if (firstCandIndex >= regionCount) { + break; + } + } + } + } +} + +int Scanner::extendByKHelper(int segStart, int segEnd, int firstCandIndex) { + ILocation * cand = regionList->at(firstCandIndex); + + // Make sure that the first region is overlapping with the segment + if (!Util::isOverlapping(segStart, segEnd, cand->getStart(), + cand->getEnd())) { + string msg("The first region is not overlapping with the segment."); + msg.append(" Region: "); + msg.append(Util::int2string(cand->getStart())); + msg.append(":"); + msg.append(Util::int2string(cand->getEnd())); + msg.append(" Segment: "); + msg.append(Util::int2string(segStart)); + msg.append(":"); + msg.append(Util::int2string(segEnd)); + throw InvalidInputException(msg); + } + + int lastCandIndex = -1; + int candidateNumber = regionList->size(); + 
for (int c = firstCandIndex; c < candidateNumber; c++) { + ILocation * cand = regionList->at(c); + if (Util::isOverlapping(segStart, segEnd, cand->getStart(), + cand->getEnd())) { + int newEnd = cand->getEnd() + k - 1; + if (newEnd > segEnd) { + newEnd = segEnd; + } + cand->setEnd(newEnd); + lastCandIndex = c; + } else { + break; + } + } + + if (lastCandIndex < 0) { + string msg("The index of the last region cannot be negative."); + throw InvalidStateException(msg); + } + + return lastCandIndex; +} + +void Scanner::merge() { + int regionCount = regionList->size(); + int gg = 0; + while (gg < regionCount) { + ILocation * region = regionList->at(gg); + + int regionStart = region->getStart(); + int regionEnd = region->getEnd(); + + if (gg > 0) { + ILocation * pRegion = regionList->at(gg - 1); + int pStart = pRegion->getStart(); + int pEnd = pRegion->getEnd(); + + if (Util::isOverlapping(pStart, pEnd, regionStart, regionEnd)) { + pRegion->setEnd(regionEnd > pEnd ? regionEnd : pEnd); + regionList->erase(regionList->begin() + gg); + delete region; + regionCount = regionList->size(); + } else { + gg++; + } + } + + if (gg == 0) { + gg++; + } + } +} + +void Scanner::mergeWithOtherRegions(const vector * otherList) { + vector * mergedList = new vector(); + + int i = 0; + int j = 0; + int iLimit = regionList->size(); + int jLimit = otherList->size(); + + // Continue until one list is finished + while (i < iLimit && j < jLimit) { + ILocation * iLoc = regionList->at(i); + ILocation * jLoc = otherList->at(j); + + if (iLoc->getStart() < jLoc->getStart()) { + mergedList->push_back(iLoc); + i++; + } else { + mergedList->push_back(new Location(*jLoc)); + j++; + } + } + + // Once one list is finished, copy the rest of the other list + if (i == iLimit) { + for (; j < jLimit; j++) { + mergedList->push_back(new Location(*(otherList->at(j)))); + } + } else if (j == jLimit) { + for (; i < iLimit; i++) { + mergedList->push_back(regionList->at(i)); + } + } + + // Once done + // 
Util::deleteInVector(regionList); + // @@ Need to be tested + regionList->clear(); + delete regionList; + regionList = mergedList; + + merge(); + + //Ensure that the list is sorted + for (int h = 1; h < regionList->size(); h++) { + if (regionList->at(h)->getStart() < regionList->at(h - 1)->getStart()) { + throw InvalidStateException(string("This list is not sorted.")); + } + } +} + +void Scanner::makeForwardCoordinates() { + int regionNum = regionList->size(); + int lastBase = chrom->size() - 1; + + // Calculate the coordinate on the main strand + for (int i = 0; i < regionNum; i++) { + ILocation * oldLoc = regionList->at(i); + regionList->at(i) = new Location(lastBase - oldLoc->getEnd(), + lastBase - oldLoc->getStart()); + delete oldLoc; + } + + // Reverse the regions within the list + int lastRegion = regionNum - 1; + int middle = regionNum / 2; + for (int i = 0; i < middle; i++) { + ILocation * temp = regionList->at(lastRegion - i); + regionList->at(lastRegion - i) = regionList->at(i); + regionList->at(i) = temp; + } + +} + +/** + * Warning: this method prints the logarithm values of the scores + */ +void Scanner::printScores(string outputFile, bool canAppend) { + cout << "Printing the logarithmic values of the scores "; + cout << "NOT the original scores." << endl; + + ofstream outScores; + if (canAppend) { + outScores.open(outputFile.c_str(), ios::out | ios::app); + } else { + outScores.open(outputFile.c_str(), ios::out); + } + + int step = 50; + outScores << chrom->getHeader() << endl; + int len = scoreList->size(); + for (int i = 0; i < len; i = i + step) { + int e = (i + step - 1 > len - 1) ? 
len - 1 : i + step - 1; + for (int k = i; k <= e; k++) { + outScores << scoreList->at(k) << " "; + } + outScores << endl; + } + outScores << endl; + outScores.close(); +} + +void Scanner::printIndex(string outputFile, bool canAppend, int frmt) { + + if(frmt != FRMT_POS && frmt != FRMT_BED){ + string msg("Unknown output format: "); + msg.append(Util::int2string(frmt)); + msg.append(". The known formats are: "); + msg.append(Util::int2string(FRMT_POS)); + msg.append(" and "); + msg.append(Util::int2string(FRMT_BED)); + msg.append("."); + throw InvalidInputException(msg); + } + + ofstream outIndex; + if (canAppend) { + outIndex.open(outputFile.c_str(), ios::out | ios::app); + } else { + outIndex.open(outputFile.c_str(), ios::out); + } + + // Write the index of the repeat segment [x,y[ + string header = chrom->getHeader(); + + if(frmt == FRMT_POS){ + for (int j = 0; j < regionList->size(); j++) { + outIndex << header << ":"; + outIndex << ((int) (regionList->at(j)->getStart())) << "-"; + outIndex << ((int) (regionList->at(j)->getEnd() + 1)); + outIndex << endl; + } + }else if(frmt == FRMT_BED){ + for (int j = 0; j < regionList->size(); j++) { + outIndex << header << "\t"; + outIndex << ((int) (regionList->at(j)->getStart())) << "\t"; + outIndex << ((int) (regionList->at(j)->getEnd() + 1)); + outIndex << endl; + } + } + + outIndex.close(); +} + +void Scanner::printMasked(string outputFile, Chromosome& oChrom, + bool canAppend) { + + string baseCopy = *(oChrom.getBase()); + int regionCount = regionList->size(); + for (int j = 0; j < regionCount; j++) { + for (int h = regionList->at(j)->getStart(); + h <= regionList->at(j)->getEnd(); h++) { + baseCopy[h] = tolower(baseCopy[h]); + } + } + + ofstream outMask; + + if (canAppend) { + outMask.open(outputFile.c_str(), ios::out | ios::app); + } else { + outMask.open(outputFile.c_str(), ios::out); + } + + outMask << oChrom.getHeader() << endl; + int step = 50; + int len = baseCopy.size(); + for (int i = 0; i < len; i = i + step) 
{ + int e = (i + step - 1 > len - 1) ? len - 1 : i + step - 1; + for (int k = i; k <= e; k++) { + outMask << baseCopy[k]; + } + outMask << endl; + } + outMask.close(); +} + +const vector* Scanner::getRegionList() { + return regionList; +} + +} /* namespace nonltr */ diff --git a/src/nonltr/Scanner.h b/src/nonltr/Scanner.h new file mode 100644 index 0000000..683de7e --- /dev/null +++ b/src/nonltr/Scanner.h @@ -0,0 +1,71 @@ +/* + * Scanner.h + * + * Created on: Aug 19, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef SCANNER_H_ +#define SCANNER_H_ + +#include +#include +#include + +#include "Chromosome.h" +#include "ChromosomeOneDigit.h" +#include "HMM.h" +#include "ITableView.h" +#include "Scorer.h" +#include "../utility/Util.h" +#include "../utility/ILocation.h" +#include "../utility/Location.h" +#include "../exception/InvalidInputException.h" +#include "../exception/InvalidStateException.h" +#include "../exception/FileDoesNotExistException.h" +#include "../exception/InvalidOperationException.h" + +using namespace std; +using namespace utility; +using namespace exception; + +namespace nonltr { + +class Scanner { +private: + //string chromFile; + ChromosomeOneDigit * chrom; + const vector *> * segmentList; + Scorer * scorer; + vector * scoreList; + vector * regionList; + int k; + HMM * hmm; + // bool isTrainMode; + + // Methods + void start(); + void check(); + void decode(); + void extendByK(); + int extendByKHelper(int, int, int); + void merge(); + +public: + static const int FRMT_POS = 1; + static const int FRMT_BED = 2; + + Scanner(HMM *, int, ChromosomeOneDigit *, string); + Scanner(HMM *, int, ChromosomeOneDigit *, ITableView *); + virtual ~Scanner(); + void makeForwardCoordinates(); + + void printScores(string, bool); + void printIndex(string, bool, int); + void printMasked(string, Chromosome&, bool); + void mergeWithOtherRegions(const vector *); + const vector* getRegionList(); +}; + +} /* namespace nonltr */ +#endif /* SCANNER_H_ */ diff --git 
a/src/nonltr/Scorer.cpp b/src/nonltr/Scorer.cpp new file mode 100644 index 0000000..947f9b8 --- /dev/null +++ b/src/nonltr/Scorer.cpp @@ -0,0 +1,143 @@ +/* + * Scorer.cpp + * + * Created on: Aug 3, 2012 + * Author: Hani Zakaria Girgis, PhD + */ +#include "Scorer.h" + +Scorer::Scorer(ChromosomeOneDigit * chromIn, + ITableView * const table) { + chrom = chromIn; + kmerTable = table; + scores = new vector(chrom->getBase()->size(), 0); + k = kmerTable->getK(); + max = -1; + score(); + calculateMax(); +} + +Scorer::~Scorer() { + scores->clear(); + delete scores; +} + +/** + * This method scores each nucleotide in the chromosome. + * The nucleotides represented by 'N' are assigned zero. + */ +void Scorer::score() { + const vector *> * segment = chrom->getSegment(); + const char * segBases = chrom->getBase()->c_str(); + + for (int s = 0; s < segment->size(); s++) { + int start = segment->at(s)->at(0); + int end = segment->at(s)->at(1); + kmerTable->wholesaleValueOf(segBases, start, end - k + 1, scores, + start); + + // Handle the last word from end - k + 2 till the end, inclusive. + for (int i = end - k + 2; i <= end; i++) { + (*scores)[i] = scores->at(i - 1); + } + } +} + +/** + * This method takes the logarithm of the scores according to the base. + * If the score equals zero, it is left the same. + */ +void Scorer::takeLog(double base) { + // Handle the case where base is one + bool isOne = false; + if (fabs(base - 1.0) < std::numeric_limits::epsilon()) { + isOne = true; + } + double logBase = isOne ? 
log(1.5) : log(base); + + const vector *> * segment = chrom->getSegment(); + for (int s = 0; s < segment->size(); s++) { + int start = segment->at(s)->at(0); + int end = segment->at(s)->at(1); + for (int h = start; h <= end; h++) { + int score = scores->at(h); + + if (score != 0) { + if (!isOne || (isOne && score > 1)) { + (*scores)[h] = ceil(log(score) / logBase); + } + } + } + } +} + +int Scorer::getK() { + return k; +} + +vector* Scorer::getScores() { + return scores; +} + +void Scorer::printScores(string outputFile, bool canAppend) { + ofstream outScores; + if (canAppend) { + outScores.open(outputFile.c_str(), ios::out | ios::app); + } else { + outScores.open(outputFile.c_str(), ios::out); + } + + int step = 50; + outScores << chrom->getHeader() << endl; + int len = scores->size(); + for (int i = 0; i < len; i = i + step) { + int e = (i + step - 1 > len - 1) ? len - 1 : i + step - 1; + for (int k = i; k <= e; k++) { + outScores << scores->at(k) << " "; + } + outScores << endl; + } + outScores << endl; + + outScores.close(); +} + +int Scorer::countLessOrEqual(int thr) { + int count = 0; + const vector *> * segment = chrom->getSegment(); + for (int s = 0; s < segment->size(); s++) { + int start = segment->at(s)->at(0); + int end = segment->at(s)->at(1); + for (int h = start; h <= end; h++) { + if (scores->at(h) <= thr) { + count++; + } + } + } + return count; +} + +void Scorer::calculateMax() { + const vector *> * segmentList = chrom->getSegment(); + int segmentCount = segmentList->size(); + for (int jj = 0; jj < segmentCount; jj++) { + vector * segment = segmentList->at(jj); + int start = segment->at(0); + int end = segment->at(1); + for (int ss = start; ss <= end; ss++) { + int score = scores->at(ss); + if (score > max) { + max = score; + } + } + } + + if (max == -1) { + string msg("Error occurred while finding the maximum score."); + throw InvalidStateException(msg); + } +} + +int Scorer::getMax() { + return max; +} diff --git a/src/nonltr/Scorer.h 
b/src/nonltr/Scorer.h new file mode 100644 index 0000000..06daaf4 --- /dev/null +++ b/src/nonltr/Scorer.h @@ -0,0 +1,54 @@ +/* + * Scorer.h + * + * Created on: Aug 3, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef SCORER_H_ +#define SCORER_H_ + +#include +#include +#include +#include +#include + +#include "ITableView.h" +#include "ChromosomeOneDigit.h" +#include "../utility/Util.h" +#include "../exception/InvalidStateException.h" + +using namespace std; +using namespace nonltr; +using namespace utility; +using namespace exception; + +namespace nonltr { +class Scorer { +private: + /* Fields */ + ChromosomeOneDigit * chrom; + ITableView * kmerTable; + vector * scores; + int k; + int max; + + /* Methods */ + void score(); + void calculateMax(); + +public: + /* Methods */ + Scorer(ChromosomeOneDigit *, ITableView *); + virtual ~Scorer(); + void printScores(string, bool); + vector* getScores(); + int getK(); + void takeLog(double); + int countLessOrEqual(int); + int getMax(); +}; +} + +#endif /* Scorer_H_ */ diff --git a/src/nonltr/TableBuilder.cpp b/src/nonltr/TableBuilder.cpp new file mode 100644 index 0000000..32733a9 --- /dev/null +++ b/src/nonltr/TableBuilder.cpp @@ -0,0 +1,121 @@ +/* + * TableBuilder.cpp + * + * Created on: Jul 31, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "TableBuilder.h" + +TableBuilder::TableBuilder(string dir, int motifSize, int order, int minObs) { + genomeDir = dir; + k = motifSize; + genomeLength = 0; + // kmerTable = new KmerHashTable(k); + // kmerTable = new EnrichmentView(k); + + // Whenever you change the template, modify line 50 and 70 and the header file line 35 + kmerTable = new EnrichmentMarkovView(k, order, minObs); + + buildTable(); +} + +TableBuilder::~TableBuilder() { + delete kmerTable; +} + +void TableBuilder::buildTable() { + vector * fileList = new vector(); + Util::readChromList(genomeDir, fileList, "fa"); + + for (int i = 0; i < fileList->size(); i++) { + cout << "Counting k-mers in " << 
fileList->at(i) << " ..." << endl; + ChromListMaker * maker = new ChromListMaker(fileList->at(i)); + const vector * chromList = maker->makeChromOneDigitList(); + + for (int h = 0; h < chromList->size(); h++) { + ChromosomeOneDigit * chrom = + dynamic_cast(chromList->at(h)); + if (chrom) { + genomeLength += chrom->getEffectiveSize(); + updateTable(chrom); + } else { + throw InvalidStateException(string("Dynamic cast failed.")); + } + } + + delete maker; + } + // Check if overflow has occurred + kmerTable->checkOverflow(); + + // View + // EnrichmentView * view = dynamic_cast(kmerTable); + EnrichmentMarkovView * view = + dynamic_cast *>(kmerTable); + + if (view) { + view->generateProbapilities(); + view->processTable(); + maxValue = view->getMaxValue(); + } else { + throw InvalidStateException(string("Dynamic cast failed.")); + } + cout << "Enrichment view is ready." << endl; + + fileList->clear(); + delete fileList; + + /* If you would like to see the contents of the table.*/ + // kmerTable-> printTable(); +} + +void TableBuilder::updateTable(ChromosomeOneDigit * chrom) { + // EnrichmentView * view = dynamic_cast(kmerTable); + EnrichmentMarkovView * view = + dynamic_cast *>(kmerTable); + + const vector *> * segment = chrom->getSegment(); + const char * segBases = chrom->getBase()->c_str(); + + for (int s = 0; s < segment->size(); s++) { + int start = segment->at(s)->at(0); + int end = segment->at(s)->at(1); + // cerr << "The segment length is: " << (end-start+1) << endl; + + // Fast, but require some memory proportional to the segment length. 
+ kmerTable->wholesaleIncrement(segBases, start, end - k + 1); + if (view) { + view->count(segBases, start, end); + } else { + throw InvalidStateException(string("Dynamic cast failed.")); + } + + // Slow, but memory efficient + /* + vector hashList = vector(); + kmerTable->hash(segBases, start, end - k + 1, &hashList); + + for (int i = start; i <= end - k + 1; i++) { + kmerTable->increment(segBases, i); + } + */ + } +} + +KmerHashTable * const TableBuilder::getKmerTable() { + return kmerTable; +} + +long TableBuilder::getGenomeLength() { + if (genomeLength < 0) { + string msg("The length of the genome cannot be negative."); + throw InvalidStateException(msg); + } + + return genomeLength; +} + +int TableBuilder::getMaxValue() { + return maxValue; +} diff --git a/src/nonltr/TableBuilder.h b/src/nonltr/TableBuilder.h new file mode 100644 index 0000000..1041f3d --- /dev/null +++ b/src/nonltr/TableBuilder.h @@ -0,0 +1,68 @@ +/* + * TableBuilder.h + * + * Created on: Jul 31, 2012 + * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH + */ + +#ifndef TABLEBUILDER_H_ +#define TABLEBUILDER_H_ + +#include "KmerHashTable.h" +#include "EnrichmentMarkovView.h" +#include "ChromosomeOneDigit.h" +#include "ChromListMaker.h" +#include "IChromosome.h" + +#include "../utility/Util.h" +#include "../exception/InvalidStateException.h" + +#include + +using namespace std; +using namespace nonltr; +using namespace utility; +using namespace exception; + +namespace nonltr { +class TableBuilder { +private: + /** + * k-mer table + */ + KmerHashTable * kmerTable; + int maxValue; + + /** + * Directory including the FASTA files comprising the genome. 
+ * These files must have the + */ + string genomeDir; + + /** + * The size of the motif + */ + int k; + + /** + * The total length of the whole genome + */ + long genomeLength; + + /** + * Methods + */ + void buildTable(); + void updateTable(ChromosomeOneDigit *); + +public: + TableBuilder(string, int, int, int); + virtual ~TableBuilder(); + KmerHashTable * const getKmerTable(); + void printTable(); + long getGenomeLength(); + int getMaxValue(); +}; +} + +#endif /* TABLEBUILDER_H_ */ diff --git a/src/nonltr/Trainer.cpp b/src/nonltr/Trainer.cpp new file mode 100644 index 0000000..3e8865f --- /dev/null +++ b/src/nonltr/Trainer.cpp @@ -0,0 +1,278 @@ +/* + * Trainer.cpp + * + * Created on: Aug 20, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "Trainer.h" + +namespace nonltr { + +// Pass the isCND and the isCON parameters + +Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, + double tIn, string candidateDirIn, int m) : minObs(m) { + candidateDir = candidateDirIn; + canPrintCandidates = true; + isCND = true; + isCON = false; + initialize(genomeDirIn, orderIn, kIn, sIn, tIn); +} + +Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, + double tIn, string candidateDirIn, bool isCNDIn, string otherDirIn, int m) : minObs(m) { + candidateDir = candidateDirIn; + canPrintCandidates = true; + isCND = isCNDIn; + isCON = true; + otherDir = otherDirIn; + initialize(genomeDirIn, orderIn, kIn, sIn, tIn); +} + +Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, + double tIn, int m) : minObs(m) { + canPrintCandidates = false; + isCND = true; + isCON = false; + initialize(genomeDirIn, orderIn, kIn, sIn, tIn); +} + +Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, + double tIn, bool isCNDIn, string otherDirIn, int m) : minObs(m) { + canPrintCandidates = false; + isCND = isCNDIn; + isCON = true; + otherDir = otherDirIn; + initialize(genomeDirIn, orderIn, kIn, sIn, tIn); +} + +void 
Trainer::initialize(string genomeDirIn, int orderIn, int kIn, double sIn, + double tIn) { + + if (isCND == false && isCON == false) { + string msg("Training using the candidates or the other repeats is required. "); + msg.append("Please specify which regions to be used for training. "); + msg.append("Any of the two sets or a combination of both can be used."); + throw InvalidStateException(msg); + } + + genomeDir = genomeDirIn; + fileList = new vector(); + Util::readChromList(genomeDir, fileList, string("fa")); + chromCount = fileList->size(); + order = orderIn; + k = kIn; + s = sIn; + t = tIn; + p = 0.0; + tDetector = tIn + 0.1; + max = -1; + + stage1(); + + if (isCND) { + stage2(); + } + stage3(); +} + +Trainer::~Trainer() { + fileList->clear(); + delete fileList; + delete builder; + delete hmm; +} + +/** + * Stage 1: Building the table + */ +void Trainer::stage1() { + cout << endl << endl; + cout << "Stage 1: Building the table ..." << endl; + builder = new TableBuilder(genomeDir, k, order, minObs); + table = builder->getKmerTable(); + genomeLength = builder->getGenomeLength(); + max = builder->getMaxValue(); +} + +void Trainer::stage2() { + cout << endl << endl; + cout << "Stage 2: Calculating the percentage ..." 
<< endl; + + double effectiveSize = 0.0; + double countLessOrEqual = 0.0; + for (int i = 0; i < chromCount; i++) { + cout << "Calculating the percentage in: " << fileList->at(i) << " ..."; + cout << endl; + ChromListMaker * maker = new ChromListMaker(fileList->at(i)); + const vector * chromList = maker->makeChromOneDigitList(); + + for (int h = 0; h < chromList->size(); h++) { + ChromosomeOneDigit * chrom = + dynamic_cast(chromList->at(h)); + Scorer * scorer = new Scorer(chrom, table); + + effectiveSize += chrom->getEffectiveSize(); + countLessOrEqual += scorer->countLessOrEqual(t); + + delete scorer; + } + delete maker; + } + + if (effectiveSize == 0) { + string msg("The size of the genome cannot be zero."); + throw InvalidStateException(msg); + } else { + p = 100.00 * countLessOrEqual / effectiveSize; + cout << "The percentage is " << p << endl; + if (p < 52.5) { + p = 52.5; + cout << "The percentage is increased to " << p << endl; + } + } +} + +/** + * Stage 3: Training + */ +void Trainer::stage3() { + cout << endl << endl; + cout << "Stage 3: Training ..." << endl; + + // Handle the case when the threshold is one. + bool isOne = false; + if (fabs(t - 1.0) < std::numeric_limits::epsilon()) { + isOne = true; + } + double hmmBase = isOne ? 
1.5 : t; + + // Make a list of candidate HMM + int stateCount = 2 * (ceil(log(max) / log(hmmBase)) + 1); + + // Initialize the HMM + hmm = new HMM(hmmBase, stateCount); + + // Start training the models + for (int i = 0; i < chromCount; i++) { + cout << "Training on: " << fileList->at(i) << endl; + // Name of candidates file + string path(fileList->at(i)); + int slashLastIndex = path.find_last_of(Util::fileSeparator); + int dotLastIndex = path.find_last_of("."); + string nickName = path.substr(slashLastIndex + 1, dotLastIndex - slashLastIndex - 1); + + // May or may not be used + string cndFile = candidateDir + Util::fileSeparator + nickName + ".cnd"; + + // Work on the other repeats if desired + LocationListCollection * otherRegionListCollection; + bool isConRepAvailable = false; + if (isCON) { + string otherFile = otherDir + Util::fileSeparator + nickName + ".rpt"; + ifstream f1(otherFile.c_str()); + if (!f1) { + string message = string("Warning: "); + message.append(otherFile); + message.append(" does not exist. 
"); + message.append("Repeats of this sequence will not used for training the HMM."); + cout << message << endl; + } else { + otherRegionListCollection = new LocationListCollection(otherFile); + otherRegionListCollection->convertToRedFormat(); + otherRegionListCollection->trim(k - 1); + + isConRepAvailable = true; + } + f1.close(); + } + + // Read sequences in the file + ChromListMaker * maker = new ChromListMaker(fileList->at(i)); + const vector * chromList = maker->makeChromOneDigitList(); + + for (int h = 0; h < chromList->size(); h++) { + ChromosomeOneDigit * chrom = dynamic_cast(chromList->at(h)); + Scorer * scorer = new Scorer(chrom, table); + vector * scoreList = scorer->getScores(); + + // Detect candidates if desired + ChromDetectorMaxima * detector; + const vector * trainingRegionList; + bool canDeleteDetector = true; + if (isCND) { + if (canPrintCandidates) { + detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p,s, scoreList, chrom); + if (h > 0) { + bool canAppend = true; + detector->printIndex(cndFile, canAppend); + } else { + cout << "Printing candidates to: " << cndFile << endl; + detector->printIndex(cndFile); + } + } else { + detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p, s, scoreList, chrom->getSegment()); + } + trainingRegionList = detector->getRegionList(); + + + } + + if (isCON && isConRepAvailable) { + LocationList * const locList = otherRegionListCollection->getLocationList(chrom->getHeader()); + if (isCND) { + locList->mergeWithAnotherList(detector->getRegionList()); + } + trainingRegionList = locList->getList(); + + } + + // The candidate regions are already copied to the location list + if (isCND && isCON && isConRepAvailable) { + delete detector; + canDeleteDetector = false; + } + + // Train the HMM + if(isCND || (isCON && isConRepAvailable)){ + + scorer->takeLog(t); + scoreList = scorer->getScores(); + hmm->train(scoreList, chrom->getSegment(), trainingRegionList); + } + + // Free more memory + if (isCND && 
canDeleteDetector) { + delete detector; + } + delete scorer; + } + + if (isCON && isConRepAvailable) { + delete otherRegionListCollection; + } + delete maker; + } + + // Normalize HMM's once training is finished + hmm->normalize(); +} + +void Trainer::printTable(string fileName) { + table->printTable(fileName); +} + +HMM*& Trainer::getHmm() { + return hmm; +} + +KmerHashTable * Trainer::getTable() { + return table; +} + +void Trainer::printHmm(string fileName) { + hmm->print(fileName); +} + +} /* namespace nonltr */ diff --git a/src/nonltr/Trainer.h b/src/nonltr/Trainer.h new file mode 100644 index 0000000..8281343 --- /dev/null +++ b/src/nonltr/Trainer.h @@ -0,0 +1,80 @@ +/* + * Trainer.h + * + * Created on: Aug 20, 2013 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef TRAINER_H_ +#define TRAINER_H_ + +#include +#include +#include +#include +#include + +#include "TableBuilder.h" +#include "KmerHashTable.h" +#include "HMM.h" +#include "ChromDetectorMaxima.h" +#include "Scorer.h" +#include "ChromListMaker.h" +#include "LocationListCollection.h" +#include "../utility/Util.h" +#include "../exception/InvalidStateException.h" + +using namespace std; +using namespace utility; +using namespace exception; + +namespace nonltr { + +class Trainer { +private: + string genomeDir; + string candidateDir; + string otherDir; + bool canPrintCandidates; + vector * fileList; + int chromCount; + int order; + int k; + int max; // Maximum score in the entire genome + double t; // Score threshold + double tDetector; // threshold for the detector because it uses < not <=; + double p; // Percentage of scores below the threshold, t, in non-repeats + //double r; + double s; // Half width of the mask + long genomeLength; + //vector * sampleList; + TableBuilder * builder; + KmerHashTable * table; + HMM * hmm; + int isCND; + int isCON; + // The minimum number of the observed k-mers + const int minObs; + + void stage1(); + void stage2(); + void stage3(); + //void stage4(); + +public: + 
Trainer(string, int, int, double, double, string, int); + Trainer(string, int, int, double, double, string, bool, string, int); + Trainer(string, int, int, double, double, int); + Trainer(string, int, int, double, double, bool, string, int); + + void initialize(string, int, int, double, double); + virtual ~Trainer(); + void printTable(string); + void printHmm(string); + HMM*& getHmm(); + KmerHashTable * getTable(); + +}; + +} /* namespace nonltr */ +#endif /* TRAINER_H_ */ diff --git a/src/utility/AffineId.cpp b/src/utility/AffineId.cpp new file mode 100644 index 0000000..484a5bd --- /dev/null +++ b/src/utility/AffineId.cpp @@ -0,0 +1,212 @@ +/* + * AffineId.cpp + * + * Created on: Dec 6, 2012 + * Modified on: Nov 6, 2017 + * Author: Hani Zakaria Girgis, PhD + */ + +// ToDo: +// 1. Add pre-conditions after testing +#include "AffineId.h" + +#include "Util.h" +#include "../exception/InvalidInputException.h" + +#include +#include +using namespace std; +//using namespace exception; + +namespace utility { + +AffineId::AffineId(const char * seq1In, int start1In, int end1In, + const char * seq2In, int start2In, int end2In) { + + // The shorter of the two sequences is seq2 + seq1 = seq1In; + start1 = start1In; + end1 = end1In; + + seq2 = seq2In; + start2 = start2In; + end2 = end2In; + + if (end1 - start1 < end2 - start2) { + seq1 = seq2In; + start1 = start2In; + end1 = end2In; + + seq2 = seq1In; + start2 = start1In; + end2 = end1In; + } + + /* if (start1 < 0 || end1 < 0 || start1 > end1) { + string msg("Invalid Input. Start1 is "); + msg.append(Util::int2string(start1)); + msg.append(". End 1 is "); + msg.append(Util::int2string(end1)); + msg.append("."); + //throw InvalidInputException(msg); + + cerr << msg << endl; + throw exception(); + } + + if (start2 < 0 || end2 < 0 || start2 > end2) { + string msg("Invalid Input. Start2 is "); + msg.append(Util::int2string(start2)); + msg.append(". 
End2 is "); + msg.append(Util::int2string(end2)); + msg.append("."); + //throw InvalidInputException(msg); + + cerr << msg << endl; + throw exception(); + }*/ + + // Validate input + // cout << start1 << " " << end1 << endl; + // cout << start2 << " " << end2 << endl; + + len1 = end1 - start1 + 2; + len2 = end2 - start2 + 2; + + align(); +} + +AffineId::~AffineId() { +} + +void AffineId::align() { + // Initialize needed arrays + auto m = new int[len2][2](); // Middle level array + auto u = new int[len2][2](); // Upper level array + auto mId = new int[len2][2](); // Array storing number of matches in the middle array + auto uId = new int[len2][2](); // Array storing number of matches in the upper array + auto mPath = new int[len2][2](); // Array storing number of steps in the middle array + auto uPath = new int[len2][2](); // Array storing number of steps in the upper array + + // Apply the DP + // The i index is only used to get a character from the first sequence + // It is not used for filling the DP matrix + for (int i = 1; i < len1; i++) { + char base1 = seq1[start1 + i - 1]; + int lower = 0; + int lowerId = 0; + int lowerPath = 0; + + // j is the row. 
There are only two columns 0 and 1 + for (int j = 1; j < len2; j++) { + // Update the lower value + int extLower = lower + EXT; + int openLower = m[j - 1][0] + OPEN; + if (extLower > openLower) { + lower = extLower; + lowerPath++; + } else { + lower = openLower; + lowerId = mId[j - 1][0]; + lowerPath = mPath[j - 1][0] + 1; + } + + // Fill the array of the upper level + int extUpper = u[j][0] + EXT; + int openUpper = m[j][0] + OPEN; + if (extUpper > openUpper) { + u[j][1] = extUpper; + uId[j][1] = uId[j][0]; + uPath[j][1] = uPath[j][0] + 1; + } else { + u[j][1] = openUpper; + uId[j][1] = mId[j][0]; + uPath[j][1] = mPath[j][0] + 1; + } + + // Fill the array of the middle level + int matchOrMis; + if (base1 == seq2[start2 + j - 1]) { + matchOrMis = m[j - 1][0] + MATCH; + } else { + matchOrMis = m[j - 1][0] + MIS; + } + + int lowerOrUpper; + if (lower > u[j][1]) { + lowerOrUpper = lower; + } else { + lowerOrUpper = u[j][1]; + } + + if (matchOrMis > lowerOrUpper) { + m[j][1] = matchOrMis; + mPath[j][1] = mPath[j - 1][0] + 1; + if (base1 == seq2[start2 + j - 1]) { + mId[j][1] = mId[j - 1][0] + 1; + } else { + mId[j][1] = mId[j - 1][0]; + } + } else { + m[j][1] = lowerOrUpper; + if (lower > u[j][1]) { + mId[j][1] = lowerId; + mPath[j][1] = lowerPath; + } else { + mId[j][1] = uId[j][1]; + mPath[j][1] = uPath[j][1]; + } + } + } + + // // Test + // for (int h = 0; h < len2; h++) { + // cout << m[h][0] << "\t" << m[h][1] << "----" << mId[h][0] << "\t" + // << mId[h][1] << endl; + // } + // cout << "---------------------------------------------------" << endl; + // // End of test + + // Copy the second column to the first one + if (i != len1 - 1) { + for (int h = 0; h < len2; h++) { + m[h][0] = m[h][1]; + u[h][0] = u[h][1]; + mId[h][0] = mId[h][1]; + uId[h][0] = uId[h][1]; + mPath[h][0] = mPath[h][1]; + uPath[h][0] = uPath[h][1]; + } + } + } + + lenCS = mId[len2 - 1][1]; + lenPath = mPath[len2 - 1][1]; + //cout << "Alignment length = " << lenPath << endl; + delete[] u; + 
delete[] m; + delete[] mId; + delete[] uId; + delete[] mPath; + delete[] uPath; +} + +double AffineId::getAlign() { + double amt = lenCS; + return amt / (double)lenPath; +} + +} +/* namespace utility */ + +// // Testing code +// int main() { +// string s1("GATCTCAG"); +// string s2("GACAG"); + +// utility::AffineId id(s1.c_str(), 0, s1.length() - 1, s2.c_str(), 0, +// s2.length() - 1); +// cout << "Length = " << id.getLenCS() << endl; + +// return 0; +// } diff --git a/src/utility/AffineId.h b/src/utility/AffineId.h new file mode 100644 index 0000000..61173e7 --- /dev/null +++ b/src/utility/AffineId.h @@ -0,0 +1,50 @@ +/* + * AffineId.h + * + * Created on: Dec 6, 2012 + * Modified on: Nov 6, 2017 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef AFFINEID_H_ +#define AFFINEID_H_ + +namespace utility { + +class AffineId { +private: + const char * seq1; + int start1; + int end1; + const char * seq2; + int start2; + int end2; + + int len1; + int len2; + //int lenTotal; + int lenCS; + int lenPath; + int * m; // Middle level + //int * l; // Lower level + int * u; // Upper level + + // const int MATCH = 4; // Score of a match + // const int MIS = -4; // Score of a mismatch + // const int OPEN = -2; // Score of a gap opening + // const int EXT = -1; // Score of a gap extension + + const int MATCH = 1; + const int MIS = -1; + const int OPEN = -2; + const int EXT = -1; + void align(); + +public: + AffineId(const char *, int, int, const char *, int, int); + virtual ~AffineId(); + double getAlign(); +}; + +} /* namespace utility */ +#endif /* AFFINEID_H_ */ diff --git a/src/utility/EmptyLocation.cpp b/src/utility/EmptyLocation.cpp new file mode 100644 index 0000000..38e8920 --- /dev/null +++ b/src/utility/EmptyLocation.cpp @@ -0,0 +1,53 @@ +/* + * EmptyLocation.cpp + * + * Created on: Dec 28, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "EmptyLocation.h" +#include "../exception/InvalidOperationException.h" + +using namespace exception; + +namespace utility { + 
+EmptyLocation * EmptyLocation::INSTANCE = new EmptyLocation(); + +EmptyLocation * EmptyLocation::getInstance(){ + return INSTANCE; +} + +EmptyLocation::EmptyLocation() { + msg = new string("Empty location does not allow this operation."); +} + +EmptyLocation::~EmptyLocation() { + delete msg; +} + +string EmptyLocation::toString() { + return string("Empty"); +} + +int EmptyLocation::getEnd() const { + throw InvalidOperationException(*msg); +} + +int EmptyLocation::getStart() const { + throw InvalidOperationException(*msg); +} + +void EmptyLocation::setEnd(int int1) { + throw InvalidOperationException(*msg); +} + +void EmptyLocation::setStart(int int1) { + throw InvalidOperationException(*msg); +} + +int EmptyLocation::getLength() { + throw InvalidOperationException(*msg); +} + +} /* namespace tr */ diff --git a/src/utility/EmptyLocation.h b/src/utility/EmptyLocation.h new file mode 100644 index 0000000..4b0c6e9 --- /dev/null +++ b/src/utility/EmptyLocation.h @@ -0,0 +1,35 @@ +/* + * EmptyLocation.h + * + * Created on: Dec 28, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef EMPTYLOCATION_H_ +#define EMPTYLOCATION_H_ + +#include "ILocation.h" + +namespace utility { + +class EmptyLocation: public ILocation { +private: + string * msg; + static EmptyLocation * INSTANCE; + EmptyLocation(); + virtual ~EmptyLocation(); + +public: + virtual int getEnd() const; + virtual int getStart() const; + virtual void setEnd(int); + virtual void setStart(int); + virtual int getLength(); + virtual string toString(); + + static EmptyLocation * getInstance(); + +}; + +} /* namespace tr */ +#endif /* EMPTYLOCATION_H_ */ diff --git a/src/utility/GlobAlignE.cpp b/src/utility/GlobAlignE.cpp new file mode 100644 index 0000000..f94bfcd --- /dev/null +++ b/src/utility/GlobAlignE.cpp @@ -0,0 +1,317 @@ +/** + * Author: Joseph Valencia + * Modified by Benjamin James + * Date: 12/14/17 + * Bioinformatics Toolsmith Laboratory, University of Tulsa + * */ +#include +#include 
"../exception/InvalidStateException.h" +#include +#include +#include +#include +#include +#include +#include +#include "GlobAlignE.h" + +using namespace std; +using namespace utility; +using namespace exception; + +GlobAlignE::GlobAlignE(const char * seq1In, int start1In, int end1In, const char * seq2In, + int start2In, int end2In, int matchIn, int mismatchIn, int gapOpenIn, int gapContinueIn){ + + seq1 = seq1In; + start1 = start1In; + end1 = end1In; + + seq2 = seq2In; + start2 = start2In; + end2 = end2In; + + len1 = end1 - start1 + 2; + len2 = end2 - start2 + 2; + + //Incremental score storage + matches = new int[len1]; + upperGap = new int[len1]; + lowerGap = new int[len1]; + + + + //Incremental length storage + matchLen = new int[len1]; + upperLen = new int[len1]; + lowerLen = new int[len1]; + + //Incremental identity storage + matchId = new int[len1]; + upperId = new int[len1]; + lowerId = new int[len1]; + + match = matchIn; + mismatch = mismatchIn; + gapOpen = gapOpenIn; + gapContinue = gapContinueIn; + findAlignment(); + +} +/* +GlobAlignE::GlobAlignE(string filename1,string filename2, int matchIn, int mismatchIn, int gapOpenIn, int gapContinueIn):GlobAlignE(string1.c_str(),0,string.size(),string2.c_str(),0,string2.size(),matchIn,mismatchIn,gapOpenIn,gapContinueIn){ + + ifstream ifs; + + ifs.open (filename1, ifstream::in); + cout<<"FILE OPENED"<'){ + + while(c!='\n'){ + c = ifs.get(); + + } + } + + string string1 =""; + + while (ifs.good()) { + + + if (c!='\n'){ + string1+=c; + } + c = ifs.get(); + } + + ifs.close(); + + + ifstream ifs2; + + ifs2.open (filename2, ifstream::in); + + c = ifs2.get(); + + if(c == '>'){ + + while(c!='\n'){ + c = ifs2.get(); + } + } + + string string2 =""; + + while (ifs2.good()) { + + if(c!='\n'){ + string2+=c; + } + c = ifs2.get(); + } + + ifs2.close(); + + std::transform(string1.begin(),string1.end(),string1.begin(),::toupper); + std::transform(string2.begin(),string2.end(),string2.begin(),::toupper); + + // return 
GlobAlignE(string1.c_str(),0,string.size(),string2.c_str(),0,string2.size(),matchIn,mismatchIn,gapOpenIn,gapContinueIn); + +} +*/ +void GlobAlignE::findAlignment(){ + + int shorter = min(len2,len1)-1; + int lenDiff = abs(len2-len1); + int maxDiff=0; + + if (lenDiff >=1){ + maxDiff += -gapOpen- (lenDiff*gapContinue); + } + + maxDiff+= (mismatch* shorter)-1; + + const int negativeInf = maxDiff; + + matches[0]= 0; + upperGap[0] = negativeInf; + lowerGap[0] = negativeInf; + + matchLen[0] =0; + upperLen[0] =0; + lowerLen[0] =0; + + matchId[0] =0; + upperId[0] = 0; + lowerId[0] =0; + + //initial values + for (int i = 1; i + +using namespace std; + +namespace utility{ + +class GlobAlignE{ + +private: + const char * seq1; //first sequence to be aligned + int start1; + int end1; + const char * seq2;//second sequence to be aligned + int start2; + int end2; + int len1; + int len2; + int lenTotal; + int match; //score for base pair match + int mismatch;//score for base pair mismatch + int gapOpen; //cost to open a gap + int gapContinue; //cost to continue a gap + int * matches; + int * upperGap; + int * lowerGap; + int * matchLen; + int * upperLen; + int * lowerLen; + int * matchId; + int * upperId; + int * lowerId; + int alignmentScore; + int alignmentLength; + int totalMatches; + string topString; + string bottomString; +public: + GlobAlignE(const char*,int,int,const char *,int,int, int,int,int,int); + GlobAlignE(string,string,int,int,int,int); + virtual ~GlobAlignE(); + void findAlignment(); + double getIdentity(); + int getLength(); + void printAlignment(); + int getScore(); + int getLengthAlignment(); + +}; +} +#endif diff --git a/src/utility/ILocation.h b/src/utility/ILocation.h new file mode 100644 index 0000000..53f1ea6 --- /dev/null +++ b/src/utility/ILocation.h @@ -0,0 +1,29 @@ +/* + * ILocation.h + * + * Created on: Dec 20, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef ILOCATION_H_ +#define ILOCATION_H_ + +#include + +using namespace std; + +namespace 
utility { + +class ILocation { +public: + virtual int getEnd() const = 0; + virtual int getStart() const = 0; + virtual void setEnd(int) = 0; + virtual void setStart(int) = 0; + virtual int getLength() = 0; + virtual string toString() = 0; +}; + +} + +#endif /* ILOCATION_H_ */ diff --git a/src/utility/LCSLen.cpp b/src/utility/LCSLen.cpp new file mode 100644 index 0000000..76e08e8 --- /dev/null +++ b/src/utility/LCSLen.cpp @@ -0,0 +1,103 @@ +/* + * LCSLen.cpp + * + * Created on: Dec 6, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "LCSLen.h" +#include "../utility/Util.h" +#include "../exception/InvalidInputException.h" + +#include + +using namespace std; +using namespace exception; + +namespace utility { + +LCSLen::LCSLen(const char * seq1In, int start1In, int end1In, + const char * seq2In, int start2In, int end2In) { + seq1 = seq1In; + start1 = start1In; + end1 = end1In; + + seq2 = seq2In; + start2 = start2In; + end2 = end2In; + + if(start1 < 0 || end1 < 0 || start1 > end1){ + string msg("Invalid Input. Start1 is "); + msg.append(Util::int2string(start1)); + msg.append(". End 1 is "); + msg.append(Util::int2string(end1)); + msg.append("."); + throw InvalidInputException(msg); + } + + if(start2 < 0 || end2 < 0 || start2 > end2){ + string msg("Invalid Input. Start2 is "); + msg.append(Util::int2string(start2)); + msg.append(". 
End2 is "); + msg.append(Util::int2string(end2)); + msg.append("."); + throw InvalidInputException(msg); + } + + // Validate input + // cout << start1 << " " << end1 << endl; + // cout << start2 << " " << end2 << endl; + + + len1 = end1 - start1 + 2; + len2 = end2 - start2 + 2; + + lenTotal = 2 * len2; + cTable = new int[lenTotal]; + + for (int i = 0; i < lenTotal; i++) { + cTable[i] = 0; + } + + findLcs(); +} + +LCSLen::~LCSLen() { + delete[] cTable; +} + +void LCSLen::findLcs() { + int iM1Index = 0; + int iIndex = len2; + + for (int i = 1; i < len1; i++) { + char base1 = seq1[start1 + i - 1]; + + for (int j = 1; j < len2; j++) { + int ijIndex = iIndex + j; + if (base1 == seq2[start2 + j - 1]) { + cTable[ijIndex] = cTable[iM1Index + j - 1] + 1; + } else { + if (cTable[iM1Index + j] > cTable[iIndex + j - 1]) { + cTable[ijIndex] = cTable[iM1Index + j]; + } else { + cTable[ijIndex] = cTable[iIndex + j - 1]; + } + } + } + + if(i != len1-1){ + for(int h = 0; h < len2; h++){ + cTable[h] = cTable[len2+h]; + } + } + } + lenCS = cTable[lenTotal-1]; +} + +int LCSLen::getLenCS(){ + return lenCS; +} + +} +/* namespace utility */ diff --git a/src/utility/LCSLen.h b/src/utility/LCSLen.h new file mode 100644 index 0000000..98b9364 --- /dev/null +++ b/src/utility/LCSLen.h @@ -0,0 +1,37 @@ +/* + * LCSLen.h + * + * Created on: Dec 6, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef LCSLEN_H_ +#define LCSLEN_H_ + +namespace utility { + +class LCSLen { +private: + const char * seq1; + int start1; + int end1; + const char * seq2; + int start2; + int end2; + + int len1; + int len2; + int lenTotal; + int lenCS; + + int * cTable; + void findLcs(); + +public: + LCSLen(const char *, int, int, const char *, int, int); + virtual ~LCSLen(); + int getLenCS(); +}; + +} /* namespace utility */ +#endif /* LCSLEN_H_ */ diff --git a/src/utility/Location.cpp b/src/utility/Location.cpp new file mode 100644 index 0000000..7a39e03 --- /dev/null +++ b/src/utility/Location.cpp @@ -0,0 +1,74 @@ 
+/* + * Location.cpp + * + * Created on: Dec 19, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#include "Location.h" +#include "Util.h" +#include "../exception/InvalidInputException.h" + +using namespace exception; + +namespace utility { + +Location::Location(int startIn, int endIn) { + initialize(startIn, endIn); +} + +Location::Location(ILocation& cp) { + initialize(cp.getStart(), cp.getEnd()); +} + +void Location::initialize(int startIn, int endIn) { + start = startIn; + end = endIn; + check(); + +} + +void Location::check() { + if (start < 0 || end < 0 || start > end) { + string msg("Invalid Input. Start is "); + msg.append(Util::int2string(start)); + msg.append(". End is "); + msg.append(Util::int2string(end)); + msg.append("."); + throw InvalidInputException(msg); + } +} + +Location::~Location() { +} + +int Location::getEnd() const { + return end; +} + +int Location::getStart() const { + return start; +} + +void Location::setEnd(int endIn) { + end = endIn; + check(); +} + +void Location::setStart(int startIn) { + start = startIn; + check(); +} + +int Location::getLength() { + return end - start + 1; +} + +string Location::toString() { + string msg = (Util::int2string(start)); + msg.append("-"); + msg.append(Util::int2string(end)); + + return msg; +} +} diff --git a/src/utility/Location.h b/src/utility/Location.h new file mode 100644 index 0000000..042b2b9 --- /dev/null +++ b/src/utility/Location.h @@ -0,0 +1,41 @@ +/* + * Location.h + * + * Created on: Dec 19, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef LOCATION_H_ +#define LOCATION_H_ + +#include "ILocation.h" + +#include + +using namespace std; + +namespace utility { + +class Location : public ILocation{ +private: + int start; + int end; + void initialize(int, int); + void check(); + +public: + Location(int, int); + Location(ILocation&); + virtual ~Location(); + + int getEnd() const; + int getStart() const; + void setEnd(int); + void setStart(int); + int getLength(); + string toString(); 
+}; + +} + +#endif /* LOCATION_H_ */ diff --git a/src/utility/Util.cpp b/src/utility/Util.cpp new file mode 100644 index 0000000..4a6d4c1 --- /dev/null +++ b/src/utility/Util.cpp @@ -0,0 +1,347 @@ +/* + * Util.cpp + * + * Created on: Apr 24, 2012 + * Author: Hani Zakaria Girgis, PhD + * This class has a collection of utilities. + */ +#include "Util.h" + +Util::Util() { + // TODO Auto-generated constructor stub + +} + +Util::~Util() { + // TODO Auto-generated destructor stub +} + +string Util::fileSeparator("/"); + +//string * Util::emptyString = new string(""); + +void Util::readFasta(string seqFile, vector * infoList, + vector * seqList, bool canCheckFormat) { + ifstream in(seqFile.c_str()); + string info; + + bool isFirst = true; + string basePtr(""); + + while (in.good()) { + string line; + getline(in, line); + if (line[0] == '>') { + if (canCheckFormat) { + int colIndex = line.find_first_of(':'); + int dashIndex = line.find_first_of('-'); + if (colIndex < 0 || dashIndex < 0) { + string msg = + "The header must be in the following format: chromosome:start-end\n"; + msg += "The current input: " + line; + throw InvalidInputException(msg); + } + } + + infoList->push_back(line); + if (!isFirst) { + seqList->push_back(basePtr); + basePtr = string(""); + } else { + isFirst = false; + } + } else { + basePtr.append(line); + } + } + seqList->push_back(basePtr); + in.close(); + + // cout << "The system read " << infoList->size() << " sequences." << endl; + + // Post condition + if (infoList->size() != seqList->size()) { + cerr << "Error while reading the fasta input file. 
" + << "Header count = " << infoList->size() << " " + << "Sequence count = " << seqList->size() << endl; + exit(1); + } +} + +void Util::readFasta(string seqFile, vector * infoList, + vector * seqList) { + ifstream in(seqFile.c_str()); + string info; + + bool isFirst = true; + string * basePtr = new string(""); + while (in.good()) { + string line; + getline(in, line); + if (line[0] == '>') { + infoList->push_back(line); + if (!isFirst) { + seqList->push_back(*basePtr); + basePtr = new string(""); + } else { + isFirst = false; + } + } else { + basePtr->append(line); + } + } + seqList->push_back(*basePtr); + in.close(); + + // Post condition + if (infoList->size() != seqList->size()) { + cerr << "Error while reading the fasta input file. " + << "Header count = " << infoList->size() << " " + << "Sequence count = " << seqList->size() << endl; + exit(1); + } +} + +void Util::readCoordinates(string fileName, vector * coor) { + checkFile(fileName); + + ifstream in(fileName.c_str()); + string line; + + while (in >> line) { + int colIndex = line.find_first_of(':'); + int dashIndex = line.find_first_of('-'); + + int start = atoi(line.substr(colIndex + 1, dashIndex - colIndex - 1).c_str()); + int end = atoi(line.substr(dashIndex + 1).c_str()); + Location * loc = new Location(start, end); + coor->push_back(loc); + } + + //cout << "Read "; + //cout << coor->size() << endl; + + in.close(); +} + +void Util::readChromList(string genomeDir, vector * chromList, + string ext) { + // This function may not be platform-independent + // Credit: http://www.cplusplus.com/forum/beginner/9173/ + DIR * dirPtr = opendir(genomeDir.c_str()); + + struct dirent * entry; + entry = readdir(dirPtr); + while (entry) { + string file(entry->d_name); + // Credit: http://stackoverflow.com/questions/51949/how-to-get-file-extension-from-string-in-c + if (file.substr(file.find_last_of(".") + 1) == ext) { + chromList->push_back(genomeDir + fileSeparator + entry->d_name); + } + entry = readdir(dirPtr); + } + + 
closedir(dirPtr); +} + +// This method will modify the contents of its parameter basePtr! +void Util::toUpperCase(string * basePtr) { + string base = *basePtr; + // Convert alphabet to upper case + for (int i = 0; i < base.length(); i++) { + base[i] = toupper(base[i]); + } +} + +void Util::toUpperCase(string& base) { + // Convert alphabet to upper case + for (int i = 0; i < base.length(); i++) { + base[i] = toupper(base[i]); + } +} + +// credit: http://stackoverflow.com/questions/228005/alternative-to-itoa-for-converting-integer-to-string-c +string Util::int2string(int i) { + string s; + stringstream out; + out << i; + s = out.str(); + return s; +} + +// Need to use templates +string Util::double2string(double i) { + string s; + stringstream out; + out << i; + s = out.str(); + return s; +} + +string Util::long2string(long i) { + string s; + stringstream out; + out << i; + s = out.str(); + return s; +} + +void Util::checkFile(string fileName) { + ifstream f1(fileName.c_str()); + if (!f1) { + string message = string("ERROR: "); + message.append(fileName); + message.append(" does not exist.\n"); + throw FileDoesNotExistException(message); + } + f1.close(); +} + +void Util::deleteFile(string fileName) { + ifstream f1(fileName.c_str()); + if (f1) { + if (remove(fileName.c_str()) != 0) { + cerr << "Could not remove: " << fileName << endl; + } else { + cout << "Deleting: " << fileName << endl; + } + } else { + cerr << "Warning! 
This file does not exist: " << fileName << endl; + } + f1.close(); +} + +void Util::deleteFilesUnderDirectory(string dirName) { + // This function may not be platform-independent + // Credit: http://www.cplusplus.com/forum/beginner/9173/ + DIR * dirPtr = opendir(dirName.c_str()); + struct dirent * entry; + entry = readdir(dirPtr); + while (entry) { + string file(entry->d_name); + if (file.compare(string(".")) == 0 || file.compare(string("..")) == 0) { + // Skip current and parent directories + } else { + string url = dirName; + url.append(fileSeparator); + url.append(file); + deleteFile(url); + // cerr << "Deleting " << file << endl; + } + entry = readdir(dirPtr); + } + closedir(dirPtr); +} + +bool Util::isOverlapping(int s1, int e1, int s2, int e2) { + if (s1 > e1) { + string msg("Util::isOverlapping. Invalid Input. s1 is "); + msg.append(Util::int2string(s1)); + msg.append(". e1 is "); + msg.append(Util::int2string(e1)); + msg.append("."); + throw InvalidInputException(msg); + } + + if (s2 > e2) { + string msg("Util::isOverlapping. Invalid Input. s2 is "); + msg.append(Util::int2string(s2)); + msg.append(". e2 is "); + msg.append(Util::int2string(e2)); + msg.append("."); + throw InvalidInputException(msg); + } + + bool isStartWithin = s2 >= s1 && s2 <= e1; + bool isEndWithin = e2 >= s1 && e2 <= e1; + bool isIncluding = s2 >= s1 && e2 <= e1; + bool isIncluded = s1 >= s2 && e1 <= e2; + bool isAdjacent = (e1 == (s2 + 1)) || (e2 == (s1 + 1)); + + return (isStartWithin || isEndWithin || isIncluding || isIncluded + || isAdjacent); +} + +/** + * The input string is s. + * The reverse complement is rc. + * The start, and the end are inclusive. 
+ */ +void Util::revCompDig(const char * s, int start, int end, string * rc) { + for (int i = end; i >= start; i--) { + char b = s[i]; + switch (b) { + case 0: + rc->append(1, 3); + break; + case 3: + rc->append(1, 0); + break; + case 1: + rc->append(1, 2); + break; + case 2: + rc->append(1, 1); + break; + default: + string msg("Valid codes are 0-3. The invalid code is "); + msg.append(1, b); + throw InvalidInputException(msg); + } + } +} + +void Util::revCompDig(string * s, string * rc) { + revCompDig(s->c_str(), 0, s->size() - 1, rc); + + /* + int len = s->size(); + for (int i = len - 1; i >= 0; i--) { + char b = s->at(i); + switch (b) { + case 0: + rc->append(1, 3); + break; + case 3: + rc->append(1, 0); + break; + case 1: + rc->append(1, 2); + break; + case 2: + rc->append(1, 1); + break; + default: + string msg("Valid codes are 0-3. The invalid code is "); + msg.append(1, b); + throw InvalidInputException(msg); + } + } + */ +} + +void Util::writeFasta(const string& sequence, const string& header, + const string& outputFile) { + ofstream outMask; + outMask.open(outputFile.c_str(), ios::out); + outMask << header << endl; + int step = 50; + int len = sequence.size(); + for (int i = 0; i < len; i = i + step) { + int e = (i + step - 1 > len - 1) ? 
len - 1 : i + step - 1; + for (int k = i; k <= e; k++) { + outMask << sequence[k]; + } + outMask << endl; + } + outMask.close(); +} + +int Util::sumTotalLength(const vector * list) { + int size = list->size(); + int sum = 0; + for (int i = 0; i < size; i++) { + sum += list->at(i)->getLength(); + } + return sum; +} diff --git a/src/utility/Util.h b/src/utility/Util.h new file mode 100644 index 0000000..a9ed695 --- /dev/null +++ b/src/utility/Util.h @@ -0,0 +1,79 @@ +/* + * Util.h + * + * Created on: Apr 24, 2012 + * Author: Hani Zakaria Girgis, PhD + */ + +#ifndef UTIL_H_ +#define UTIL_H_ + +#include "Location.h" +#include "../exception/FileDoesNotExistException.h" +#include "../exception/InvalidInputException.h" + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace utility; +using namespace exception; + +namespace utility { +class Util { +private: + Util(); + ~Util(); + +public: + static string * emptyString; + static string fileSeparator; + static void readFasta(string, vector *, vector *, bool); + static void readFasta(string, vector *, vector *); + static void readCoordinates(string, vector *); + static void readChromList(string, vector *, string); + static void toUpperCase(string*); + static void toUpperCase(string&); + static string int2string(int); + static string double2string(double); + static string long2string(long); + static void deleteFile(string); + static void deleteFilesUnderDirectory(string); + static void checkFile(string); + static bool isOverlapping(int, int, int, int); + static void revCompDig(string *, string *); + static void revCompDig(const char* sequence, int, int, string *); + + static void writeFasta(const string&, const string&, const string&); + + static int sumTotalLength(const vector *); + + /** + * Delete the objects pointed to by pointers in a vector. + * It does not delete the vector itself. 
+ * + * Credit: http://stackoverflow.com/questions/594089/does-stdvector-clear-do-delete-free-memory-on-each-element + */ + template + static void deleteInVector(vector * deleteMe) { + while (!deleteMe->empty()) { + delete deleteMe->back(); + deleteMe->pop_back(); + } + + // Set the size to zero + deleteMe->clear(); + + // Set the capacity to zero + vector empty; + deleteMe->swap(empty); + } +}; +} + +#endif /* UTIL_H_ */