diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e660fd9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +bin/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..37abd71 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,111 @@ +cmake_minimum_required (VERSION 3.1) +project (MeshClust2) + +include_directories(src/exception src/nonltr src/utility src/cluster src/predict src/clutil src/fastcar) +set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin) +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}) + + +add_library(Fastcar + ${CMAKE_SOURCE_DIR}/src/fastcar/FC_Runner.cpp +) + +add_library(ClusterUtil + ${CMAKE_SOURCE_DIR}/src/clutil/DivergencePoint.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Histogram.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Loader.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/SingleFileLoader.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Progress.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Datatype.cpp + ${CMAKE_SOURCE_DIR}/src/clutil/Clock.cpp +) + +add_library(Predict + ${CMAKE_SOURCE_DIR}/src/predict/Feature.cpp + ${CMAKE_SOURCE_DIR}/src/predict/GLM.cpp + ${CMAKE_SOURCE_DIR}/src/predict/HandleSeq.cpp + ${CMAKE_SOURCE_DIR}/src/predict/Matrix.cpp + ${CMAKE_SOURCE_DIR}/src/predict/MultiMute.cpp + ${CMAKE_SOURCE_DIR}/src/predict/Predictor.cpp + ${CMAKE_SOURCE_DIR}/src/predict/SingMute.cpp + ${CMAKE_SOURCE_DIR}/src/predict/FeatureSelector.cpp + ${CMAKE_SOURCE_DIR}/src/predict/GreedySelector.cpp + ${CMAKE_SOURCE_DIR}/src/predict/BestFirstSelector.cpp +) + +add_library(Cluster + ${CMAKE_SOURCE_DIR}/src/cluster/ClusterFactory.cpp + ${CMAKE_SOURCE_DIR}/src/cluster/CRunner.cpp + ${CMAKE_SOURCE_DIR}/src/cluster/Trainer.cpp + ${CMAKE_SOURCE_DIR}/src/cluster/bvec.cpp + ${CMAKE_SOURCE_DIR}/src/cluster/bvec_iterator.cpp + +) + +add_library(Exception + ${CMAKE_SOURCE_DIR}/src/exception/FileDoesNotExistException.cpp + ${CMAKE_SOURCE_DIR}/src/exception/InvalidInputException.cpp + 
${CMAKE_SOURCE_DIR}/src/exception/InvalidOperationException.cpp + ${CMAKE_SOURCE_DIR}/src/exception/InvalidOrderOfOperationsException.cpp + ${CMAKE_SOURCE_DIR}/src/exception/InvalidScoreException.cpp + ${CMAKE_SOURCE_DIR}/src/exception/InvalidStateException.cpp +) + +add_library(Nonltr + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromDetectorMaxima.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromListMaker.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/Chromosome.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigit.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitDna.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitProtein.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeRandom.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/DetectorMaxima.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/HMM.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/LocationList.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/LocationListCollection.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/Scanner.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/Scorer.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.cpp + ${CMAKE_SOURCE_DIR}/src/nonltr/Trainer.cpp +) + +add_library(Utility + ${CMAKE_SOURCE_DIR}/src/utility/EmptyLocation.cpp + ${CMAKE_SOURCE_DIR}/src/utility/GlobAlignE.cpp + ${CMAKE_SOURCE_DIR}/src/utility/Location.cpp + ${CMAKE_SOURCE_DIR}/src/utility/Util.cpp +) + +target_include_directories(Exception PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Nonltr PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Utility PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Cluster PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Fastcar PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(ClusterUtil PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(Predict PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +set (HEADER_FILES + ${CMAKE_SOURCE_DIR}/src/nonltr/KmerHashTable.h + ${CMAKE_SOURCE_DIR}/src/nonltr/EnrichmentMarkovView.h + ${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.h +) + +set (CMAKE_CXX_COMPILER 
g++) +set (CMAKE_CXX_STANDARD 11) +set (CMAKE_CXX_FLAGS "-fopenmp -g -O3 -march=native") + +target_compile_definitions(Cluster PRIVATE VERSION="2.3.0") +target_compile_definitions(Fastcar PRIVATE VERSION="0.7.1") + +add_executable(Red ${CMAKE_SOURCE_DIR}/src/nonltr/RepeatsDetector.cpp ) +add_executable(meshclust2 ${CMAKE_SOURCE_DIR}/src/cluster/meshclust2.cpp) +add_executable(fastcar ${CMAKE_SOURCE_DIR}/src/fastcar/fastcar.cpp) + +target_link_libraries(Red Exception Nonltr Utility ${HEADER_FILES}) +target_link_libraries(Utility Exception ${HEADER_FILES}) +target_link_libraries(Nonltr Utility Exception ${HEADER_FILES}) +target_link_libraries(ClusterUtil Nonltr ${HEADER_FILES}) +target_link_libraries(Predict ClusterUtil Nonltr ${HEADER_FILES}) +target_link_libraries(meshclust2 Cluster Nonltr ClusterUtil Predict ${HEADER_FILES}) +target_link_libraries(fastcar Nonltr ClusterUtil Fastcar Predict ${HEADER_FILES}) diff --git a/Makefile b/Makefile deleted file mode 100644 index 2e611c1..0000000 --- a/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -all: bin/Red.o bin/meshclust2 - -bin/Red.o: - mkdir -p bin - mkdir -p bin/exception - mkdir -p bin/nonltr - mkdir -p bin/utility - $(MAKE) -C src -bin/meshclust2: bin/Red.o - $(MAKE) -C src/cluster - cp src/cluster/meshclust2 bin - -clean: - $(MAKE) clean -C src - $(MAKE) clean -C src/cluster - $(RM) -r bin - -rebuild: clean all -.PHONY: all clean diff --git a/README b/README deleted file mode 100644 index 1388af3..0000000 --- a/README +++ /dev/null @@ -1,85 +0,0 @@ -MeShClust2 -Release version - -Requirements: g++ 4.9.1 or later, requires Homebrew on Mac OS X - -Compilation using g++ (homebrew) and GNU Make on Mac OS X -CXX=g++-7 make - -see: https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite - -Linux/Unix compilation: -make - - -If you find this tool helpful, please cite: - -James, Benjamin T. et al. 
(2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278. - - -Usage: bin/meshclust2 --id 0.x [OPTIONS] *.fasta - ---id The most important parameter, --id, controls the identity cutoff of the sequences. - Needs to be between 0 and 1. - If it is not specified, an identity of 0.9 is used. - ---kmer decides the size of the kmers. It is by default automatically decided by average sequence - length, but if provided, MeShClust can speed up a little by not having to find the largest - sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. - ---mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} - changes the mutation generation algorithm. By default, "single" is used, utilizing only - single point mutations. On low identity data sets, "both", which includes single mutations - and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations, - disallowing single point mutations. Other options include "all", which includes single, - block, and nontypical mutations translocation and reversion. - ---feat determines the combinations of features to be used. By default, "slow" allows 11 - combinations to be selected from. "fast" removes 2 slower features from "slow" - which include logarithm based features, and "extraslow" includes 33 total features - used in a previous study. - ---min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs - will be used. Recall that features include pairwise combinations of the "feat" option. - ---max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly, - so a very large maximum is not advised. - ---sample selects the total number of sequences used for both training and testing. - 300 is the default value. Each sequence generates 10 synthetic mutants. 
- That is, --sample 300 provides 3000 training pairs and 3000 testing pairs. - ---min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. - Shouldn't need to be set normally, as lower identites take much longer, - especially with single mutations only. - ---threads sets the number of threads to be used. By default OpenMP uses the number of available cores - on your machine, but this parameter overwrites that. - ---output specifies the output file, in CD-HIT's CLSTR format, described below: - A '>Cluster ' followed by an increasing index designates a cluster. - Otherwise, the sequence is printed out. - A '*' at the end of a sequence designates the center of the cluster. - An example of a small data set: - >Cluster 0 - 0 993nt, >seq128 template_6... * - >Cluster 1 - 0 1043nt, >seq235 template_10... - 1 1000nt, >seq216 template_10... * - 2 1015nt, >seq237 template_10... - ---delta decides how many clusters are looked around in the final clustering stage. - Increasing it creates more accuracy, but takes more time. Default value is 5. - ---iterations specifies how many iterations in the final stage of merging are done until convergence. - Default value is 15. - - - -If the argument is not listed here, it is interpreted as an input (FASTA format) file. - - -License - -Academic use: The software is provided as-is under the GNU GPLv3. -Any restrictions to use for-profit or non-academics: License needed. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e099f63 --- /dev/null +++ b/README.md @@ -0,0 +1,113 @@ +## MeShClust2 +Release version - 2.3.0 + +### Requirements +g++ 4.9.1 or later, requires Homebrew on Mac OS X +Compilation using g++ (homebrew) and CMake on Mac OS X see [this link](https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite) + +### Linux/Unix compilation +> mkdir bin && cd bin +> cmake .. 
+> make + +### Citation +If you find this tool helpful, please cite: + +[James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.](https://doi.org/10.1101/451278) + +### Usage + + Usage: meshclust2 --id 0.x [OPTIONS] *.fasta + + --id The most important parameter, --id, controls the identity cutoff of the sequences. + Needs to be between 0 and 1. + If it is not specified, an identity of 0.9 is used. + + --kmer decides the size of the kmers. It is by default automatically decided by average sequence + length, but if provided, MeShClust can speed up a little by not having to find the largest + sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. + + --dump Run until the classifier is trained, and then dump the weights to the file, + default 'weights.txt'. Can be used with --recover to recover the weights + instead of re-training. + + --recover Recover weights for the classifier trained by a previous run which used --dump to dump + the weights. + + --list Instead of specifying files as extra arguments, provide a text file with + a list of files. Can use pipes or process substitutions such as "--list <(ls *.fasta) " + + --no-train-list Same as --list, but these files are not passed to the classifier, + e.g. unassembled genomes + + --mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} + changes the mutation generation algorithm. By default, "both" is used, utilizing + single point and block mutations. On higher identity data sets, "single", which includes only single point mutations, + is preferable. The option "nonsingle-typical" uses only block mutations, + disallowing single point mutations. Other options include "all", which includes single, + block, and nontypical mutations translocation and reversion. + + --feat determines the combinations of features to be used. 
By default, "slow" allows 11 + combinations to be selected from. "fast" removes 2 slower features from "slow" + which include logarithm based features. + + --single-file Using this option, (no value is needed), each file is treated as a single sequence. + If multiple sequences in a file are encountered, they are joined with 50 Ns, + and the k-mers are not counted in that region. + However, to be most accurate, it is advised to not use these sequences in the + training step (for mutations) and instead 1) train using un-joined sequences and + use --dump to dump to a file, and 2) use --recover with --single-file for the + file list. + + --sample selects the total number of sequences used for both training and testing. + 2000 is the default value. That is, --sample 2000 provides 2000 training + pairs and 2000 testing pairs. + + --num-templates selects the number of "template" sequences from which to mutate. + For example, if 300 (the default) templates are requested, and the number of + "samples" is requested to be 2000 (the default), 300 sequences will be read in + and mutated 2000/300 times each to create 2000 semi-synthetic pairs. + + --min-feat (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs + will be used. Recall that features include pairwise combinations of the "feat" option. + + --max-feat (default 4) sets the maximum feature pairs to be used. Diminishing returns appears quickly, + so a very large maximum (>10) is not advised. + + --min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. + Shouldn't need to be set normally, as lower identities take much longer, + especially with single mutations only. + + --datatype (8,16,32,64) Decides the integer size of the histograms. If not provided, + all sequences are read in and counted to ensure the largest k-mer does not + overflow. If the provided k-mer is too small, it will overflow. + + --threads sets the number of threads to be used. 
By default OpenMP uses the number of available cores + on your machine, but this parameter overwrites that. + + --output specifies the output file, in CD-HIT's CLSTR format, described below: + A '>Cluster ' followed by an increasing index designates a cluster. + Otherwise, the sequence is printed out. + A '*' at the end of a sequence designates the center of the cluster. + An example of a small data set: + + >Cluster 0 + 0 993nt, >seq128 template_6... * + >Cluster 1 + 0 1043nt, >seq235 template_10... + 1 1000nt, >seq216 template_10... * + 2 1015nt, >seq237 template_10... + + --delta decides how many clusters are looked around in the final clustering stage. + Increasing it creates more accuracy, but takes more time. Default value is 5. + + --iterations specifies how many iterations in the final stage of merging are done until convergence. + Default value is 15. + + If the argument is not listed here, it is interpreted as an input (FASTA format) file. + + +### License + +Academic use: The software is provided as-is under the GNU GPLv3. +Any restrictions to use for-profit or non-academics: License needed. 
diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index 3013ed0..0000000 --- a/src/Makefile +++ /dev/null @@ -1,175 +0,0 @@ -# CXX = /usr/bin/c++ -CXX ?= g++ - -CXXFLAGS = -O3 -g -fmessage-length=0 -Wall -march=native -std=c++11 - -# -# Objects -# - -ORed = ../bin/Red.o - -# Exception -OInvalidInputException = ../bin/exception/InvalidInputException.o -OInvalidStateException = ../bin/exception/InvalidStateException.o -OFileDoesNotExistException = ../bin/exception/FileDoesNotExistException.o -OInvalidOrderOfOperationsException = ../bin/exception/InvalidOrderOfOperationsException.o -OInvalidScoreException = ../bin/exception/InvalidScoreException.o -OInvalidOperationException = ../bin/exception/InvalidOperationException.o - -# Utility -OUtil = ../bin/utility/Util.o -OLocation = ../bin/utility/Location.o -OEmptyLocation = ../bin/utility/EmptyLocation.o -OLCSLen = ../bin/utility/LCSLen.o -OAffineId = ../bin/utility/AffineId.o -OGlobAlignE = ../bin/utility/GlobAlignE.o - -# Non TR -OChromosome = ../bin/nonltr/Chromosome.o -OChromosomeOneDigit = ../bin/nonltr/ChromosomeOneDigit.o -OChromosomeRandom = ../bin/nonltr/ChromosomeRandom.o -OChromListMaker = ../bin/nonltr/ChromListMaker.o -OTableBuilder = ../bin/nonltr/TableBuilder.o -OScorer = ../bin/nonltr/Scorer.o -ODetectorMaxima = ../bin/nonltr/DetectorMaxima.o -OChromDetectorMaxima = ../bin/nonltr/ChromDetectorMaxima.o -OHMM = ../bin/nonltr/HMM.o -OScanner = ../bin/nonltr/Scanner.o -OTrainer = ../bin/nonltr/Trainer.o -OLocationList = ../bin/nonltr/LocationList.o -OLocationListCollection = ../bin/nonltr/LocationListCollection.o - -OBJS = $(ORed) $(OInvalidInputException) $(OInvalidStateException) $(OFileDoesNotExistException) $(OInvalidOrderOfOperationsException) $(OInvalidOperationException) $(OInvalidScoreException) $(OUtil) $(OLocation) $(OEmptyLocation) $(OChromosome) $(OChromosomeOneDigit) $(OChromosomeRandom) $(OChromListMaker) $(OTableBuilder) $(OScorer) $(ODetectorMaxima) $(OChromDetector) 
$(OChromDetectorMaxima) $(OHMM) $(OScanner) $(OTrainer) $(OLocationList) $(OLocationListCollection) $(OLCSLen) $(OAffineId) $(OGlobAlignE) - -# -# Target -# - -TRed = ../bin/Red - -# -# Make RepeatsDetector -# - -$(TRed): $(OBJS) - $(CXX) -o $(TRed) $(OBJS) - -# -# RepeatsDetector -# - -$(ORed): RepeatsDetector.cpp nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/TableBuilder.h nonltr/HMM.h nonltr/Scanner.h nonltr/Trainer.h utility/Util.h - $(CXX) $(CXXFLAGS) -c RepeatsDetector.cpp -o $(ORed) - -# -# Exception -# -$(OInvalidInputException): exception/InvalidInputException.cpp exception/InvalidInputException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidInputException.cpp -o $(OInvalidInputException) - -$(OInvalidStateException): exception/InvalidStateException.cpp exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidStateException.cpp -o $(OInvalidStateException) - -$(OFileDoesNotExistException): exception/FileDoesNotExistException.cpp exception/FileDoesNotExistException.h - $(CXX) $(CXXFLAGS) -c exception/FileDoesNotExistException.cpp -o $(OFileDoesNotExistException) - -$(OInvalidOrderOfOperationsException): exception/InvalidOrderOfOperationsException.cpp exception/InvalidOrderOfOperationsException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidOrderOfOperationsException.cpp -o $(OInvalidOrderOfOperationsException) - -$(OInvalidScoreException): exception/InvalidScoreException.cpp exception/InvalidScoreException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidScoreException.cpp -o $(OInvalidScoreException) - -$(OInvalidOperationException): exception/InvalidOperationException.cpp exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c exception/InvalidOperationException.cpp -o $(OInvalidOperationException) - -# -# Utility -# - -$(OUtil): utility/Util.cpp utility/Util.h utility/Location.h exception/FileDoesNotExistException.h - $(CXX) $(CXXFLAGS) -c utility/Util.cpp -o $(OUtil) - -$(OLocation): utility/Location.cpp utility/Location.h 
utility/ILocation.h exception/InvalidInputException.h utility/Util.h - $(CXX) $(CXXFLAGS) -c utility/Location.cpp -o $(OLocation) - -$(OEmptyLocation): utility/EmptyLocation.cpp utility/EmptyLocation.h utility/ILocation.h exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c utility/EmptyLocation.cpp -o $(OEmptyLocation) - -$(OLCSLen): utility/LCSLen.cpp utility/LCSLen.h - $(CXX) $(CXXFLAGS) -c utility/LCSLen.cpp -o $(OLCSLen) - -$(OAffineId): utility/AffineId.cpp utility/AffineId.h - $(CXX) $(CXXFLAGS) -c utility/AffineId.cpp -o $(OAffineId) - -$(OGlobAlignE): utility/GlobAlignE.cpp utility/GlobAlignE.h - $(CXX) $(CXXFLAGS) -c utility/GlobAlignE.cpp -o $(OGlobAlignE) -# -# Non LTR -# - -$(OChromosome): nonltr/Chromosome.cpp nonltr/Chromosome.h nonltr/IChromosome.h utility/Util.h exception/InvalidInputException.h exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c nonltr/Chromosome.cpp -o $(OChromosome) - -$(OChromosomeOneDigit): nonltr/ChromosomeOneDigit.cpp nonltr/ChromosomeOneDigit.h nonltr/Chromosome.h exception/InvalidInputException.h - $(CXX) $(CXXFLAGS) -c nonltr/ChromosomeOneDigit.cpp -o $(OChromosomeOneDigit) - -$(OChromosomeRandom): nonltr/ChromosomeRandom.cpp nonltr/ChromosomeRandom.h nonltr/IChromosome.h exception/InvalidInputException.h exception/InvalidStateException.h utility/Util.h - $(CXX) $(CXXFLAGS) -c nonltr/ChromosomeRandom.cpp -o $(OChromosomeRandom) - -$(OTableBuilder): nonltr/TableBuilder.cpp nonltr/TableBuilder.h utility/Util.h nonltr/ChromosomeOneDigit.h nonltr/ITableView.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/EnrichmentMarkovView.h nonltr/EnrichmentMarkovView.cpp exception/InvalidStateException.h nonltr/ChromListMaker.h nonltr/IChromosome.h - $(CXX) $(CXXFLAGS) -c nonltr/TableBuilder.cpp -o $(OTableBuilder) - -$(OScorer): nonltr/Scorer.cpp nonltr/Scorer.h nonltr/ChromosomeOneDigit.h utility/Util.h exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c nonltr/Scorer.cpp -o $(OScorer) - 
-$(ODetectorMaxima): nonltr/DetectorMaxima.cpp nonltr/DetectorMaxima.h utility/ILocation.h exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c nonltr/DetectorMaxima.cpp -o $(ODetectorMaxima) - -$(OChromDetectorMaxima): nonltr/ChromDetectorMaxima.cpp nonltr/ChromDetectorMaxima.h nonltr/DetectorMaxima.h nonltr/ChromosomeOneDigit.h utility/Util.h utility/ILocation.h utility/Location.h - $(CXX) $(CXXFLAGS) -c nonltr/ChromDetectorMaxima.cpp -o $(OChromDetectorMaxima) - -$(OHMM): nonltr/HMM.cpp nonltr/HMM.h utility/ILocation.h exception/InvalidStateException.h exception/InvalidInputException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c nonltr/HMM.cpp -o $(OHMM) - -$(OScanner): nonltr/Scanner.cpp nonltr/Scanner.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h nonltr/HMM.h nonltr/ITableView.h nonltr/Scorer.h utility/Util.h utility/ILocation.h exception/InvalidInputException.h exception/InvalidStateException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h - $(CXX) $(CXXFLAGS) -c nonltr/Scanner.cpp -o $(OScanner) - -$(OTrainer): nonltr/Trainer.cpp nonltr/Trainer.h nonltr/TableBuilder.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/HMM.h nonltr/ChromDetectorMaxima.h nonltr/Scorer.h nonltr/ChromListMaker.h utility/Util.h nonltr/LocationListCollection.h - $(CXX) $(CXXFLAGS) -c nonltr/Trainer.cpp -o $(OTrainer) - -$(OChromListMaker): nonltr/ChromListMaker.cpp nonltr/ChromListMaker.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h utility/Util.h - $(CXX) $(CXXFLAGS) -c nonltr/ChromListMaker.cpp -o $(OChromListMaker) - -$(OCluster): nonltr/Cluster.cpp nonltr/Cluster.h utility/Util.h exception/InvalidStateException.h exception/InvalidInputException.h - $(CXX) $(CXXFLAGS) -c nonltr/Cluster.cpp -o $(OCluster) - -$(OLocationList): nonltr/LocationList.cpp nonltr/LocationList.h utility/ILocation.h utility/Location.h exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c 
nonltr/LocationList.cpp -o $(OLocationList) - -$(OLocationListCollection): nonltr/LocationListCollection.cpp nonltr/LocationListCollection.h utility/Location.h exception/InvalidStateException.h - $(CXX) $(CXXFLAGS) -c nonltr/LocationListCollection.cpp -o $(OLocationListCollection) - - -# -# Make binary directories -# - -red: $(TRed) - -# -# Make Red -# - -bin: - mkdir ../bin - mkdir ../bin/exception - mkdir ../bin/utility - mkdir ../bin/nonltr - -# -# Make clean -# - -clean: - rm -f ../bin/*.o ../bin/exception/*.o ../bin/ms/*.o ../bin/nonltr/*.o ../bin/test/*.o ../bin/utility/*.o ../bin/tr/*.o *.o $(TRed) diff --git a/src/cluster/src/Runner.cpp b/src/cluster/CRunner.cpp similarity index 52% rename from src/cluster/src/Runner.cpp rename to src/cluster/CRunner.cpp index 8a367d7..16514fd 100644 --- a/src/cluster/src/Runner.cpp +++ b/src/cluster/CRunner.cpp @@ -9,77 +9,117 @@ #include #include #include -#include "../../nonltr/ChromListMaker.h" -#include "../../utility/AffineId.h" -#include "Runner.h" +#include "../nonltr/ChromListMaker.h" +#include "../clutil/Datatype.h" +#include "../clutil/Loader.h" +#include "../clutil/Clock.h" +#include "CRunner.h" #include "Trainer.h" #include "ClusterFactory.h" #include "bvec.h" -#include "Progress.h" +#include "../clutil/Progress.h" +#include "../predict/Predictor.h" #ifdef _OPENMP #include #endif + +/* + * Constructor for runner. + * Gets the options + * If --recover is passed, set the K using that + * Otherwise, if the K wasn't set, find the K by iterating through all sequences + */ Runner::Runner(int argc, char **argv) { get_opts(argc, argv); - if (k == -1) { - auto pr = find_k(); - k = pr.first; - } - // if (similarity < 0.6) { - // align = true; - // } - if (sample_size == 0) { - if (similarity < 0.6) { - sample_size = 1000; - } else { - sample_size = 300; - } + if (pred64 != NULL) { + k = pred64->get_k(); + } else if (k == -1) { + k = find_k(); } srand(10); } + +/* + * Main entry point for MeShClust2. 
+ * Sets datatype and identity if --recover was used + * If datatype wasn't set, run through and find the max histogram + * Based on the max histogram, call do_run with the smallest possible + * histogram that will fit all sequences + */ int Runner::run() { - largest_count = 0; - Progress progress(files.size(), "Reading in sequences"); - for (auto i = 0; i < files.size(); i++) { - auto f = files.at(i); - ChromListMaker maker(f); - auto chromList = maker.makeChromOneDigitList(); - - progress++; + if (pred64 != NULL) { + Datatype::set(pred64->get_datatype()); + similarity = pred64->get_id(); + } else if (Datatype::get() == "") { + largest_count = 0; + Progress progress(all_files.size(), "Reading in sequences"); +#pragma omp parallel for + for (auto i = 0; i < all_files.size(); i++) { + auto f = all_files.at(i); + ChromListMaker maker(f, is_single_file); + auto chromList = maker.makeChromOneDigitDnaList(); + + // cout << "Reading in sequences from " << f << "..." << endl; - uint64_t local_largest_count = 0; -#pragma omp parallel for reduction(max:local_largest_count) - for (int i = 0; i < chromList->size(); i++) { - std::vector values; - KmerHashTable table(k, 1); - ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(i)); - fill_table(table, chrom, values); - uint64_t l_count = *std::max_element(values.begin(), values.end()); - if (l_count > local_largest_count) { - local_largest_count = l_count; + uint64_t local_largest_count = 0; +//#pragma omp parallel for reduction(max:local_largest_count) + for (int i = 0; i < chromList->size(); i++) { + std::vector values; + KmerHashTable table(k, 1); + ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(i)); + fill_table(table, chrom, values); + uint64_t l_count = *std::max_element(values.begin(), values.end()); + if (l_count > local_largest_count) { + local_largest_count = l_count; + // #pragma omp critical + // cout << "local largest count reset to " << local_largest_count << endl; + } + } + + #pragma omp critical + { + 
if (local_largest_count > largest_count) { + largest_count = local_largest_count; + // #pragma omp critical + // cout << "largest count updated to " << largest_count << endl; + } + progress++; } } - if (local_largest_count > largest_count) { - largest_count = local_largest_count; - } + progress.end(); + cout << "Largest count: " << largest_count << endl; } - progress.end(); - + if (Datatype::get() != "") { + std::string type = Datatype::get(); + if (type == "uint8_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint16_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint32_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint64_t") { + largest_count = std::numeric_limits::max(); + } + } if (largest_count <= std::numeric_limits::max()) { cout << "Using 8 bit histograms" << endl; + Datatype::set("uint8_t"); return do_run(); } else if (largest_count <= std::numeric_limits::max()) { cout << "Using 16 bit histograms" << endl; + Datatype::set("uint16_t"); return do_run(); } else if (largest_count <= std::numeric_limits::max()){ cout << "Using 32 bit histograms" << endl; + Datatype::set("uint32_t"); return do_run(); } else if (largest_count <= std::numeric_limits::max()) { cout << "Using 64 bit histograms" << endl; + Datatype::set("uint64_t"); return do_run(); } else { throw "Too big sequence"; @@ -108,32 +148,61 @@ void usage(std::string progname) length, but if provided, MeShClust can speed up a little by not having to find the largest sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. +--dump Run until the classifier is trained, and then dump the weights to the file, + default 'weights.txt'. Can be used with --recover to recover the weights + instead of re-training. + +--recover Recover weights for the classifier trained by a previous run which used --dump to dump + the weights. 
+ +--list Instead of specifying files as extra arguments, provide a text file with + a list of files. Can use pipes or process substitutions such as "--list <(ls *.fasta) " + +--no-train-list Same as --list, but these files are not passed to the classifier, + e.g. unassembled genomes + --mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} - changes the mutation generation algorithm. By default, "single" is used, utilizing only - single point mutations. On low identity data sets, "both", which includes single mutations - and block mutations, is preferable. The option "nonsingle-typical" uses only block mutations, + changes the mutation generation algorithm. By default, "both" is used, utilizing + single point and block mutations. On higher identity data sets, "single", which includes only single point mutations, + is preferable. The option "nonsingle-typical" uses only block mutations, disallowing single point mutations. Other options include "all", which includes single, block, and nontypical mutations translocation and reversion. --feat determines the combinations of features to be used. By default, "slow" allows 11 combinations to be selected from. "fast" removes 2 slower features from "slow" - which include logarithm based features, and "extraslow" includes 33 total features - used in a previous study. - ---min-feat (default 3) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs - will be used. Recall that features include pairwise combinations of the "feat" option. + which include logarithm based features. ---max-feat (default 5) sets the maximum feature pairs to be used. Diminishing returns appears quickly, - so a very large maximum is not advised. +--single-file Using this option, (no value is needed), each file is treated as a single sequence. + If multiple sequences in a file are encountered, they are joined with 50 Ns, + and the k-mers are not counted in that region. 
+ However, to be most accurate, it is advised to not use these sequences in the + training step (for mutations) and instead 1) train using un-joined sequences and + use --dump to dump to a file, and 2) use --recover with --single-file for the + file list. --sample selects the total number of sequences used for both training and testing. - 300 is the default value. Each sequence generates 10 synthetic mutants. - That is, --sample 300 provides 3000 training pairs and 3000 testing pairs. + 2000 is the default value. That is, --sample 2000 provides 2000 training + pairs and 2000 testing pairs. + +--num-templates selects the number of "template" sequences from which to mutate. + For example, if 300 (the default) templates are requested, and the number of + "samples" is requested to be 2000 (the default), 300 sequences will be read in + and mutated 2000/300 times each to create 2000 semi-synthetic pairs. + +--min-feat (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs + will be used. Recall that features include pairwise combinations of the "feat" option. + +--max-feat (default 4) sets the maximum feature pairs to be used. Diminishing returns appears quickly, + so a very large maximum (>10) is not advised. --min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. Shouldn't need to be set normally, as lower identites take much longer, especially with single mutations only. +--datatype (8,16,32,64) Decides the integer size of the histograms. If not provided, + all sequences are read in and counted to ensure the largest k-mer does not + overflow. If the provided k-mer is too small, it will overflow. + --threads sets the number of threads to be used. By default OpenMP uses the number of available cores on your machine, but this parameter overwrites that. 
@@ -187,6 +256,45 @@ void Runner::get_opts(int argc, char **argv) exit(EXIT_FAILURE); } i++; + } else if (arg == "--single-file") { + is_single_file = true; + } else if ((arg == "--list" || arg == "-l") && i + 1 < argc) { + std::ifstream in(argv[++i]); + std::string line; + while (getline(in, line)) { + files.push_back(line); + } + } else if ((arg == "--no-train-list" || arg == "--notrain-list") && i + 1 < argc) { + std::ifstream in(argv[++i]); + std::string line; + while (getline(in, line)) { + notrain_files.push_back(line); + } + } else if (arg == "--dump") { + if (i + 1 < argc && argv[i+1][0] != '-') { + dump_str = argv[++i]; + } + dump = true; + } else if ((arg == "--datatype") && i + 1 < argc) { + std::string val = argv[++i]; + if (val == "uint8_t" || val == "8" || val == "uint8") { + Datatype::set("uint8_t"); + } else if (val == "uint16_t" || val == "16" || val == "uint16") { + Datatype::set("uint16_t"); + } else if (val == "uint32_t" || val == "32" || val == "uint32") { + Datatype::set("uint32_t"); + } else if (val == "uint64_t" || val == "64" || val == "uint64") { + Datatype::set("uint64_t"); + } else { + cerr << "Histogram data type must have a valid data type or size: one of 8, 16, 32, 64" << endl; + exit(EXIT_FAILURE); + } + } else if ((arg == "-r" || arg == "--recover") && i + 1 < argc) { + dump_str = argv[++i]; + recover = true; + pred64 = new Predictor(dump_str); + similarity = pred64->get_id(); + k = pred64->get_k(); } else if (arg == "--min-id" && i + 1 < argc) { try { std::string opt = argv[i+1]; @@ -199,6 +307,8 @@ void Runner::get_opts(int argc, char **argv) exit(EXIT_FAILURE); } i++; + } else if ((arg == "-b" || arg == "--bias") && i + 1 < argc) { + bias = std::stod(argv[++i]); } else if ((arg == "-k" || arg == "--kmer") && i + 1 < argc) { k = strtol(argv[i+1], NULL, 10); if (errno) { @@ -212,26 +322,26 @@ void Runner::get_opts(int argc, char **argv) } else if ((arg == "-o" || arg == "--output") && i + 1 < argc) { output = string(argv[i+1]); 
i++; + } else if ((arg == "--num-templates") && i + 1 < argc) { + n_templates = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (n_templates <= 0) { + fprintf(stderr, "Number of templates must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; } else if ((arg == "-s" || arg == "--sample") && i + 1 < argc) { - sample_size = strtol(argv[i+1], NULL, 10); + total_sample_size = strtol(argv[i+1], NULL, 10); if (errno) { perror(argv[i+1]); exit(EXIT_FAILURE); - } else if (sample_size <= 0) { + } else if (total_sample_size <= 0) { fprintf(stderr, "Sample size must be greater than 0.\n"); exit(EXIT_FAILURE); } i++; - // } else if ((arg == "-p" || arg == "--pivot") && i + 1 < argc) { - // pivots = strtol(argv[i+1], NULL, 10); - // if (errno) { - // perror(argv[i+1]); - // exit(EXIT_FAILURE); - // } else if (sample_size <= 0) { - // fprintf(stderr, "Points per pivot must be greater than 0.\n"); - // exit(EXIT_FAILURE); - // } - // i++; } else if ((arg == "--mut-type") && i + 1 < argc) { std::string opt = argv[i+1]; if (opt == "all") { @@ -342,6 +452,20 @@ void Runner::get_opts(int argc, char **argv) } } } + set file_list(files.begin(), files.end()); + set notrain_file_list; + set all_files_list(files.begin(), files.end()); + for (std::string val : notrain_files) { + if (file_list.find(val) == file_list.end()) { + notrain_file_list.insert(val); + all_files_list.insert(val); + } + } + files.assign(file_list.begin(), file_list.end()); + notrain_files.assign(notrain_file_list.begin(), + notrain_file_list.end()); + all_files.assign(all_files_list.begin(), + all_files_list.end()); if (files.empty()) { usage(*argv); exit(EXIT_FAILURE); @@ -352,53 +476,104 @@ void Runner::get_opts(int argc, char **argv) } } -pair Runner::find_k() +int Runner::find_k() { unsigned long long count = 0, length = 0, largest_count = 0; - uint64_t longest_seq = 0; - uintmax_t num_sequences = 0; - for (auto f : files) { - ChromListMaker maker(f); - 
auto chromList = maker.makeChromOneDigitList(); +#pragma omp parallel for + for (size_t i = 0; i < all_files.size(); i++) { +// cout << "Processing " << f << endl; + ChromListMaker maker(all_files.at(i), is_single_file); + auto chromList = maker.makeChromList(); unsigned long long l = 0; for (int i = 0; i < chromList->size(); i++) { - ChromosomeOneDigit *chrom = dynamic_cast(chromList->at(i)); - auto sz = chrom->size(); + Chromosome *chrom = dynamic_cast(chromList->at(i)); + auto sz = chrom->getEffectiveSize(); l += sz; - if (sz > longest_seq) { - longest_seq = sz; - } - num_sequences++; - } l /= chromList->size(); +#pragma omp atomic length += l; } length /= files.size(); int newk = ceil(log(length) / log(4)) - 1; cout << "avg length: " << length << endl; cout << "Recommended K: " << newk << endl; - return make_pair(newk, longest_seq); + return newk; } +template +void get_points(std::vector files, std::vector*> &points, int k, uintmax_t &_id, bool is_single_file, bool set_seq=true, int chunk_size=10000) +{ + if (files.empty()) { + return; + } + auto sort_func = [](Point*a, Point*b) { + return a->get_length() < b->get_length(); + }; + auto sort_hdr = [](Point*a, Point*b) { + return a->get_header() < b->get_header(); + }; + int n_threads = omp_get_max_threads(); + std::ostringstream oss; + oss << "Counting " << k << "-mers"; + Progress prog(files.size(), oss.str()); + #pragma omp parallel for + for (size_t i = 0; i < files.size(); i++) { + ChromListMaker maker(files.at(i), is_single_file); + auto chromList = maker.makeChromOneDigitDnaList(); + for (auto elt : *chromList) { + ChromosomeOneDigitDna* chrom = dynamic_cast(elt); + Point* pt = Loader::get_point(chrom, _id, k); +#pragma omp critical + points.push_back(pt); + } +#pragma omp critical + prog++; + } + prog.end(); + std::string warning = Loader::get_warning(); + if (warning != "") { + cout << warning << endl; + } + std::sort(points.begin(), points.end(), sort_hdr); + std::sort(points.begin(), points.end(), 
sort_func); + // for (auto seq : points) { + // cout << "SEQ " << seq->get_header() << endl; + // } -double global_mat[4][4] = {{1, -1, -1, -1}, - {-1, 1, -1, -1}, - {-1, -1, 1, -1}, - {-1, -1, -1, 1}}; -double global_sigma = -2; -double global_epsilon = -1; +} +/* + * Main launching point for the algorithm + * + * Reads in all points + * Trains the model using a previous model if provided + * Else trains the model from scratch + * Initializes bvec search structure + * Runs mean shift + */ template int Runner::do_run() { using pvec = vector *>; using pmap = map*, pvec*>; - - ClusterFactory factory(k); - auto points = factory.build_points(files, [&](nonltr::ChromosomeOneDigit *p){ return factory.get_divergence_point(p); }); - Trainer tr(points, sample_size, largest_count, similarity, pivots, global_mat, global_sigma, global_epsilon, align ? 0 : k); - tr.train(min_n_feat, max_n_feat, feat_type, mut_type, min_id); + uintmax_t s_id = 0; + Predictor::set_bias(bias); + + pvec points; + get_points(files, points, k, s_id, is_single_file); + Clock::stamp("read_in_points"); + Trainer tr(points, total_sample_size, largest_count, similarity, n_templates, k); + if (recover) { + tr.train(dump_str); + } else { + // If we are working in low-identity space, get more room + if (similarity < 0.6) { + min_id = 0.2; + } + tr.train(min_n_feat, max_n_feat, feat_type, mut_type, min_id, dump ? 
dump_str : ""); + } + get_points(notrain_files, points, k, s_id, is_single_file, false); vector lengths; for (Point* p : points) { if (!align) { @@ -416,28 +591,7 @@ int Runner::do_run() bv.insert(p); } bv.insert_finalize(); -// cout << "bv size: " << bv.report() << endl; - // Point* mid = points[points.size()/2]; - // auto rng = bv.get_range(mid->get_length() * 0.99, - // mid->get_length() / 0.99); - // auto begin = bv.iter(rng.first); - // auto end = bv.iter(rng.second); - // size_t before = bv.report(); - // for (int i = 0; i < 1; i++) { - // bool is_min = false; - // Point* p = tr.get_close(mid, begin, end, is_min); - // size_t after = bv.report(); - // if (is_min) { - // string expr = (after + 1 == before) ? "true" : "false"; - // if (expr == "false") { - // throw expr; - // } - // cout << expr << endl; - // cout << "is min" << endl; - // } else { - // cout << "is not min" << endl; - // } - // } + ClusterFactory factory(k); factory.MS(bv, bandwidth, similarity, tr, output, iterations, delta); return 0; } diff --git a/src/cluster/src/Runner.h b/src/cluster/CRunner.h similarity index 51% rename from src/cluster/src/Runner.h rename to src/cluster/CRunner.h index 9cc8c04..1ad6abb 100644 --- a/src/cluster/src/Runner.h +++ b/src/cluster/CRunner.h @@ -4,14 +4,14 @@ * * Author: Benjamin T James */ -#ifndef RUNNER_H -#define RUNNER_H +#ifndef CRUNNER_H +#define CRUNNER_H #include #include -#include "Point.h" -#include "HandleSeq.h" -#include "Predictor.h" +#include "../clutil/Point.h" +#include "../predict/HandleSeq.h" +#include "../predict/Predictor.h" using namespace std; class Runner { @@ -29,16 +29,23 @@ class Runner { int iterations = 15; int delta = 5; bool align = false; - int sample_size = 0; - int pivots = 40; - int min_n_feat = 3; - int max_n_feat = 5; - int mut_type = HandleSeq::SINGLE; - uint64_t feat_type = PRED_FEAT_FAST | PRED_FEAT_DIV; + int total_sample_size = 2000; + int n_templates = 300; + int min_n_feat = 4; + int max_n_feat = 4; + bool 
is_single_file = false; + double bias = 0; + int mut_type = HandleSeq::BOTH; + uint64_t feat_type = PRED_FEAT_FAST; double min_id = 0.35; - std::vector files; + std::vector files, notrain_files, all_files; string output = "output.clstr"; void get_opts(int argc, char** argv); - pair find_k(); + int find_k(); + + bool dump = false; + bool recover = false; + std::string dump_str = "weights.txt"; + Predictor *pred64 = NULL; }; #endif diff --git a/src/cluster/src/Center.h b/src/cluster/Center.h similarity index 97% rename from src/cluster/src/Center.h rename to src/cluster/Center.h index 8c2acc5..52c1cbd 100644 --- a/src/cluster/src/Center.h +++ b/src/cluster/Center.h @@ -7,7 +7,7 @@ #ifndef CENTER_H #define CENTER_H -#include "Point.h" +#include "../clutil/Point.h" template struct Center { diff --git a/src/cluster/src/ClusterFactory.cpp b/src/cluster/ClusterFactory.cpp similarity index 79% rename from src/cluster/src/ClusterFactory.cpp rename to src/cluster/ClusterFactory.cpp index c6ce7ae..c0f25fc 100644 --- a/src/cluster/src/ClusterFactory.cpp +++ b/src/cluster/ClusterFactory.cpp @@ -15,14 +15,16 @@ #include #include #include -#include "Histogram.h" -#include "../../nonltr/KmerHashTable.h" -#include "../../nonltr/ChromListMaker.h" -#include "DivergencePoint.h" +#include "../clutil/Histogram.h" +#include "../nonltr/KmerHashTable.h" +#include "../nonltr/ChromListMaker.h" +#include "../clutil/DivergencePoint.h" +#include "../clutil/Clock.h" #include "Center.h" -#include "Progress.h" +#include "../clutil/Progress.h" //#include +/* The main method is MS() */ template T avg_distance(Point &c, const std::vector*> &vec) { @@ -308,31 +310,25 @@ void mean_shift_update(vector > &part, int j, const Trainer& trn, i } } trn.filter(center, good); + uint64_t cen_len = center->get_length(); + uint64_t min_len = trn.get_id() * cen_len; + uint64_t max_len = cen_len / trn.get_id(); if (!good.empty()) { + for (auto p : good) { + uint64_t p_len = p.first->get_length(); + bool within = 
p_len >= min_len && p_len <= max_len; +// cout << "CEN " << j << " " << cen_len << " " << min_len << " " << max_len << " " << p_len << " " << (within ? "TRUE" : "FALSE") << endl; p.first->set_arg_to_this_d(*temp); *top += *temp; bottom++; } *top /= bottom; Point* next = trn.closest(top, good); - // Point *next = NULL; - // int next_dist = std::numeric_limits::max(); - // for (int i = 0; i < N; i++) { - // int dist = points[i]->distance_d(*top); - // if (dist < next_dist) { - // next_dist = dist; - // next = points[i]; - // } - // } - if (next != NULL) { - center->set(*next); - center->set_data_str(next->get_data_str()); - } else { - cerr << "mean shift: NULL" << endl; - } - } else { - //cout << "GOOD: EMPTY" << endl; + center->set(*next); + } else if (delta == 0) { + Point* first = part[j].getPoints()[0]; + center->set(*first); } delete top; delete temp; @@ -390,35 +386,12 @@ bool merge(vector > ¢ers, const Trainer& trn, int delta, int ba for (int i = 0; i < centers.size(); i++) { long ret = trn.merge(centers, i, i + 1, std::min((int)centers.size()-1, i + delta)); if (ret > i) { - num_merge++; auto &to_add = centers[ret].getPoints(); auto &to_del = centers[i].getPoints(); to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); centers[i].lazy_remove(); } - // vector*,double> > to_merge; - // for (int j = i + 1; j < std::min((int)centers.size(), i + 1 + delta); j++) { - // to_merge.push_back(std::make_pair(centers[j].getCenter(), -1)); - // } - // Point* closest = trn.merge(centers[i].getCenter(), to_merge); - // if (closest != NULL) { - // #ifdef DEBUG - // cout << "Merged center " << centers[i]->get_header() << " and " << closest->get_header() << endl; - // #endif - // num_merge++; - // // auto& to_del = partition[centers[i]]; - // // auto& to_add = partition[closest]; - // // to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); - // // partition.erase(centers[i]); - // // centers[i]->set_to_delete(true); - // auto& to_del = 
partition[centers[i]]; - // auto& to_add = partition[closest]; - // to_add.insert(std::end(to_add), std::begin(to_del), std::end(to_del)); - // partition.erase(centers[i]); - // centers[i]->set_to_delete(true); - - // } } //cout << "Merged " << num_merge << " centers" << endl; centers.erase(std::remove_if(centers.begin(), centers.end(), [](const Center& p) { @@ -430,7 +403,7 @@ bool merge(vector > ¢ers, const Trainer& trn, int delta, int ba template void print_output(const string& output, vector > & partition) { - cout << "Printing output" << endl; +// cout << "Printing output" << endl; std::ofstream ofs; ofs.open(output, std::ofstream::out); int counter = 0; @@ -447,11 +420,6 @@ void print_output(const string& output, vector > & partition) break; } } - if (!cen_found) { - cout << "Center not found" << endl; - cout << "Cluster " << counter << " has center " << cen.getCenter()->get_header() << endl; - // cen.getCenter()->set(*cen.getPoints().at(0)); - } for (auto p : cen.getPoints()) { string s = p->get_header(); ofs << pt << "\t" << p->get_length() << "nt, " << s << "... 
"; @@ -588,7 +556,7 @@ size_t accumulate(Point** last_ptr, bvec &points, vector > ¢ Point* last = *last_ptr; vector*> current = {last}; bool is_min = false; - + //cout << "Accumulation: " << last->get_header() << " length: " << last->get_length() << endl; for (int num_iter=0; !is_min; num_iter++) { #ifdef DEBUG cout << num_iter << " last: " << last->get_header() << endl; @@ -626,82 +594,65 @@ size_t accumulate(Point** last_ptr, bvec &points, vector > ¢ } } else { // keep adding points, find new mean size_t prev_size = current.size(); + //cout << "Center: " << last->get_header() << " length: " << last->get_length() << endl; points.remove_available(bounds.first, bounds.second, current); + last = get_mean(current, *last, bandwidth); + size_t added_size = current.size() - prev_size; - #ifdef DEBUG - cout << "added new points (" << added_size << ")" << endl; - #endif - if (last == NULL) { - cerr << "Last is null" << endl; - throw 100; - } } } // cout << "Pushed back center " << last->get_header() << endl; Center cc(last, current); centers.push_back(cc); -// Center cen(last, current); -// centers.emplace_back(last, current); - // Point* center = last->clone(); - // centers.push_back(center); - // part[center] = current; - #ifdef DEBUG - for (auto p : current) { - cout << total_iter << " Cluster " << last->get_header() << ": " << p->get_header() << endl; - } - #endif - // if (points.empty()) { - // return true; - // } else { - // return false; - // } return current.size(); } +/* + * The main method of this file, which drives the mean shift accumulate and update steps. + * As seen, the first FOR loop calls accumulate() and progresses through the accumulation stage. + * The second FOR loop iterates through the update stage, calling mean_shift_update() and merge() + * to update each center. + * Finally, the output is printed through print_output. 
+ */ template void ClusterFactory::MS(bvec &points, T bandwidth, double sim, const Trainer& trn, string output, int iter, int delta) { vector > part; -// using partition = map*, vector*> >; -// partition part; - Point* last = points.pop(); - //cout << "First length: " << last->get_length() << endl; Progress pa(points.size(), "Accumulation"); for (int num = 0; last != NULL; num++) { size_t n = accumulate(&last, points, part, trn, sim, bandwidth, num); pa += n; } pa.end(); -// points.check(); - size_t total = 0; - for (auto cen : part) { - total += cen.getPoints().size(); - } - cout << "total size: " << total << endl; + cout << "Number of clusters before update: " << part.size() << endl; + Clock::stamp("accumulate"); + vector num_clusters; Progress pu(iter, "Update"); for (int i = 0; i < iter; i++) { - // #ifdef DEBUG - //print_output(output + to_string(i), part); - // #endif - //cout << "Mean shift iteration " << i << endl; + if (i >= 3 && part.size() == num_clusters[i-3]) { + break; + } #pragma omp parallel for for (int j = 0; j < part.size(); j++) { mean_shift_update(part, j, trn, delta); } merge(part, trn, delta, bandwidth); pu++; + num_clusters.push_back(part.size()); } #pragma omp parallel for - for (int j = 0; j < m_centers.size(); j++) { + for (int j = 0; j < part.size(); j++) { mean_shift_update(part, j, trn, 0); } pu.end(); print_output(output, part); + Clock::stamp("update"); + Clock::stamp("done"); } /* @@ -715,7 +666,6 @@ template std::vector*> ClusterFactory::build_points(vector fileList, std::function*(ChromosomeOneDigit *)> get_point) { std::vector*> points; - std::vector*> cpoints; unsigned fsize = fileList.size(); std::vector*> initial_centers; std::stringstream buffer; @@ -724,7 +674,7 @@ std::vector*> ClusterFactory::build_points(vector fileList, for (unsigned i = 0; i < fsize; i++) { p++; ChromListMaker *maker = new ChromListMaker(fileList.at(i)); - const std::vector * chromList = maker->makeChromOneDigitList(); + const std::vector * chromList = 
maker->makeChromOneDigitDnaList(); unsigned csize = chromList->size(); #pragma omp parallel for ordered for (unsigned h = 0; h < csize; h++) { @@ -745,111 +695,6 @@ std::vector*> ClusterFactory::build_points(vector fileList, delete maker; } return points; -// std::random_shuffle(points.begin(), points.end()); -// queue gaps; -// calculate_gaps(points, gaps, func); - // for (int i = 1; i < points.size(); i++) { - // int la = points[i]->get_length(); - // int lb = points[i-1]->get_length(); - // if (lb > la && 100.0 * la / lb < sim) { - // gaps.push(i); - // } - // } - - -// vector*>> p; -// vector*> tmp; -// tmp.push_back(points[0]); -// for (int j = 1; j < points.size(); j++) { - -// int la = points[j]->get_length(); -// int lb = points[j-1]->get_length(); -// assert(lb >= la); -// if (lb > la && 100.0 * la / lb < sim) { -// p.push_back(tmp); -// cout << "Gap " << tmp.size() << endl; -// tmp.clear(); -// } -// tmp.push_back(points[j]); -// } -// if (!tmp.empty()) { -// p.push_back(tmp); -// } - -// // calculate_distances(points); -// int idx = 0; -// for (auto &c : p) { -// sort_nn_func(c, func); -// for (auto v : c) { -// v->set_id(idx++); -// cpoints.push_back(v); -// } -// } - - // sort_nn_func(points, - // [&](const Point&a, const Point&b) { - // int la = a.get_length(); - // int lb = b.get_length(); - // return lb > la && 100.0 * la / lb < sim; - // }, - // [](const Point& a, const Point& b) { - // return a.distance_k1(b); - // }); - - - // // for(auto p : points){ - // // cout << p->get_header() << endl; - // // } - - - - // sort_nn_func(points, - // [&](const Point& a, const Point& b) { - // int la = a.get_length(); - // int lb = b.get_length(); - // if (lb > la && 100.0 * la / lb < sim) { - // double mono = a.distance_k1(b) * 100; - // bool q = mono < sim; - // /* - // if (q) { - // cout << "TRUE" << endl; - // } else { - // cout << "FALSE"<< endl; - // } - // */ - // return q; - // } else { - // return false; - // } - // }, - // [](const Point& a, const 
Point& b) { - // return a.distance(b); - // }); - // uint64_t idx = 0; - // for (auto v : points) { - // v->set_id(idx++); - - // cpoints.push_back(v); - // } - // cout << "Points: " << cpoints.size() << endl; - - - // for (int i = 0; i < points.size(); i++) { - // cout << points[i]->get_header(); - // if (i > 0) { - // cout << " " << points[i]->distance(*points[i-1]); - // } - // cout << endl; - // } - - - - // for (int i = 0; i < points.size(); i++) { - // points[i]->set_id(i); - // cpoints.push_back(points[i]); - // assert(cpoints[i]->get_id() == i); - // } - return points; } @@ -998,21 +843,7 @@ T ClusterFactory::find_h(const std::vector*> ¢ers) const return divs[divs.size()/2]; } } -/* -template -std::vector *> ClusterFactory::get_centers(const std::vector *> &points) -{ - std::vector*> centers; - for (typename std::vector*>::const_iterator it = points.begin(); it != points.end(); ++it) { - Point *p = *it; - if (choose_center(*p)) { - centers.push_back(p->clone()); - } - } - return centers; -} -*/ #ifndef HEADER_HACK template class ClusterFactory; template class ClusterFactory; diff --git a/src/cluster/src/ClusterFactory.h b/src/cluster/ClusterFactory.h similarity index 94% rename from src/cluster/src/ClusterFactory.h rename to src/cluster/ClusterFactory.h index 12180c9..75a2309 100644 --- a/src/cluster/src/ClusterFactory.h +++ b/src/cluster/ClusterFactory.h @@ -13,9 +13,9 @@ #include #include #include -#include "../../nonltr/ChromosomeOneDigit.h" -#include "../../nonltr/KmerHashTable.h" -#include "Point.h" +#include "../nonltr/ChromosomeOneDigit.h" +#include "../nonltr/KmerHashTable.h" +#include "../clutil/Point.h" #include "Trainer.h" #include "bvec.h" @@ -31,7 +31,6 @@ class ClusterFactory { void MS(bvec &points, T bandwidth, double sim, const Trainer& trn, string output, int iter, int delta); private: vector lookup_table; - vector*> m_centers; const int num_per_partition; int k; //void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, 
std::vector& values); diff --git a/src/cluster/Makefile b/src/cluster/Makefile deleted file mode 100644 index 9186210..0000000 --- a/src/cluster/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -TARGET ?= meshclust2 -VERSION ?= 2.1.0 -CXX ?= g++ -ifeq ($(debug),yes) - CXXFLAGS += -ggdb -DDEBUG -fno-omit-frame-pointer -fopenmp -else - CXXFLAGS += -fopenmp -O3 -march=native -g -endif -CXXFLAGS += -std=c++11 -DVERSION=\"$(VERSION)\" -LDFLAGS += -lm - -SOURCES := $(shell find ./src -name '*.cpp') -OBJECTS = $(SOURCES:%.cpp=bin/%.o) -BIN_OBJECTS := $(shell find ../../bin/ -mindepth 2 -name '*.o') - -all: clean $(TARGET) - -$(TARGET): $(OBJECTS) $(BIN_OBJECTS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) - -bin/%.o: %.cpp - mkdir -p $(@D) - $(CXX) $(CXXFLAGS) -c $< -o $@ - -clean: - $(RM) $(OBJECTS) $(TARGET) - -install: $(TARGET) - cp $(TARGET) ~/bin - -.PHONY: all clean install diff --git a/src/cluster/Trainer.cpp b/src/cluster/Trainer.cpp new file mode 100644 index 0000000..923f410 --- /dev/null +++ b/src/cluster/Trainer.cpp @@ -0,0 +1,198 @@ +/* -*- C++ -*- */ +/* + * Trainer.cpp + * + * Author: Benjamin T James + */ +#include "Trainer.h" +#include "../predict/HandleSeq.h" +#include "../clutil/Datatype.h" +#include "../clutil/Loader.h" +#include "ClusterFactory.h" +#include +#include +#include +#include +#include "../predict/Predictor.h" +#include "../predict/GLM.h" +#include "../predict/Feature.h" +#include "../clutil/Progress.h" +#include "../clutil/Random.h" + +template +std::tuple*,double,size_t,size_t> Trainer::get_close(Point *p, bvec_iterator istart, bvec_iterator iend, bool &is_min_r) const +{ + int ncols = weights.getNumRow(); +#pragma omp declare reduction(pmax:std::tuple*,double,size_t,size_t>: \ + omp_out = get<1>(omp_in) > get<1>(omp_out) ? 
omp_in : omp_out ) \ + initializer (omp_priv=std::make_tuple((Point*)NULL,-1,0,0)) + + std::tuple*, + double, + size_t, + size_t> result = std::tuple*, double, size_t, size_t>(NULL, + -1, + 0, + 0); + bool has_found = false; + bool is_min = true; + uint64_t min_len = p->get_length() * cutoff; + uint64_t max_len = p->get_length() / cutoff; +#pragma omp parallel for reduction(pmax:result), reduction(&&:is_min) + for (bvec_iterator i = istart; i < iend; ++i) { + Point* pt = (*i).first; + + uint64_t len = pt->get_length(); + if (len < min_len || len > max_len) { + continue; + } + auto cache = feat->compute(*pt, *p); + double dist = (*feat)(0, cache); + double sum = classify(pt, p); + double res = round(sum) > 0; + // #pragma omp critical + // cout << "Result: " << sum << " raw_sigmoid: " << matrix::GLM::logistic(sum) << " classify_sum: " << Predictor::classify_sum(sum) << " final: " << res << endl; +// set second to true if result is not 1.0 +// which means it will be removed + result = (dist > std::get<1>(result)) ? std::make_tuple(pt, dist, i.r, i.c) : result; + is_min = is_min && (res != 1.0); +// has_found = has_found || (res != 1.0); + if (res == 1.0) { + *i = std::make_pair(pt, true); +// (*i).second = true; + } + } + +// is_min = !has_found; + is_min_r = is_min; +// return get<0>(result); + return result; + +} + +template +long Trainer::merge(vector > ¢ers, long current, long begin, long last) const +{ +#pragma omp declare reduction(ldpmax:std::pair: \ + omp_out = omp_in.second > omp_out.second ? 
omp_in : omp_out ) \ + initializer (omp_priv=std::make_pair(0, std::numeric_limits::min())) + std::pair best = std::make_pair(0, std::numeric_limits::min()); + Point* p = centers[current].getCenter(); + uint64_t cen_length = p->get_length(); + uint64_t min_length = cen_length * get_id(); + uint64_t max_length = cen_length / get_id(); +#pragma omp parallel for reduction(ldpmax:best) + for (long i = begin; i <= last; i++) { + double sum = weights.get(0, 0); + double dist = 0; + + Point* cen = centers[i].getCenter(); + uint64_t cen_len = cen->get_length(); + bool length_pass = cen_len >= min_length && cen_len <= max_length; + if (length_pass) { + auto cache = feat->compute(*cen, *p); + for (int col = 1; col < weights.getNumRow(); col++) { + double d = (*feat)(col-1, cache); + if (col == 1) { + dist = d; + } + sum += weights.get(col, 0) * d; + } + double res = round(Predictor::classify_sum(sum)); + + if (res == 1) { + best = best.second > dist ? best : std::make_pair(i, dist); + } + } + } + return best.first; +} + +template +double Trainer::classify(Point*a, Point*b) const +{ + double sum = weights.get(0, 0); + auto cache = feat->compute(*a, *b); + for (int col = 1; col < weights.getNumRow(); col++) { + sum += weights.get(col, 0) * (*feat)(col-1, cache); + } + return Predictor::classify_sum(sum); +} + +template +void Trainer::filter(Point *p, vector *, bool> > &vec) const +{ + uint64_t cen_length = p->get_length(); + uint64_t min_length = cen_length * get_id(); + uint64_t max_length = cen_length / get_id(); + for (auto& pt : vec) { + uint64_t pt_len = pt.first->get_length(); + bool length_pass = pt_len >= min_length && pt_len <= max_length; + pt.second = true; + if (length_pass) { + double sum = classify(p, pt.first); + double res = round(sum); + pt.second = (res == 0); + } + } + vec.erase(std::remove_if(vec.begin(), vec.end(), [](pair*, bool> p) { + return p.second; + }), vec.end()); +} + +template +Point* Trainer::closest(Point *p, vector *, bool> > &vec) const +{ + 
Point* best_pt = NULL; + double best_dist = 0; + for (auto& pt : vec) { + double sum = weights.get(0, 0); + double dist = pt.first->distance_d(*p); + if (best_pt == NULL || dist < best_dist) { + best_dist = dist; + best_pt = pt.first; + } + } + return best_pt; +} + +template +void Trainer::train(std::string dump_str) +{ + Predictor pred(dump_str); + delete feat; + auto pr = pred.get_class(); + feat = pr.first; + feat->set_save(false); + matrix::GLM glm = pr.second; + weights = glm.get_weights(); +} + +template +void Trainer::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, std::string dump_str, double acc_cutoff) +{ + std::cout << "Splitting data" << endl; + uintmax_t _id = points.size(); + Predictor pred(k, cutoff, PRED_MODE_CLASS, feat_type, + mut_type, min_n_feat, max_n_feat, min_id); + pred.train(points, _id, n_samples, n_templates); + delete feat; + auto pr = pred.get_class(); + feat = pr.first; + matrix::GLM glm = pr.second; + weights = glm.get_weights(); + + if (dump_str != "") { + pred.save(dump_str, Datatype::get()); + exit(0); + } else { + pred.save("weights.txt", Datatype::get()); + } +} + +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; +template class Trainer; diff --git a/src/cluster/Trainer.h b/src/cluster/Trainer.h new file mode 100644 index 0000000..b41031a --- /dev/null +++ b/src/cluster/Trainer.h @@ -0,0 +1,46 @@ +/* -*- C++ -*- */ +/* + * Trainer.h + * + * Author: Benjamin T James + */ + +#ifndef TRAINER_H +#define TRAINER_H + +#include "../clutil/Point.h" +#include "../predict/GLM.h" +#include "../predict/Feature.h" +#include "../predict/Predictor.h" +#include "bvec.h" +#include "Center.h" +#include + +template +class Trainer { +public: + Trainer(std::vector*> v, size_t num_points, size_t largest_count, double cutoff_, size_t max_pts_from_one_, int ksize) : points(v), n_samples(num_points), cutoff(cutoff_), 
n_templates(max_pts_from_one_), k(ksize) { + uintmax_t size = 1000 * 1000 * 10; + feat = new Feature(k); + }; + ~Trainer() { delete feat; } + void train(std::string); + void train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, std::string dump_str, double acc_cutoff=97.5); + + std::tuple*,double,size_t,size_t> get_close(Point*, bvec_iterator istart, bvec_iterator iend, bool& is_min) const; + + void filter(Point*, vector*,bool> >&) const; + Point* closest(Point*, vector*,bool> >&) const; + long merge(vector > ¢ers, long current, long begin, long end) const; + + double get_id() const { return cutoff > 1 ? cutoff / 100.0 : cutoff; } +private: + double classify(Point*, Point*) const; + matrix::Matrix weights; + Feature *feat; + std::vector*> points; + size_t n_samples, n_templates; + double cutoff; + int k; +}; +#endif diff --git a/src/cluster/src/bvec.cpp b/src/cluster/bvec.cpp similarity index 70% rename from src/cluster/src/bvec.cpp rename to src/cluster/bvec.cpp index 2efed1e..8bd35d6 100644 --- a/src/cluster/src/bvec.cpp +++ b/src/cluster/bvec.cpp @@ -123,15 +123,13 @@ template bool bvec::index_of(uint64_t point, size_t* pfront, size_t* pback) const { size_t low = begin_bounds.size()-1, high = 0; + size_t prev = 0; + size_t prev_index = 0; - for (size_t i = 0; i < begin_bounds.size(); i++) { - size_t prev = 0; - size_t prev_index = 0; - if (i > 0) { - prev_index = i - 1; - prev = begin_bounds[i-1]; - } - if (point >= prev && point <= begin_bounds[i]) { + for (size_t i = 1; i < begin_bounds.size(); i++) { + prev_index = i - 1; + prev = begin_bounds[i-1]; + if (point >= prev && point < begin_bounds[i]) { low = std::min(low, prev_index); high = std::max(high, prev_index); } @@ -174,6 +172,15 @@ void bvec::insert(Point *p) } auto mid_min = min_sizes[min_sizes.size() / 2]; data.at(mid_min).push_back(std::make_pair(p, false)); + if (begin_bounds.at(mid_min) > len) { + cerr << "Begin Insertion of " << len << " should not be in bin " << 
begin_bounds.at(mid_min) << endl; + throw std::exception(); + } + if (mid_min < begin_bounds.size() - 1 && begin_bounds.at(mid_min+1) < len) { + cerr << "End Insertion of " << len << " should not be in bin " << begin_bounds.at(mid_min+1) << endl; + throw std::exception(); + } + } template @@ -214,6 +221,14 @@ void bvec::insert_finalize() for (size_t i = 0; i < data.size(); i++) { std::sort(std::begin(data[i]), std::end(data[i]), sorter); data[i].shrink_to_fit(); +/* if (data[i][0].first->get_length() < begin_bounds[i]) { + cerr << "Length " << data[i][0].first->get_length() << " should not be in bin " << begin_bounds[i] << endl; + throw std::exception(); + } + if (i < data.size()-1 && data[i][data[i].size()-1].first->get_length() > begin_bounds[i+1]) { + cerr << "Length " << data[i][0].first->get_length() << " should not be in bin " << begin_bounds[i] << " to " << begin_bounds[i+1] << endl; + throw std::exception(); + }*/ } } @@ -251,13 +266,24 @@ bvec::get_range(uint64_t begin_len, uint64_t end_len) const front.first = 0; front.second = 0; back.first = data.size()-1; - back.second = data[back.first].size() - 1; + back.second = data[back.first].size() - 1; + + // Determination of the outer indices if (!index_of(begin_len, &front.first, NULL)) { throw 100; } if (!index_of(end_len, NULL, &back.first)) { throw 100; } + // if (begin_len < begin_bounds.at(front.first)) { + // cerr << "Low index is not accurate" << endl; + // throw std::exception(); + // } + // if (front.first > back.first) { + // cerr << "Front index is greater than back index" << endl; + // throw std::exception(); + // } + // Determination of the inner indices if (!inner_index_of(begin_len, front.first, &front.second, NULL)) { throw 100; } @@ -274,6 +300,32 @@ bvec::get_range(uint64_t begin_len, uint64_t end_len) const // } else { // throw 101; // } + + if (back.first == (uint64_t)-1 || back.second == (uint64_t)-1) { + back.is_empty = true; + } +// for (uint64_t i = front.first; i <= back.first; i++) { 
+// uint64_t j = 0; +// uint64_t end = data.at(i).size(); +// if (i == front.first) { +// j = front.second; +// } +// if (i == back.first) { +// end = min(back.second, end); +// } +// for (; j < end; j++) { +// uint64_t len = data.at(i).at(j).first->get_length(); +// if (len < begin_len || len > end_len) { +// // cerr << "Warning: Length in BVec " << len << " is not in [" << begin_len << ", " << end_len << "]. The classifier will not select these points." << endl; +// // if (i == front.first) { +// // cerr << "Front Bounds of selected bin: " << j << " -> " << data.at(i).at(j).first->get_length(); +// // } +// // if (i == back.first) { +// // cerr << "End Bounds of selected bin: " << end-1 << " -> " << data.at(i).at(end-1).first->get_length(); +// // } +// } +// } +// } return std::make_pair(front, back); } @@ -291,6 +343,9 @@ void bvec::remove_available(bvec_idx_t begin, bvec_idx_t end, std::vector d) { return d.second; }; auto inserter = [&](const std::pair*,bool> p) { @@ -299,6 +354,14 @@ void bvec::remove_available(bvec_idx_t begin, bvec_idx_t end, std::vectorget_length(); + // uint64_t end_len; + // if (end.second == data.at(b).size()) { + // end_len = data.at(b).at(end.second-1).first->get_length(); + // } else { + // end_len = data.at(b).at(end.second).first->get_length(); + // } + // cout << "Boundary: " << begin_len << " -> " << end_len; #pragma omp parallel for for (size_t i = a; i <= b; i++) { /* move marked points to end of vector, then copy, then erase */ @@ -308,6 +371,10 @@ void bvec::remove_available(bvec_idx_t begin, bvec_idx_t end, std::vectorget_header() << " length: " << kv.first->get_length() << endl; + // if (kv.first->get_length() > end_len || kv.first->get_length() < begin_len) { + // cerr << "Error in bvec" << endl; + // } available.push_back(kv.first); } } diff --git a/src/cluster/src/bvec.h b/src/cluster/bvec.h similarity index 96% rename from src/cluster/src/bvec.h rename to src/cluster/bvec.h index 43384e9..0c1b98a 100644 --- 
a/src/cluster/src/bvec.h +++ b/src/cluster/bvec.h @@ -7,11 +7,12 @@ #ifndef BVEC_H #define BVEC_H -#include "Point.h" +#include "../clutil/Point.h" #include "bvec_iterator.h" typedef struct bvec_idx { size_t first, second; + bool is_empty = false; } bvec_idx_t; /* diff --git a/src/cluster/src/bvec_iterator.cpp b/src/cluster/bvec_iterator.cpp similarity index 88% rename from src/cluster/src/bvec_iterator.cpp rename to src/cluster/bvec_iterator.cpp index f8d1c76..e97e1a1 100644 --- a/src/cluster/src/bvec_iterator.cpp +++ b/src/cluster/bvec_iterator.cpp @@ -1,3 +1,9 @@ +/* -*- C++ -*- */ +/* + * bvec_iterator.cpp + * + * Author: Benjamin T James + */ #include "bvec_iterator.h" template diff --git a/src/cluster/src/bvec_iterator.h b/src/cluster/bvec_iterator.h similarity index 100% rename from src/cluster/src/bvec_iterator.h rename to src/cluster/bvec_iterator.h diff --git a/src/cluster/src/main.cpp b/src/cluster/meshclust2.cpp similarity index 87% rename from src/cluster/src/main.cpp rename to src/cluster/meshclust2.cpp index 562fd96..81340b7 100644 --- a/src/cluster/src/main.cpp +++ b/src/cluster/meshclust2.cpp @@ -4,7 +4,7 @@ * * Author: Benjamin T James */ -#include "Runner.h" +#include "CRunner.h" int main(int argc, char **argv) { Runner runner(argc, argv); diff --git a/src/cluster/src/Loader.cpp b/src/cluster/src/Loader.cpp deleted file mode 100644 index 73691b6..0000000 --- a/src/cluster/src/Loader.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* -*- C++ -*- - * - * Loader.cpp - * - * Author: Benjamin T James - * - * Class which can 'preload' chunks of sequences from a file list, - * and then count the k-mers separately, which can be done in - * multiple threads - */ -#include "Loader.h" -#include "ClusterFactory.h" -#include "DivergencePoint.h" -#include - -template -bool Loader::done() const -{ - return file_idx == files.size(); -} - -template -void Loader::preload(int tid) -{ - if (file_idx == files.size()) { - return; - } - for (uint64_t j = 0; j < chunk_size; j++) { 
- auto chrom = next(); - if (chrom.first == "") { - return; - } - cache_list.at(tid).emplace_back(chrom.first, chrom.second); - } -} - - -template -Point* Loader::get_point(std::string header, const std::string &base, uintmax_t& id, int k) -{ - KmerHashTable table(k, 1); - KmerHashTable table_k1(1, 0); - std::vector values; - vector values_k1; - values.clear(); - ChromosomeOneDigit chrom; - chrom.setHeader(header); - chrom.appendToSequence(base); - chrom.finalize(); - fill_table(table, &chrom, values); - fill_table(table_k1, &chrom, values_k1); -// int tmplate = get_template(chrom->getHeader(), templates); - Point *p = new DivergencePoint(values, chrom.size()); -// cout << "mag: " << ((DivergencePoint*)p)->getPseudoMagnitude() << std::endl; - p->set_1mers(values_k1); - p->set_header(header); - p->set_length(chrom.getBase()->length()); - p->set_data_str(*chrom.getBase()); - DivergencePoint* q = dynamic_cast*>(p); - const auto N = q->points.size(); - double aq = (double) q->getPseudoMagnitude() / N; - double sq = 0; - for (auto i = 0; i < N; i++) { - double qdiff = q->points[i] - aq; - sq += qdiff * qdiff; - } - sq = sqrt(sq / N); - q->set_stddev(sq); - p->set_id(id); - #pragma omp atomic - id++; - return p; -} - -template -std::vector*> Loader::load_next(int tid) -{ - std::vector*> points; - for (size_t i = 0; i < cache_list.at(tid).size(); i++) { - auto pr = cache_list.at(tid).at(i); - Point* p = get_point(pr.first, *pr.second, id_list.at(tid), k); - points.push_back(p); - delete pr.second; - } - cache_list.at(tid).clear(); - return points; -} - -template -std::pair Loader::next() -{ - auto n = maker->next(); - if (n.first != "") { - return n; - } - delete maker; - maker = NULL; - file_idx++; - if (file_idx >= files.size()) { - return n; - } - maker = new SingleFileLoader(files.at(file_idx)); - return maker->next(); -} - -template class Loader; -template class Loader; -template class Loader; -template class Loader; -template class Loader; -template class Loader; 
diff --git a/src/cluster/src/LogTable.cpp b/src/cluster/src/LogTable.cpp deleted file mode 100644 index 0a05a9d..0000000 --- a/src/cluster/src/LogTable.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include "LogTable.h" - -#include -#include - -LogTable::LogTable() : coeff(1000000 / 2) -{ - uintmax_t size = 1000000; - double imax = 2; -// map = new double[size]; - double lsize = log(size); - for (uintmax_t i = 0; i < size; i++) { - map[i] = log(imax * (i + 1)) - lsize; - } - std::cout << "dmax: " << coeff << std::endl; -} -LogTable::LogTable(uintmax_t size, double imax) : coeff(size / imax) -{ - //map = new double[size]; - double lsize = log(size); - for (uintmax_t i = 0; i < size; i++) { - map[i] = log(imax * (i + 1)) - lsize; - } - std::cout << "dmax: " << coeff << std::endl; -} - -LogTable::~LogTable() -{ - //delete[] map; -} - -double LogTable::at(double d) const -{ - size_t idx = d * coeff; - return map[idx]; -} -double LogTable::operator[](double d) const -{ - size_t index = d * coeff; - return map[index]; -} diff --git a/src/cluster/src/LogTable.h b/src/cluster/src/LogTable.h deleted file mode 100644 index 6fab42e..0000000 --- a/src/cluster/src/LogTable.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef LOGTABLE_H -#define LOGTABLE_H - -#include -#include - -#define TBLSIZE 1000000 -class LogTable { -public: - LogTable(); - LogTable(uintmax_t _size, double imax=2); - ~LogTable(); - double at(double d) const; - double operator[](double d) const; -private: - double map[TBLSIZE]; - - const double coeff; -}; -#endif diff --git a/src/cluster/src/Mat.h b/src/cluster/src/Mat.h deleted file mode 100644 index eb711ed..0000000 --- a/src/cluster/src/Mat.h +++ /dev/null @@ -1,73 +0,0 @@ -/* -*- C++ -*- - * - * Mat.h - * - * Author: Benjamin T James - */ -#ifndef MAT_H -#define MAT_H -#include -#include -using namespace std; -template -class Mat { -public: - Mat(function func, const long size) : n(size), table_size(size*(size+1)/2), compute(func) { - if (size <= 0) { - throw "Invalid 
size"; - } - table = new T[table_size]; - set = new bool[table_size](); - }; - ~Mat() { - delete[] table; - delete[] set; - }; - void fill() { - unsigned long long count = 0; - #ifdef OPENMP - #pragma omp parallel for collapse(2) shared(set) - #endif - for (long i = 0; i < n; i++) { - for (long j = 0; j < n; j++) { - const auto idx = addr(i, j); - if (!set[idx]) { - auto res = compute(i, j); - table[idx] = res; - set[idx] = true; - count++; - } - if (count % 10000 == 0) { - cout << count << " / " << table_size << endl; - } - } - } - - }; - T& operator[](pair index) { - const unsigned long idx = addr(index.first, index.second); - if (!set[idx]) { - table[idx] = compute(index.first, index.second); - set[idx] = true; - } - return table[idx]; - }; - bool exists(int i, int j) const { - return set[addr(i, j)]; - } -private: - T* table; - bool* set; - const unsigned long table_size; - const unsigned long n; - function compute; - - unsigned long addr(unsigned long i, unsigned long j) const { - if (i <= j) { - return i * n - (i - 1) * i / 2 + j - i; - } else { - return j * n - (j - 1) * j / 2 + i - j; - } - }; -}; -#endif diff --git a/src/cluster/src/NearestNeighbor.h b/src/cluster/src/NearestNeighbor.h deleted file mode 100644 index a59b87b..0000000 --- a/src/cluster/src/NearestNeighbor.h +++ /dev/null @@ -1,52 +0,0 @@ -/* -*- C++ -*- - * - * NearestNeighbor.h - * - * Author: Benjamin T James - */ -#ifndef NEARESTNEIGHBOR_H -#define NEARESTNEIGHBOR_H -// #include -// #include "Point.h" -// template -// class NearestNeighbor { -// public: -// NearestNeighbor(const vector*> &pts) : points(pts) { -// const int dim = pts[0]->get_data().size(); -// const int maxPts = pts.size(); -// dataPts = annAllocPts(maxPts, dim); -// queryPt = annAllocPt(dim); -// for (int nPts = 0; nPts < maxPts; nPts++) { -// auto vec = pts[nPts]->get_data(); -// for (int i = 0; i < vec.size(); i++) { -// dataPts[nPts][i] = vec[i]; -// } -// } -// kd_tree = new ANNkd_tree(dataPts, maxPts, dim); -// nnIdx 
= new ANNidx[1]; -// dists = new ANNdist[1]; -// }; -// ~NearestNeighbor() { -// delete[] nnIdx; -// delete[] dists; -// delete kd_tree; -// annClose(); -// }; -// void find_nearest_neighbor(Point ¢er) const { -// auto vec = center.get_data(); -// for (int i = 0; i < vec.size(); i++) { -// queryPt[i] = vec[i]; -// } -// kd_tree->annkSearch(queryPt, 1, nnIdx, dists); -// ANNidx idx = nnIdx[0]; -// center.set(*points[idx]); -// }; -// private: -// ANNkd_tree *kd_tree = NULL; -// ANNpointArray dataPts; -// ANNpoint queryPt; -// ANNidxArray nnIdx; -// ANNdistArray dists; -// const vector*> &points; -// }; -#endif diff --git a/src/cluster/src/Predictor.cpp b/src/cluster/src/Predictor.cpp deleted file mode 100644 index 3f19ba9..0000000 --- a/src/cluster/src/Predictor.cpp +++ /dev/null @@ -1,841 +0,0 @@ -/* -*- C++ -*- - * - * Predictor.cpp - * - * Author: Benjamin T James - * - * Predictor implementation class - * train(vector<>...) is entry point, generates "semi-synthetic" sequences - * train() actually trains applicable GLM's. 
- * close() and similarity() are callable once trained - */ -#include "Predictor.h" -#include "Loader.h" -#include "Matrix.h" -#include "ClusterFactory.h" -#include "HandleSeq.h" -#include "Progress.h" -#include "Random.h" -#include - -template -void Predictor::save(std::string file) -{ - std::ofstream out(file); - out << "k: " << k << endl; - out << "mode: " << (unsigned int)mode << endl; - out << "max_features: " << max_num_feat << endl; - out << "ID: " << id << endl; - if (mode & PRED_MODE_CLASS) { - write_to(out, feat_c, c_glm); - } - if (mode & PRED_MODE_REGR) { - write_to(out, feat_r, r_glm); - } -} - -template -Predictor::Predictor(const std::string filename) -{ - std::ifstream in(filename); - std::string buf; - unsigned mode_ = 0; - in >> buf >> k; - cout << buf << k << endl; - in >> buf >> mode_; - mode = mode_; - cout << buf << mode << endl; - in >> buf >> max_num_feat; - cout << buf << max_num_feat << endl; - in >> buf >> id; - cout << buf << id << endl; - is_trained = true; - is_training = false; - if (mode & PRED_MODE_CLASS) { - auto pr = read_from(in, k); - c_glm = pr.first; - feat_c = pr.second; - } - if (mode & PRED_MODE_REGR) { - auto pr = read_from(in, k); - r_glm = pr.first; - feat_r = pr.second; - } -} - -template -void Predictor::write_to(std::ofstream &out, Feature* feat, matrix::GLM glm) -{ - auto combos = feat->get_combos(); - auto lookup = feat->get_lookup(); - auto mins = feat->get_mins(); - auto maxs = feat->get_maxs(); - out << std::endl << "n_combos: " << combos.size() << std::endl; - out << glm.get_weights().get(0, 0) << endl; - for (int j = 0; j < combos.size(); j++) { - auto cmb = combos[j]; - unsigned int val = 0; - uint64_t flags = 0; - for (auto i : cmb.second) { - flags |= lookup[i]; - } - switch (cmb.first) { - case Combo::xy: - val = 0; - break; - case Combo::xy2: - val = 1; - break; - case Combo::x2y: - val = 2; - break; - case Combo::x2y2: - val = 3; - break; - } - out << val << " "; - out << flags << " "; - out << 
glm.get_weights().get(j+1, 0) << std::endl; - } - out << std::endl << "n_singles: " << lookup.size() << std::endl; - for (int j = 0; j < lookup.size(); j++) { - out << lookup[j] << " "; - out << mins[j] << " "; - out << maxs[j] << std::endl; - } -} - - -template -pair*> Predictor::read_from(std::ifstream& in, int k_) -{ - matrix::GLM glm; - int c_num_raw_feat, c_num_combos; - Feature *feat = new Feature(k_); - std::string buf; - in >> buf >> c_num_combos; - cout << buf << "\"" << c_num_combos << "\"" << endl; - matrix::Matrix weights(c_num_combos+1, 1); - double d_; - in >> d_; - weights.set(0, 0, d_); - for (int i = 0; i < c_num_combos; i++) { - int cmb; - in >> cmb; - cout << (int)cmb << endl; - uint64_t flags; - in >> flags; - cout << flags << endl; - double d; - in >> d; - cout << "[" << 0 << "," << i << "] " << d << endl; - weights.set(i+1, 0, d);//push_back(d); - Combo cmb_ = Combo::xy; - switch (cmb) { - case 0: - cmb_ = Combo::xy; - break; - case 1: - cmb_ = Combo::xy2; - break; - case 2: - cmb_ = Combo::x2y; - break; - case 3: - cmb_ = Combo::x2y2; - break; - default: - cerr << "error reading weights file" << endl; - break; - } - feat->add_feature(flags, cmb_); - } - - in >> buf >> c_num_raw_feat; - cout << buf << "\"" << c_num_raw_feat << "\"" << endl; - for (int i = 0; i < c_num_raw_feat; i++) { - uint64_t single_flag; - double min_, max_; - in >> single_flag; - cout << single_flag << endl; - in >> min_; - cout << min_ << endl; - in >> max_; - cout << max_ << endl; - feat->set_normal(single_flag, min_, max_); - } - feat->finalize(); - glm.load(weights); - return {glm, feat}; -} - -template -void Predictor::add_feats(std::vector >& vec, uint64_t feat_flags) -{ - for (uint64_t i = 1; i <= feat_flags; i *= 2) { - if ((i & feat_flags) == 0) { - continue; - } - for (uint64_t j = 1; j <= i; j *= 2) { - if ((j & feat_flags) == 0) { - continue; - } - vec.emplace_back(i | j, Combo::xy); - vec.emplace_back(i | j, Combo::x2y2); - if (i != j) { - vec.emplace_back(i 
| j, Combo::x2y); - vec.emplace_back(i | j, Combo::xy2); - } - } - } -} -template -void Predictor::check() -{ - // if (!is_trained && training.size() >= threshold && !is_training) { - // omp_set_lock(&lock); - // is_training = true; - // train(); - // is_training = false; - // omp_unset_lock(&lock); - // } -} -template -double Predictor::similarity(Point* a, Point* b) -{ - if (!is_trained) { -// double d = Selector::align(a, b); - cerr << "alignment: we don't do that here" << endl; - throw "Bad"; - // return d; - // if (!is_training) { - // omp_set_lock(&lock); - // if (training.size() < testing.size() && training.size() < threshold) { - // training.push_back(pra(a, b, d)); - // } else if (training.size() >= testing.size() && testing.size() < threshold) { - // testing.push_back(pra(a, b, d)); - // } - // omp_unset_lock(&lock); - // } - return 0; - - } else { - return predict(a, b); - } -} - -template -bool Predictor::close(Point *a, Point *b) -{ - if (!is_trained) { -// double d = Selector::align(a, b); - cerr << "alignment shouldn't be used here" << endl; - throw "bad"; - // if (!is_training) { - // omp_set_lock(&lock); - // if (training.size() < testing.size() && training.size() < threshold) { - // training.push_back(pra(a, b, d)); - // } else if (training.size() >= testing.size() && testing.size() < threshold) { - // testing.push_back(pra(a, b, d)); - // } - // omp_unset_lock(&lock); - // } -// return d > id; - return false; - } - bool val = p_close(a, b); - if ((mode & PRED_MODE_REGR) && val) { - // val = p_predict(a, b) > id; - // if (!val) { - // cout << "FIXED" << endl; - // } - } - return val; -} - -template -double Predictor::p_predict(Point* a, Point* b) -{ - auto cache = feat_r->compute(*a, *b); - auto weights = r_glm.get_weights(); - double sum = weights.get(0, 0); - for (int col = 0; col < feat_r->size(); col++) { - double val = (*feat_r)(col, cache); - sum += weights.get(col+1, 0) * val; - } - if (sum < 0) { - sum = 0; - } else if (sum > 1) { - sum = 
1; - } - return sum; -} -template -double Predictor::predict(Point* a, Point* b) -{ - if ((mode & PRED_MODE_CLASS) && !p_close(a, b)) { - return 0; - } - return p_predict(a, b); -} - -template -bool Predictor::p_close(Point* a, Point* b) -{ - auto weights = c_glm.get_weights(); - double sum = weights.get(0, 0); - auto cache = feat_c->compute(*a, *b); - for (int col = 1; col < weights.getNumRow(); col++) { - double d = (*feat_c)(col-1, cache); - sum += weights.get(col, 0) * d; - } - return sum > 0.0; -} - - -template -std::pair generate_feat_mat(const vector > &data, Feature& feat, double cutoff) -{ - bool classify = (cutoff >= 0); - int nrows = data.size(); - int ncols = feat.size()+1; - matrix::Matrix feat_mat(nrows, ncols); - matrix::Matrix labels(nrows, 1); - #pragma omp parallel for - for (int row = 0; row < data.size(); row++) { - auto kv = data.at(row); - vector cache; - // #pragma omp critical - // { - cache = feat.compute(*kv.first, *kv.second); - // } - feat_mat.set(row, 0, 1); - if (classify) { - labels.set(row, 0, kv.val >= cutoff ? 
1 : -1); - } else { - labels.set(row, 0, kv.val); - } - for (int col = 1; col < ncols; col++) { - double val = feat(col-1, cache); - feat_mat.set(row, col, val); - } - } - return std::make_pair(feat_mat, labels); -} - -template -void Predictor::train(const vector *> &points, const vector* > &queries, uintmax_t &_id, size_t num_sample) -{ - if (is_trained) { return; } - - num_sample = min(num_sample, points.size()); - - vector*> f_points_tr, f_points_test; - size_t total_size = points.size();// + queries.size(); - for (int i = 0; i < num_sample; i++) { - int i1 = floor((double)i * total_size / (2 * num_sample)); - int i2 = floor((i + 1) * (double)total_size / (2 * num_sample)); - f_points_tr.push_back(points.at(i1)); - f_points_test.push_back(points.at(i2)); - } - // size_t q_sample = min(num_sample / 10, queries.size()); - // while (10 * f_points_tr.size() <= 11 * num_sample) { - // for (int i = 0; i < q_sample; i++) { - // int i1 = floor((double)i * queries.size() / (2 * q_sample)); - // int i2 = floor((i + 1) * (double)queries.size() / (2 * q_sample)); - // f_points_tr.push_back(queries.at(i1)); - // f_points_test.push_back(queries.at(i2)); - // } - // } - training.clear(); - testing.clear(); - if (mode & PRED_MODE_CLASS) { - - std::vector > pos_buf, neg_buf; - cout << "mutating sequences" << endl; - size_t counter = 0; - // struct timespec start, stop; - // clock_gettime(CLOCK_MONOTONIC, &start); - Progress prog1(f_points_tr.size(), "Generating training"); - #pragma omp parallel for - for (size_t i = 0; i < f_points_tr.size(); i++) { - auto p = f_points_tr[i]; - mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); - mutate_seqs(p, 5, pos_buf, neg_buf, min_id, 100 * id, _id); - #pragma omp critical - prog1++; - } - prog1.end(); - // clock_gettime(CLOCK_MONOTONIC, &stop); - // printf("took %lu\n", stop.tv_sec - start.tv_sec); - - counter = 0; - size_t buf_size = std::min(pos_buf.size(), neg_buf.size()); - cout << "training +: " << pos_buf.size() << endl; - 
cout << "training -: " << neg_buf.size() << endl; - std::random_shuffle(pos_buf.begin(), pos_buf.end()); - std::random_shuffle(neg_buf.begin(), neg_buf.end()); - for (size_t i = 0; i < buf_size; i++) { - training.push_back(pos_buf[i].deep_clone()); - training.push_back(neg_buf[i].deep_clone()); - } - for (auto p : pos_buf) { - delete p.first; - delete p.second; - } - for (auto p : neg_buf) { - delete p.first; - delete p.second; - } - pos_buf.clear(); - neg_buf.clear(); - Progress prog2(f_points_test.size(), "Generating testing"); - #pragma omp parallel for - for (size_t i = 0; i < f_points_test.size(); i++) { - auto p = f_points_test[i]; - mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); - mutate_seqs(p, 5, pos_buf, neg_buf, min_id, 100 * id, _id); -#pragma omp critical - prog2++; - } - prog2.end(); - buf_size = std::min(pos_buf.size(), neg_buf.size()); - cout << "testing +: " << pos_buf.size() << endl; - cout << "testing -: " << neg_buf.size() << endl; - std::random_shuffle(pos_buf.begin(), pos_buf.end()); - std::random_shuffle(neg_buf.begin(), neg_buf.end()); - for (size_t i = 0; i < buf_size; i++) { - testing.push_back(pos_buf[i].deep_clone()); - testing.push_back(neg_buf[i].deep_clone()); - } - for (auto p : pos_buf) { - delete p.first; - delete p.second; - } - for (auto p : neg_buf) { - delete p.first; - delete p.second; - } - } else { - for (auto p : f_points_tr) { - mutate_seqs(p, 10, training, training, min_id, 100, _id); - } - for (auto p : f_points_test) { - mutate_seqs(p, 10, testing, testing, min_id, 100, _id); - } - } - - - train(); -} -template -std::pair regression_train(const vector > &data, Feature& feat) -{ - auto pr = generate_feat_mat(data, feat, -1); - matrix::GLM glm; - glm.train(pr.first, pr.second); - auto result1 = pr.first * glm.get_weights(); - auto diff1 = result1 - pr.second; - double sum = 0; - for (int i = 0; i < diff1.getNumRow(); i++) { - sum += fabs(diff1.get(i, 0)); - } - sum /= diff1.getNumRow(); - return {sum, glm}; -} - 
-template -std::pair class_train(vector > &data, Feature& feat, double cutoff) -{ - // vector > above, below; - - // for (auto d : data) { - // if (d.val > cutoff) { - // above.push_back(d); - // } else { - // below.push_back(d); - // } - // } - // size_t sz = std::min(above.size(), below.size()); - // data.clear(); - // for (size_t i = 0; i < sz; i++) { - // data.push_back(above[i]); - // data.push_back(below[i]); - // } - auto pr = generate_feat_mat(data, feat, cutoff); - matrix::GLM glm; - glm.train(pr.first, pr.second); - matrix::Matrix p = glm.predict(pr.first); - for (int row = 0; row < p.getNumRow(); row++) { - if (p.get(row, 0) == 0) { - p.set(row, 0, -1); - } - } - double acc = get<0>(glm.accuracy(pr.second, p)); - return {acc, glm}; -} - -template -double regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm, std::string prefix="") -{ - auto pr = generate_feat_mat(data, feat, -1); - auto result1 = pr.first * glm.get_weights(); - auto diff1 = result1 - pr.second; - double sum = 0; - for (int i = 0; i < diff1.getNumRow(); i++) { - sum += fabs(diff1.get(i, 0)); - } - if (prefix != "") { - for (int row = 0; row < result1.getNumRow(); row++) { - cout << prefix << ";" << data[row].first->get_header() << ";" << data[row].second->get_header() << ";" << result1.get(row, 0) << ";" << pr.second.get(row, 0) << ";" << diff1.get(row, 0) << endl; - } - } - sum /= diff1.getNumRow(); - return sum; -} - -template -void print_wrong(matrix::Matrix oLabels, matrix::Matrix pLabels) -{ - for(int i = 0; i < oLabels.getNumRow(); i++){ - if(oLabels.get(i,0) == pLabels.get(i, 0)){ - cout << ""; - } - } -} - -template -tuple class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff, std::string prefix="") -{ - auto pr = generate_feat_mat(data, feat, cutoff); - matrix::Matrix p = glm.predict(pr.first); - for (int row = 0; row < p.getNumRow(); row++) { - if (p.get(row, 0) == 0) { - p.set(row, 0, -1); - } - if (prefix != "") { - cout << 
prefix << ";" << data[row].first->get_header() << ";" << data[row].second->get_header() << ";" << data[row].val << ";" << p.get(row, 0) << ";" << pr.second.get(row, 0) << endl; - } - } -// print_wrong(pr.second, p); - return glm.accuracy(pr.second, p); -} - -template -void Predictor::filter(std::vector > &vec, std::string prefix) -{ - std::vector > > bins; - std::vector limits; - size_t num_bins = 10; - size_t smallest_bin_size = vec.size(); - for (size_t i = 0; i < num_bins; i++) { - limits.push_back(id + i * (1 - id) / num_bins); - bins.push_back(std::vector >()); - } - limits.push_back(1); - for (auto p : vec) { - for (size_t i = 1; i < limits.size(); i++) { - if (p.val <= limits[i] && p.val > limits[i-1]) { - bins[i-1].push_back(p); - if (prefix != "") { - cout << prefix << " bin " << i - 1 << " " << p.val << endl; - } - break; - } - } - } - size_t bin_size = 0; - for (auto &v : bins) { - bin_size += v.size(); - // smallest_bin_size = std::min(smallest_bin_size, v.size()); - std::random_shuffle(v.begin(), v.end()); - } - smallest_bin_size = bin_size / bins.size(); - vec.clear(); - - for (auto &v : bins) { - for (size_t i = 0; i < std::min(v.size(), smallest_bin_size); i++) { - vec.push_back(v[i]); - } - } - cout << "new vector size: " << vec.size() << " divided into " << bins.size() << " equal parts" << endl; -} - -double rand_between(double mute, double rng, double low, double high) -{ - Random r; - double r_d = r.random(); - - double mn = std::max(mute - rng, low); - double mx = std::min(mute + rng, high); - return r_d * (mx - mn) + mn; -} - -template -void Predictor::mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id) -{ - HandleSeq h(mut_type); - ClusterFactory factory(k); - double inc = (id_end - id_begin) / num_seq; - std::string bin_seq = p->get_data_str(); - std::string seq; - for (auto c : bin_seq) { - switch (c) { - case 0: - seq += 'A'; - break; - case 1: - seq += 'C'; - break; - 
case 2: - seq += 'G'; - break; - case 3: - seq += 'T'; - break; - case 'N': - seq += 'C'; - break; - default: - cout << "Invalid character " << c << endl; - cout << "from sequence " << bin_seq << endl; - throw 3; - } - } - for (size_t i = 0; i < num_seq; i++) { - double iter_id = id_begin + inc * (i + 0.5); - double actual_id = rand_between(iter_id, inc, id_begin, id_end); - int mut = round(100 - actual_id); - auto newseq = h.mutate(seq, mut); - std::string chrom; - std::string header = p->get_header(); - Point* new_pt = Loader::get_point(header, newseq.second, _id, k); - pra pr; - pr.first = p->clone(); - pr.second = new_pt; - pr.val = newseq.first; -#pragma omp critical - { - if (pr.val > id) { - pos_buf.push_back(pr); - } else { - neg_buf.push_back(pr); - } - } - } -} -template -void Predictor::train() -{ - Feature feat(k); - feat.set_save(true); - - uint64_t max_feat = 0; - for (uint64_t i = 0; i < possible_feats.size(); i++) { - if (possible_feats.at(i).first > max_feat) { - max_feat |= possible_feats.at(i).first; - } - } - for (uint64_t i = 1; i <= max_feat; i *= 2) { - if (i & max_feat) { - feat.add_feature(i, Combo::xy); - } - } - feat.normalize(training); - feat.normalize(testing); - feat.finalize(); - - - - // cout << "Class Training:" << endl; - // for (auto p : training) { - // cout << p.val << " "; - // } - // cout << "Class Testing:" << endl; - // for (auto p : testing) { - // cout << p.val << " "; - // } - if (mode & PRED_MODE_CLASS) { - train_class(&feat); - if (mode & PRED_MODE_REGR) { - // vector*> f_points_tr, f_points_test; - // for (int i = 0; i < 10; i++) { - // f_points_tr.push_back(training[rand()%training.size()].first); - // f_points_test.push_back(training[rand()%training.size()].first); - // } - // training.clear(); - // testing.clear(); - // for (auto p : f_points_tr) { - // mutate_seqs(p, 50, training, 100 * id, 100); - // mutate_seqs(p, 50, training, 60, 100 * id); - // } - // for (auto p : f_points_test) { - // mutate_seqs(p, 50, 
testing, 100 * id, 100); - // mutate_seqs(p, 50, testing, 60, 100 * id); - // } - // filter(); - auto func = [&](pra pr) { - return pr.val <= id; - }; - training.erase(std::remove_if(training.begin(), training.end(), func), training.end()); - testing.erase(std::remove_if(testing.begin(), testing.end(), func), testing.end()); - filter(training);//, "training"); - filter(testing);//, "testing"); - - } - } - if (mode & PRED_MODE_REGR) { - train_regr(&feat); - } - cout << "Training size: " << training.size() << endl; - cout << "Testing size: " << testing.size() << endl; - // for (auto p : training) { - // cout << p.val << " "; - // } - cout << endl; - feat.set_save(false); - training.clear(); - testing.clear(); - possible_feats.clear(); - is_trained = true; -} - -template -void Predictor::train_class(Feature* feat) -{ - auto c_size = feat->get_combos().size(); - for (int i = 0; i < c_size; i++) { - feat->remove_feature(); - } - vector used_list; - double abs_best_acc = 0; -// cout << "possible feats at one step: " << possible_feats.size() << endl; - Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); - - std::ostringstream oss; - for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { - double best_class_acc = abs_best_acc; - uintmax_t best_idx = -1, cur_idx = 1; - auto best_class_feat = possible_feats.front(); - for (uint64_t i = 0; i < possible_feats.size(); i++) { - if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { - continue; - } - auto rfeat = possible_feats[i]; - feat->add_feature(rfeat.first, rfeat.second); - feat->normalize(training); - feat->finalize(); - auto name = feat->feat_names().back(); - auto pr = class_train(training, *feat, id); - auto class_ac = class_test(testing, *feat, pr.second, id); - feat->remove_feature(); - prog++; -// cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " acc: " << 
get<0>(class_ac) << " sens: " << get<1>(class_ac) << " spec: " << get<2>(class_ac) << endl; - if (get<0>(class_ac) > best_class_acc) { - best_class_acc = get<0>(class_ac); - best_class_feat = rfeat; - best_idx = i; - } - } - if (best_class_acc > abs_best_acc || num_feat <= min_num_feat) { - feat->add_feature(best_class_feat.first, best_class_feat.second); - feat->normalize(training); - feat->finalize(); - abs_best_acc = best_class_acc; - used_list.push_back(best_idx); - oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl; - oss << "Accuracy: " << best_class_acc << endl; - possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end()); - } - } - prog.end(); - cout << oss.str(); - feat_c = new Feature(*feat); - feat_c->set_save(false); - auto pr = class_train(training, *feat_c, id); - cout << "Training ACC: " << pr.first << endl; - c_glm = pr.second; - auto train_results = class_test(training, *feat_c, c_glm, id);//, "train"); - cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl; - auto test_results = class_test(testing, *feat_c, c_glm, id);//, "test"); - double class_acc = get<0>(test_results); - cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl; - - cout << "Features: "<< endl; - for (auto line : feat_c->feat_names()) { - cout << "\t" << line << endl; - } -} -template -void Predictor::train_regr(Feature* feat) -{ - auto c_size = feat->get_combos().size(); - for (int i = 0; i < c_size; i++) { - feat->remove_feature(); - } - vector used_list; - double abs_best_regr = 1000000; - for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { - double best_regr_err = abs_best_regr; - uintmax_t best_idx = -1, cur_idx = 1; - auto best_regr_feat = possible_feats.front(); - for (uint64_t i = 0; i < possible_feats.size(); i++) { - if 
(std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { - continue; - } - auto rfeat = possible_feats[i]; - feat->add_feature(rfeat.first, rfeat.second); - feat->normalize(training); - feat->finalize(); - auto pr = regression_train(training, *feat); - auto name = feat->feat_names().back(); - double regr_mse = regression_test(testing, *feat, pr.second); - feat->remove_feature(); - - cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " err: " << regr_mse << endl; - if (regr_mse < best_regr_err) { - best_regr_err = regr_mse; - best_regr_feat = rfeat; - best_idx = i; - } - } - if (best_regr_err < abs_best_regr) { - feat->add_feature(best_regr_feat.first, best_regr_feat.second); - feat->normalize(training); - feat->finalize(); - abs_best_regr = best_regr_err; - used_list.push_back(best_idx); - //possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_regr_feat), possible_feats.end()); - } - } - feat_r = new Feature(*feat); - feat_r->set_save(false); - auto pr = regression_train(training, *feat_r); - r_glm = pr.second; - double tr_regr_mse = regression_test(testing, *feat_r, r_glm); // "training" - cout << "Training Mean Error: " << pr.first << endl; - double regr_mse = regression_test(testing, *feat_r, r_glm);//, "testing"); - cout << "Testing Mean Error: " << regr_mse << endl; - cout << "Features: "<< endl; - for (auto line : feat_r->feat_names()) { - cout << "\t" << line << endl; - } - // auto w = r_glm.get_weights(); - // for (int r = 0; r < w.getNumRow(); r++) { - // for (int c = 0; c < w.getNumCol(); c++) { - // cout << w.get(r, c) << " "; - // } - // cout << endl; - // } - // for (auto combo : feat.get_combos()) { - // cout << combo.first << " " << - // } - -} - -template class Predictor; -template class Predictor; -template class Predictor; -template class Predictor; -template class Predictor; -template class Predictor; diff 
--git a/src/cluster/src/Progress.cpp b/src/cluster/src/Progress.cpp deleted file mode 100644 index e16ef06..0000000 --- a/src/cluster/src/Progress.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "Progress.h" -#include -#include - -Progress::Progress(long num, std::string prefix_) -{ - pmax = num; - ended = 0; - pcur = 0; - prefix = prefix_; - last = ""; - barWidth = 70 - (prefix.size()+1); - print(); -} - -void Progress::print() -{ - std::ostringstream oss; - double prog = (double)pcur / pmax; - oss << prefix << " ["; - int pos = barWidth * prog; - for (int i = 0; i < barWidth; i++) { - if (i < pos) { - oss << "="; - } else if (i == pos) { - oss << ">"; - } else { - oss << " "; - } - } - oss << "] " << int(prog * 100.0) << " %\r"; - if (oss.str() != last) { - last = oss.str(); - std::cout << last; - std::cout.flush(); - } -} - -void Progress::end() -{ - if (!ended) { - pcur = pmax; - print(); - std::cout << std::endl; - } - ended = true; -} - -void Progress::operator++() -{ - pcur++; - print(); -} -void Progress::operator++(int) -{ - print(); - pcur++; -} - - -void Progress::operator+=(size_t num) -{ - pcur += num; - print(); -} diff --git a/src/cluster/src/Random.h b/src/cluster/src/Random.h deleted file mode 100644 index 3131b34..0000000 --- a/src/cluster/src/Random.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef RANDOM_H // -*- C++ -*- -#define RANDOM_H -#include - -class Random { - std::mt19937 rng; -public: - Random() : rng(std::random_device()()) {} - - template - T randMod(T max) { - std::uniform_int_distribution distribution(0, max-1); - return distribution(rng); - } - - double random() { - std::uniform_real_distribution distribution(0.0, 1.0); - return distribution(rng); - } -}; - -#endif diff --git a/src/cluster/src/SingMute.cpp b/src/cluster/src/SingMute.cpp deleted file mode 100644 index 45f1610..0000000 --- a/src/cluster/src/SingMute.cpp +++ /dev/null @@ -1,116 +0,0 @@ -#include "SingMute.h" -#include -#include "Random.h" - - - - -void 
generate_unique_set(size_t cmd_size, std::set& ret, int num_elts, const std::set& bad_set_1, const std::set& bad_set_2, const std::vector &valid, Random& rng) -{ - while (ret.size() <= num_elts) { - long idx = rng.randMod(cmd_size); - if (valid[idx] && - ret.find(idx) == ret.end() && - bad_set_1.find(idx) == bad_set_1.end() && - bad_set_2.find(idx) == bad_set_2.end()) { - - ret.insert(idx); - } - } -} -char SingMute::randNucl() -{ - char character; - int value = rng.randMod(percAs + percCs + percGs + percTs); - if (value < percAs) { - character = 'A'; - } else if (value < percAs + percCs) { - character = 'C'; - } else if (value < percAs + percCs + percGs) { - character = 'G'; - } else { - character = 'T'; - } - return character; -} -void SingMute::init(const std::vector &valid) -{ - maxInsert = 0; - maxDel = 0; - maxSwitch = 0; - if (num_mut == 1) { - maxInsert = 1; - maxDel = 0; - maxSwitch = 0; - } else if (num_mut == 0) { - out_seq = *seq; - return; - } else { - maxSwitch = rng.randMod(num_mut); - num_mut -= maxSwitch; - - if (maxSwitch % 2 == 1 && num_mut >= 1) { - maxSwitch++; - num_mut--; - } else if (num_mut == 0) { - maxSwitch--; - num_mut++; - } - if (num_mut > 1) { - maxInsert = rng.randMod(num_mut); - num_mut -= maxInsert; - } else { - maxInsert = num_mut; - num_mut -= maxInsert; - } - maxDel = num_mut; - } - size_t seq_len = seq->length(); - - maxDel *= seq_len / 100.0; - maxInsert *= seq_len / 100.0; - maxSwitch *= seq_len / 100.0; - alignmentLength = maxInsert; - IBP = maxDel + maxSwitch; - - - std::vector command_str(seq_len, 'S'); - - std::set s_ins, s_del, s_switch; - generate_unique_set(command_str.size(), s_ins, maxInsert, s_del, s_switch, valid, rng); - generate_unique_set(command_str.size(), s_del, maxDel, s_ins, s_switch, valid, rng); - generate_unique_set(command_str.size(), s_switch, maxSwitch, s_ins, s_del, valid, rng); - for (auto idx : s_ins) { - command_str[idx] = 'I'; - } - for (auto idx : s_del) { - command_str[idx] = 'D'; - } - for 
(auto idx : s_switch) { - command_str[idx] = 'W'; - } - out_seq = ""; - out_seq.reserve(maxInsert + seq_len - maxDel + 1); - - for (long i = 0; i < seq_len; i++) { - auto cmd = command_str.at(i); - switch (cmd) { - case 'I': { - out_seq += randNucl(); - out_seq += seq->at(i); - break; - } - case 'S': { - out_seq += seq->at(i); - break; - } - case 'D': { - break; - } - case 'W': { - out_seq += randNucl(); - break; - } - } - } -} diff --git a/src/cluster/src/SingleFeature.cpp b/src/cluster/src/SingleFeature.cpp deleted file mode 100644 index bdc441c..0000000 --- a/src/cluster/src/SingleFeature.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include "SingleFeature.h" - -template -void SingleFeature::normalize(const vector*,Point*> > &pairs) -{ - for (auto p : pairs) { - double d; - if (rc.empty()) { - d = raw(p.first, p.second); - } else { - d = rraw(p.first, p.second, rc, rv); - } - if (!min_set || d < min) { - min = d; - min_set = true; - } - if (!max_set || d > max) { - max = d; - max_set = true; - } - } -} - -template -double SingleFeature::operator()(Point *a, Point *b) const -{ - double d; - if (rc.empty()) { - d = raw(a, b); - } else { - d = rraw(a, b, rc, rv); - } -// std::cout << "Raw: " << d << std::endl; - double f = (d - min) / (max - min); -// std::cout << "Normalized: " << f << std::endl; - f = std::min(1.0, std::max(0.0, f)); - if (is_sim) { - return f; - } else { - return 1.0 - f; - } -} - - -template class SingleFeature; -template class SingleFeature; -template class SingleFeature; -template class SingleFeature; -template class SingleFeature; -template class SingleFeature; diff --git a/src/cluster/src/SingleFeature.h b/src/cluster/src/SingleFeature.h deleted file mode 100644 index efa882c..0000000 --- a/src/cluster/src/SingleFeature.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef SINGLEFEATURE_H -#define SINGLEFEATURE_H - -#include "Point.h" -#include - -template -class SingleFeature { -public: - SingleFeature(std::function*, Point*)> f, bool is_sim_=true) - : raw(f), 
is_sim(is_sim_), min_set(false), max_set(false) {} - SingleFeature(std::function*, Point*, const vector&, const vector&)> f, vector rrv, vector rrc, bool is_sim_=true) - : rraw(f), is_sim(is_sim_), min_set(false), max_set(false), rv(rrv), rc(rrc) {} - void normalize(const vector*,Point*> > &pairs); - double operator()(Point*, Point*) const; - double min, max; -private: - std::function*, Point*)> raw; - std::function*, Point*, const vector&, const vector&)> rraw; - vector rv, rc; - const bool is_sim; - bool max_set, min_set; - -}; - -#endif diff --git a/src/cluster/src/SingleMute.cpp b/src/cluster/src/SingleMute.cpp deleted file mode 100644 index 1f435f7..0000000 --- a/src/cluster/src/SingleMute.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/** - * Author: Alex Baumgartner - * The Bioinformatics Toolsmith Laboratory, the University of Tulsa - * 5/15/2018 - * - * Purpose: - * The pupose of this module is to perform single mutations on sequences - */ - -#include "SingleMute.h" -#include - -int intRandMod_(int max) { - static thread_local std::mt19937 generator; - std::uniform_int_distribution distribution(0, max-1); - return distribution(generator); -} - -SingleMute::SingleMute(int a, int c, int g, int t, int alloc) { - percAs = a; - percCs = c; - percGs = g; - percTs = t; - //If allocation is 0, all sub allocations are 0 - if (alloc == 0) { - maxDel = 0; - maxInsert = 0; - maxSwitch = 0; - } - //Arbitrary, if only 1 percent is allocated, then only insert gets an allocation - else if (alloc == 1) { - maxSwitch = 0; - maxDel = 0; - maxInsert = 1; - } - //Otherwise, allocations are assigned randomly - else { - //Max switch gets a random allocation, - //but allocation has to be even - //(don't want to switch something with itself) - maxSwitch = intRandMod_(alloc);//rand() % alloc; - alloc -= maxSwitch; - //If alloc is odd, - //and there is still percent that can be allocated - if (maxSwitch % 2 == 1 && alloc >= 1) { - //Make allocation 1 less, - //and switch allocation one more 
(now even) - maxSwitch++; - alloc--; - } - //Otherwise, make allocation one larger, - //switch allocation one less (even) - else if (alloc == 0) { - maxSwitch--; - alloc++; - } - //If alloc is greater than 1 (must be for % purposes), - //calculate random value for inerst allocation - if (alloc > 1) { - maxInsert = intRandMod_(alloc);//rand() % alloc; - alloc -= maxInsert; - } else { - maxInsert = alloc; - alloc -= maxInsert; - } - //Max delete is assigned whatever is left - maxDel = alloc; - } -} - -int SingleMute::getAlignmentLength(){ - return alignmentLength; -} - -int SingleMute::getIBP(){ - return IBP; -} - -void SingleMute::genSing(string * sequence, vector mutes) { - seq = sequence; - //Assign vector of mutes to inputted vector - validIndexes = new vector(); - validIndexes->reserve(mutes.size()); -// n_valid_indices = mutes.size(); - //Adds all valid indexes to the validIndexes vector - for(int i = 0; i < mutes.size(); i++){ - if(mutes.at(i)){ - validIndexes->push_back(i); - } - } - n_valid_indices = validIndexes->size(); - float tempFloat; - //Calculate number of characters each mutation can mutate - tempFloat = maxDel / 100.0; - maxDel = (int) (tempFloat * seq->length()); - tempFloat = maxInsert / 100.0; - maxInsert = (tempFloat * seq->length()); - tempFloat = maxSwitch / 100.0; - maxSwitch = (tempFloat * seq->length()); - //Calculates Alignment length and identical base pairs - alignmentLength = maxInsert; - IBP = maxDel + maxSwitch; - //Vectors to keep track of where insertions and deletions need to be made - insertions = new vector(); - insertions->reserve(maxInsert); - deletions = new vector(); - deletions->reserve(maxDel); - //Since switch makes 2 invalid, - //switchNucl is run maxSwitch/2 times - for (int i = 0; i < maxSwitch; i++) { - switchNucl(); - } - //Insert maxInsert times - for (int i = 0; i < maxInsert; i++) { - insert(); - } - //Delete maxDel nucleotides - for (int i = 0; i < maxDel; i++) { - deleteNucl(); - } - //perfroms deletions and 
insertions - performInsertAndDelete(); -} - -void SingleMute::insert() { - //Calculate the index to insert at - int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); - insertions->push_back(validIndexes->at(index)); - std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); - n_valid_indices--; - //Remove that as a valid index -// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); -} - -void SingleMute::deleteNucl() { - //Choose a valid index to delete - int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); - deletions->push_back(validIndexes->at(index)); - std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); - n_valid_indices--; - //Remove from the -// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); -} - -void SingleMute::switchNucl() { - //Pick a random valid index - int index = intRandMod_(n_valid_indices);//rand() % validIndexes->size(); - char character = seq->at(validIndexes->at(index)); - int value; - //Keep generating characters until one different than the one we are trying to switch is found - while(character == seq->at(validIndexes->at(index))){ - value = intRandMod_(percAs + percCs + percGs + percTs); - if (value < percAs) { - character = 'A'; - } else if (value < percAs + percCs) { - character = 'C'; - } else if (value < percAs + percCs + percGs) { - character = 'G'; - } else { - character = 'T'; - } - } - //Switch that character - seq->at(validIndexes->at(index)) = character; - std::swap(validIndexes->at(index), validIndexes->at(n_valid_indices-1)); - n_valid_indices--; - //Remove the chosen index as a valid index -// validIndexes->erase(validIndexes->begin() + index, validIndexes->begin() + index + 1); -} - -void SingleMute::performInsertAndDelete(){ - //sorts the vectors based - std::sort(insertions->begin(), insertions->end()); - std::sort(deletions->begin(), deletions->end()); - //Goes through both 
vectors untill all have been processed - for(int i = insertions->size() - 1, j = deletions->size() - 1; i >= 0 && j >= 0;){ - //If i is -1, all insertions have been processed - if(i == -1){ - removeNucl(deletions->at(j)); - j--; - } - //If i is -1, all deletions have been processed - else if(j == -1){ - insertNucl(insertions->at(i)); - i--; - } - else{ - //If the index of the current next insertion is higher than the next deletion, insert, else delete - if(insertions->at(i) > deletions->at(j)){ - insertNucl(insertions->at(i)); - i--; - } - else{ - removeNucl(deletions->at(j)); - j--; - } - } - } -} - -void SingleMute::removeNucl(int index){ - seq->erase(index, 1); -} - -void SingleMute::insertNucl(int index){ - string character; - //Use a weighted die to - //calculate which character to insert - int value = intRandMod_(percAs + percCs + percGs + percTs); - if (value < percAs) { - character = "A"; - } else if (value < percAs + percCs) { - character = "C"; - } else if (value < percAs + percCs + percGs) { - character = "G"; - } else { - character = "T"; - } - //insert at that index - seq->insert(index, character); -} diff --git a/src/cluster/src/SingleMute.h b/src/cluster/src/SingleMute.h deleted file mode 100644 index b0bf93d..0000000 --- a/src/cluster/src/SingleMute.h +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Author: Alex Baumgartner - * The Bioinformatics Toolsmith Laboratory, the University of Tulsa - * 5/15/2018 - * - * Purpose: - * The pupose of this module is to perform single mutations on sequences - */ - -#ifndef SINGLEMUTE_H -#define SINGLEMUTE_H - -#include -#include -#include -#include - -using namespace std; - -class SingleMute { -public: - /* - Constructor, creates values - and assignes allocations based on inputted data - - @param: - int: percentage of A's - int: percentage of C's - int: percentage of G's - int: percentage of T's - int: The total allocation for non-single mutations - */ - SingleMute(int, int, int, int, int); - /* - Takes a string and 
mutates it based - on the allocation given in the constructor - - @param: - std::string *: pointer to the sequence to be mutated - std::vector : boolean vector of valid and invalid indexes - */ - void genSing(std::string *, std::vector); - - int getAlignmentLength(); - - int getIBP(); - - ~SingleMute(){delete validIndexes; delete insertions; delete deletions;}; - - private: - int percAs; - int percCs; - int percGs; - int percTs; - int maxDel; - int maxInsert; - int maxSwitch; - int alignmentLength; - int IBP; - std::vector * validIndexes; - size_t n_valid_indices = 0; - std::vector * deletions; - std::vector * insertions; - std::string * seq; - /* - Inserts a sequence randomly in the list - at a valid index - */ - void insert(); - /* - Deletes a random nucleotide - that has not been previously mutated - */ - void deleteNucl(); - /* - Switches two random nucleotides - that have not been mutated previously - */ - void switchNucl(); - /* - Performs necessary insertions and deletions in the string based on the insertion and deletion vectors - */ - void performInsertAndDelete(); - - void removeNucl(int); - - void insertNucl(int); -}; -#endif diff --git a/src/cluster/src/Trainer.cpp b/src/cluster/src/Trainer.cpp deleted file mode 100644 index 432d624..0000000 --- a/src/cluster/src/Trainer.cpp +++ /dev/null @@ -1,930 +0,0 @@ -#include "Trainer.h" -#include "HandleSeq.h" -#include "Loader.h" -#include "ClusterFactory.h" -#include -#include -#include -#include -#include "../../utility/GlobAlignE.h" -#include "../../utility/AffineId.h" -#include "needleman_wunsch.h" -#include "Predictor.h" -#include "GLM.h" -#include "Feature.h" -#include "Progress.h" -#include - -template -double Trainer::align(Point *a, Point* b) const -{ - auto sa = a->get_data_str(); - auto sb = b->get_data_str(); - int la = sa.length(); - int lb = sb.length(); - - // needleman_wunsch nw(sa, sb, 2, -3, 5, 2); - // return nw.identity(nw.align()); - GlobAlignE galign(sa.c_str(), 0, la-1, - sb.c_str(), 0, 
lb-1, - 1, -1, 2, 1); - - return galign.getIdentity(); - -} - - -template -std::tuple*,double,size_t,size_t> Trainer::get_close(Point *p, bvec_iterator istart, bvec_iterator iend, bool &is_min_r) const -{ - int ncols = weights.getNumRow(); -#pragma omp declare reduction(pmax:std::tuple*,double,size_t,size_t>: \ - omp_out = get<1>(omp_in) > get<1>(omp_out) ? omp_in : omp_out ) \ - initializer (omp_priv=std::make_tuple((Point*)NULL,-1,0,0)) - - std::tuple*, - double, - size_t, - size_t> result = std::tuple*, double, size_t, size_t>(NULL, - -1, - 0, - 0); - bool has_found = false; - - #ifdef DEBUG - cout << "begin " << istart.r << " " << istart.c << " end " << iend.r << " " << iend.c << endl; - for (auto data : *istart.col) { - cout << "\t" << data.size() << endl; - } - #endif -// #pragma omp parallel for reduction(pmin:result), reduction(||:has_found) -// for (bvec_iterator i = istart; i <= iend; i++) { -// if (i <= iend) { -// Point* pt = (*i).first; -// double sum = weights.get(0, 0); -// double dist = 0; -// for (int col = 1; col < ncols; col++) { -// if (col == 1) { -// dist = ff.at(col-1)(pt, p); -// sum += weights.get(col, 0) * dist; -// } else { -// sum += weights.get(col, 0) * ff.at(col-1)(pt, p); -// } -// } -// double res = round(1.0 / (1 + exp(-sum))); - -// // set second to true if result is not 1.0 -// // which means it will be removed -// result = std::make_pair(pt, dist); -// has_found = (res != 1.0); -// (*i).second = (res != 1.0); -// } -// } - bool is_min = true; -#pragma omp parallel for reduction(pmax:result), reduction(&&:is_min) - for (bvec_iterator i = istart; i <= iend; ++i) { - Point* pt = (*i).first; - double sum = weights.get(0, 0); - double dist = 0; - auto cache = feat->compute(*pt, *p); - for (int col = 1; col < ncols; col++) { - if (col == 1) { - dist = (*feat)(col-1, cache); - sum += weights.get(col, 0) * dist; - } else { - sum += weights.get(col, 0) * (*feat)(col-1, cache); - } - } - double res = round(1.0 / (1 + exp(-sum))); - //cout 
<< "res: " << res << " " << dist << endl; -// set second to true if result is not 1.0 - // which means it will be removed - result = (dist > std::get<1>(result)) ? std::make_tuple(pt, dist, i.r, i.c) : result; - is_min = is_min && (res != 1.0); -// has_found = has_found || (res != 1.0); - if (res == 1.0) { - *i = std::make_pair(pt, true); -// (*i).second = true; - } - } - -// is_min = !has_found; - is_min_r = is_min; -// return get<0>(result); - return result; - -} - -template -long Trainer::merge(vector > ¢ers, long current, long begin, long last) const -{ -#pragma omp declare reduction(ldpmax:std::pair: \ - omp_out = omp_in.second > omp_out.second ? omp_in : omp_out ) \ - initializer (omp_priv=std::make_pair(0, std::numeric_limits::min())) - std::pair best = std::make_pair(0, std::numeric_limits::min()); - Point* p = centers[current].getCenter(); -#pragma omp parallel for reduction(ldpmax:best) - for (long i = begin; i <= last; i++) { - double sum = weights.get(0, 0); - double dist = 0; - Point* cen = centers[i].getCenter(); - auto cache = feat->compute(*cen, *p); - for (int col = 1; col < weights.getNumRow(); col++) { - double d = (*feat)(col-1, cache); - if (col == 1) { - dist = d; - } - sum += weights.get(col, 0) * d; - } - double res = round(1.0 / (1 + exp(-sum))); - - if (res == 1) { - best = best.second > dist ? 
best : std::make_pair(i, dist); - } - } - return best.first; -} - -template -vector*,Point*> > resize_vec(vector*,Point*>, double> > &vec, size_t new_size) -{ - cout << "Vector size: " << vec.size() << " min size: " << new_size << endl; - vector*, Point*> > data; - if (vec.size() <= new_size) { - for (int i = 0; i < vec.size(); i++) { - data.push_back(vec[i].first); - } - return data; - } - using k = pair*,Point*>, double>; - std::sort(vec.begin(), vec.end(), [](const k& a, const k& b) { - return a.second < b.second; - }); - double interval = (double)vec.size() / (vec.size() - new_size); - std::set indices; - int i = 0; - for (double index = 0; round(index) < vec.size() && i < (vec.size() - new_size); - i++, index += interval) { - int j = round(index); - indices.insert(j); - } - - std::cout << "index size: " << indices.size() << std::endl; - - // for (double index = 0; round(index) < vec.size() && indices.size() < new_size; - // index += interval) { - // int j = round(index); - // indices.insert(vec[j]); - // } - // vec.erase(vec.begin(), std::remove_if(vec.begin(), vec.end(), [&](const k& a) { - // return indices.find(a) == indices.end(); - // })); - for (auto iter = indices.rbegin(); iter != indices.rend(); iter++) { - int idx = *iter; - vec.erase(vec.begin() + idx); - } - if (vec.size() != new_size) { - cerr << "sizes are not the same: " << vec.size() << " " << new_size << endl; - throw "Resize did not work"; - } - for (auto a : vec) { - data.push_back(a.first); - } - return data; -} - -struct rng { - rng() { - srand(0); - } - int operator()(int n) const { - return rand() % n; - } -}; -template - pair*, - Point* - > >, - vector*, - Point*> > > Trainer::get_labels(vector*,Point*> > &vec, double cutoff) const -{ - - auto cmp = [](const pair*,Point*> a, const pair*,Point*> b) { - return a.first->get_header().compare(b.first->get_header()) < 0 - || - (a.first->get_header() == b.first->get_header() && a.second->get_header().compare(b.second->get_header()) < 0); - }; 
- auto scmp = [](const pair*,Point*>,double> a, const pair*,Point*>, double> b) { - return a.first.first->get_header().compare(b.first.first->get_header()) < 0 - || - (a.first.first->get_header() == b.first.first->get_header() && a.first.second->get_header().compare(b.first.second->get_header()) < 0); - }; - - // todo: convert to std::map - std::set*,Point*>, double>, decltype(scmp)> buf_pos(scmp), buf_neg(scmp); - std::vector*,Point*>, double> > buf_vpos, buf_vneg; -// std::sort(vec.begin(), vec.end(), cmp); - // cout << "Before Pair: " << vec[0].first->get_header() << ", " << vec[0].second->get_header() << endl; - // cout << "Before Pair: " << vec[vec.size()-1].first->get_header() << ", " << vec[vec.size()-1].second->get_header() << endl; - - rng gen; - random_shuffle(vec.begin(), vec.end(), gen); - // cout << "Pair: " << vec[0].first->get_header() << ", " << vec[0].second->get_header() << endl; - // cout << "Pair: " << vec[vec.size()-1].first->get_header() << ", " << vec[vec.size()-1].second->get_header() << endl; - vector scores(vec.size()); - Progress p(vec.size(), "Alignment"); -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < vec.size(); i++) { - double algn = align(vec[i].first, vec[i].second); - bool is_pos = algn >= cutoff; -#pragma omp critical - { - scores[i] = algn; - p++; - if (is_pos) { - buf_pos.insert(make_pair(vec[i], algn)); - //cout << vec[i].first->get_header() << " " << vec[i].second->get_header() << " " << algn << endl; - } else { - buf_neg.insert(make_pair(vec[i], algn)); - } - -#ifdef DEBUG - cout << vec[i].first->get_header() << " WITH " << vec[i].second->get_header() << " " << algn << endl; - #endif - - } - } - p.end(); - std::sort(scores.begin(), scores.end()); - std::cout << "positive=" << buf_pos.size() << " negative=" << buf_neg.size() << endl; - if (buf_pos.empty() || buf_neg.empty()) { - std::cout << "Identity value does not match sampled data: "; - if (buf_pos.empty()) { - std::cout << "Too many sequences below 
identity"; - } else { - std::cout << "Too many sequences above identity"; - } - std::cout << std::endl; - exit(0); - } - size_t m_size = std::min(buf_pos.size(), buf_neg.size()); - - std::cout << "resizing positive" << std::endl; - for (auto p : buf_pos) { - buf_vpos.push_back(p); - } - for (auto p : buf_neg) { - buf_vneg.push_back(p); - } - auto bp = resize_vec(buf_vpos, m_size); - std::cout << "resizing negative" << std::endl; - auto bn = resize_vec(buf_vneg, m_size); - auto ret = make_pair(bp, bn); - std::cout << "positive=" << ret.first.size() << " negative=" << ret.second.size() << endl; - return ret; - -} -template -void Trainer::filter(Point *p, vector *, bool> > &vec) const -{ - for (auto& pt : vec) { - double sum = weights.get(0, 0); - auto cache = feat->compute(*pt.first, *p); - for (int col = 1; col < weights.getNumRow(); col++) { - sum += weights.get(col, 0) * (*feat)(col-1, cache); - } - double res = round(1.0 / (1 + exp(-sum))); - pt.second = (res != 1); - } - vec.erase(std::remove_if(vec.begin(), vec.end(), [](pair*, bool> p) { - return p.second; - }), vec.end()); -} - -template -Point* Trainer::closest(Point *p, vector *, bool> > &vec) const -{ - Point* best_pt = NULL; - double best_dist = 0; - for (auto& pt : vec) { - double sum = weights.get(0, 0); - double dist = pt.first->distance_d(*p); - if (best_pt == NULL || dist < best_dist) { - best_dist = dist; - best_pt = pt.first; - } - } - return best_pt; -} - -template -std::pair Trainer::generate_feat_mat(pair *, Point *> >, vector *, Point *> > > &data, int ncols) -{ - int nrows = data.first.size() + data.second.size(); - matrix::Matrix feat_mat(nrows, ncols); - matrix::Matrix labels(nrows, 1); -#pragma omp parallel for - for (int i = 0; i < data.first.size(); i++) { - auto kv = data.first[i]; - int row = i; - auto cache = feat->compute(*kv.first, *kv.second); - for (int col = 0; col < ncols; col++) { - - if (col == 0) { - feat_mat.set(row, col, 1); - } else { -// double val = ff[col-1](kv.first, 
kv.second); - ////#pragma omp critical - double val = (*feat)(col-1, cache); - feat_mat.set(row, col, val); - } - - } - ////#pragma omp critical - labels.set(row, 0, 1); - } -#pragma omp parallel for - for (int i = 0; i < data.second.size(); i++) { - auto kv = data.second[i]; - int row = data.first.size() + i; - auto cache = feat->compute(*kv.first, *kv.second); - for (int col = 0; col < ncols; col++) { - - if (col == 0) { - feat_mat.set(row, col, 1); - } else { -// double val = ff[col-1](kv.first, kv.second); - ////#pragma omp critical - double val = (*feat)(col-1, cache); - feat_mat.set(row, col, val); - } - - } - ////#pragma omp critical - labels.set(row, 0, -1); - } - return std::make_pair(feat_mat, labels); -} -template -double Trainer::train_n(pair *, Point *> >, vector *, Point *> > > &data, int ncols) -{ - std::cout << "done" << endl; - cout << "Training on " << ncols << " columns" << endl; - int nrows = data.first.size() + data.second.size(); - - matrix::Matrix feat_mat(nrows, ncols); - matrix::Matrix labels(nrows, 1); - double avg_label = 0; -#pragma omp parallel for - for (int i = 0; i < data.first.size(); i++) { - auto kv = data.first[i]; - int row = i; - auto cache = feat->compute(*kv.first, *kv.second); - for (int col = 0; col < ncols; col++) { - - if (col == 0) { - feat_mat.set(row, col, 1); - } else { -// double val = ff[col-1](kv.first, kv.second); - ////#pragma omp critical - double val = (*feat)(col-1, cache); - feat_mat.set(row, col, val); - } - - } - ////#pragma omp critical - labels.set(row, 0, 1); - } -#pragma omp parallel for - for (int i = 0; i < data.second.size(); i++) { - auto kv = data.second[i]; - int row = data.first.size() + i; - auto cache = feat->compute(*kv.first, *kv.second); - for (int col = 0; col < ncols; col++) { - - if (col == 0) { - feat_mat.set(row, col, 1); - } else { -// double val = ff[col-1](kv.first, kv.second); - ////#pragma omp critical - double val = (*feat)(col-1, cache); - feat_mat.set(row, col, val); - } - - } - 
////#pragma omp critical - labels.set(row, 0, -1); - } - for (int row = 0; row < nrows; row++) { - for (int col = 0; col < ncols; col++) { - double val = feat_mat.get(row, col); - std::cout << val << "\t"; - } - std::cout << endl; - } - glm.train(feat_mat, labels); - weights = glm.get_weights(); - #ifdef DEBUG - for (int i = 0; i < ncols; i++) { - cout << "weight: " << weights.get(i, 0) << endl; - - } - #endif - matrix::Matrix p = glm.predict(feat_mat); - for (int row = 0; row < nrows; row++) { - if (p.get(row, 0) == 0) { - p.set(row, 0, -1); - } - } - auto tup = glm.accuracy(labels, p); - return get<0>(tup); -} - -double random_between(double mute, double rng, double low, double high) -{ - double r_d = (double)rand() / RAND_MAX; - double mn = std::max(mute - rng, low); - double mx = std::min(mute + rng, high); - return r_d * (mx - mn) + mn; -} - -template -void Trainer::mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id) -{ - HandleSeq h(HandleSeq::BOTH); - ClusterFactory factory(k); - double inc = (id_end - id_begin) / num_seq; - std::string bin_seq = p->get_data_str(); - std::string seq; - for (auto c : bin_seq) { - switch (c) { - case 0: - seq += 'A'; - break; - case 1: - seq += 'C'; - break; - case 2: - seq += 'G'; - break; - case 3: - seq += 'T'; - break; - case 'N': - seq += 'C'; - break; - default: - cout << "Invalid character " << c << endl; - cout << "from sequence " << bin_seq << endl; - throw 3; - } - } - for (size_t i = 0; i < num_seq; i++) { - double iter_id = id_begin + inc * (i + 0.5); - double actual_id = random_between(iter_id, inc, id_begin, id_end); - int mut = round(100 - actual_id); - auto newseq = h.mutate(seq, mut); - std::string chrom; - std::string header = p->get_header(); - Point* new_pt = Loader::get_point(header, newseq.second, _id, k); - pra pr; - pr.first = p->clone(); - pr.second = new_pt; - pr.val = newseq.first; - if (pr.val > cutoff) { - 
pos_buf.push_back(pr); - } else { - neg_buf.push_back(pr); - } - } -} - -template -std::pair*,Point*> >, - vector*,Point*> > >, - std::pair*,Point*> >, - vector*,Point*> > > > -Trainer::new_get_labels(std::vector*> &points, size_t num_sample, double id, uintmax_t &_id) -{ - std::sort(points.begin(), points.end(), [](const Point* a, - const Point* b) -> bool { - return a->get_length() < b->get_length(); - }); - std::pair*,Point*> >, - vector*,Point*> > > training, testing; - num_sample = min(num_sample, points.size()); - vector*> f_points_tr, f_points_test; - size_t total_size = points.size(); - for (int i = 0; i < num_sample; i++) { - int i1 = floor((double)i * total_size / (2 * num_sample)); - int i2 = floor((i + 1) * (double)total_size / (2 * num_sample)); - f_points_tr.push_back(points.at(i1)); - f_points_test.push_back(points.at(i2)); - } - std::vector > pos_buf, neg_buf; - cout << "mutating sequences" << endl; - for (auto p : f_points_tr) { - mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); - mutate_seqs(p, 5, pos_buf, neg_buf, 40, 100 * id, _id); - } - size_t buf_size = std::min(pos_buf.size(), neg_buf.size()); - cout << "training +: " << pos_buf.size() << endl; - cout << "training -: " << neg_buf.size() << endl; - std::vector > > bins; - size_t num_bins; - for (int i = 0; i < 10; i++) { - double max_identity = id * 100 + (100 - 100.0 * id) * (i+1) / 10.0; - double min_identity = id * 100 + (100 - 100.0 * id) * i / 10.0; - cout << "I = " << i << " " << min_identity << " -> " << max_identity << endl; - bins.push_back(std::vector >()); - for (auto p : pos_buf) { - if (p.val > min_identity && p.val < max_identity) { - bins[i].push_back(p); - } - } - for (auto p : neg_buf) { - if (p.val > min_identity && p.val < max_identity) { - bins[i].push_back(p); - } - } - } - std::random_shuffle(pos_buf.begin(), pos_buf.end()); - std::random_shuffle(neg_buf.begin(), neg_buf.end()); - for (size_t i = 0; i < buf_size; i++) { - cout << "TR: P " << pos_buf[i].val << 
endl; - cout << "TR: N " << neg_buf[i].val << endl; - if (pos_buf[i].val > id) { - training.first.emplace_back(pos_buf[i].first, pos_buf[i].second); - } else { - training.second.emplace_back(pos_buf[i].first, pos_buf[i].second); - } - if (neg_buf[i].val > id) { - training.first.emplace_back(neg_buf[i].first, neg_buf[i].second); - } else { - training.second.emplace_back(neg_buf[i].first, neg_buf[i].second); - } - } - pos_buf.clear(); - neg_buf.clear(); - for (auto p : f_points_test) { - mutate_seqs(p, 5, pos_buf, neg_buf, 100 * id, 100, _id); - mutate_seqs(p, 5, pos_buf, neg_buf, 40, 100 * id, _id); - } - buf_size = std::min(pos_buf.size(), neg_buf.size()); - cout << "testing +: " << pos_buf.size() << endl; - cout << "testing -: " << neg_buf.size() << endl; - std::random_shuffle(pos_buf.begin(), pos_buf.end()); - std::random_shuffle(neg_buf.begin(), neg_buf.end()); - for (size_t i = 0; i < buf_size; i++) { - cout << "TE: P " << pos_buf[i].val << endl; - cout << "TE: N " << neg_buf[i].val << endl; - if (pos_buf[i].val > id) { - testing.first.emplace_back(pos_buf[i].first, pos_buf[i].second); - } else { - testing.second.emplace_back(pos_buf[i].first, pos_buf[i].second); - } - if (neg_buf[i].val > id) { - testing.first.emplace_back(neg_buf[i].first, neg_buf[i].second); - } else { - testing.second.emplace_back(neg_buf[i].first, neg_buf[i].second); - } - } - return make_pair(training, testing); -} -template -void Trainer::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, double acc_cutoff) -{ - - if (k != 0) { - std::cout << "Splitting data" << endl; - uintmax_t _id = points.size(); - Predictor pred(k, cutoff, PRED_MODE_CLASS, feat_type, - mut_type, min_n_feat, max_n_feat, min_id); - pred.train(points, points, _id, n_points); - delete feat; - auto pr = pred.get_class(); - feat = pr.first; - glm = pr.second; - weights = glm.get_weights(); - return; - } else { - feat->add_feature(FEAT_ALIGN, Combo::xy); -// 
feat->normalize(training.first); - feat->finalize(); - weights = matrix::Matrix(2, 1); - weights.set(0, 0, -1 * cutoff); - weights.set(1, 0, 1); - return; - } -} - -template -vector*, Point*> > Trainer::split() -{ - // n_points total per side - // max_pts_from_one on each side - auto cmp = [](const pair*,Point*> a, const pair*,Point*> b) { - return a.first->get_header().compare(b.first->get_header()) < 0 -|| - (a.first->get_header() == b.first->get_header() && a.second->get_header().compare(b.second->get_header()) < 0); - }; - set*, Point*>, decltype(cmp)> pairs(cmp); -// vector*, Point*> > pairs; - const size_t total_num_pairs = n_points * 2; - int aerr = 0; - int bandwidth = (1.0 - cutoff) * 10000; - vector*> indices; - std::sort(points.begin(), points.end(), [](const Point* a, - const Point* b) -> bool { - return a->get_length() < b->get_length(); - }); - Point *begin_pt = points[points.size()/2]; - - std::sort(points.begin(), points.end(), [&](const Point* a, - const Point* b) -> bool { - return a->distance(*begin_pt) < b->distance(*begin_pt); - }); - int num_iterations = ceil(((double)n_points) / max_pts_from_one) - 1; - for (int i = 0; i <= num_iterations; i++) { - int idx = i * (points.size()-1) / num_iterations; - indices.push_back(points[idx]); - } - cout << "Point pairs: " << indices.size() << endl; - size_t to_add_each = max_pts_from_one / 2; - Progress prog(indices.size(), "Sorting data"); -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < indices.size(); i++) { - vector*> pts = points; - Point* p = indices[i]; - std::sort(pts.begin(), pts.end(), [&](const Point* a, - const Point* b) { - return a->distance(*p) < b->distance(*p); - }); - // do binary search with alignment - size_t offset = pts.size() / 4; - size_t pivot = offset; - double closest_algn = 20000; - size_t best_pivot = 2 * offset; - for (pivot = 2 * offset; offset > 0; offset /= 2) { - double algn = align(p, pts[pivot]); - // cout << "Pivot: " << pivot << " point: " << 
pts[pivot]->get_header() << " sim: " << align(p, pts[pivot]) << endl; - if (fabs(algn - cutoff) < closest_algn) { - closest_algn = fabs(algn - cutoff); - best_pivot = pivot; - } - if (algn < cutoff) { - pivot -= offset; - } else if (algn > cutoff) { - pivot += offset; - } else { - break; - } - } -// cout << "Pivot: " << pivot << " point: " << pts[pivot]->get_header() << " sim: " << align(p, pts[pivot]) << endl; - // before: [0, pivot) size: to_add_each - // after: [pivot, size) size: to_add_each - double before_inc = (double)pivot / to_add_each; - double after_inc = ((double)(pts.size() - pivot)) / to_add_each; -#pragma omp critical - { - prog++; - if (before_inc < 1) { - aerr = 1; - } else if (after_inc < 1) { - aerr = -1; - } - } - double before_start = 0; - double after_start = pivot; - double top_start = 0; - size_t size_before = pairs.size(); - vector*,Point*> > buf; - // Adds points above cutoff by adding before_inc - for (int i = 0; i < to_add_each; i++) { - int idx = round(before_start); - int dist = pts[idx]->distance(*p); - // cout << p->get_header() << " " << pts[idx]->get_header() << " " << dist << endl; - auto pr = p->get_header().compare(pts[idx]->get_header()) < 0 ? make_pair(p, pts[idx]) : make_pair(pts[idx], p); - buf.push_back(pr); - before_start += before_inc; - } - // Adds points before cutoff by adding after_inc - for (int i = 0; i < to_add_each && round(after_start) < pts.size(); i++) { - int idx = round(after_start); - int dist = pts[idx]->distance(*p); - // cout << p->get_header() << " " << pts[idx]->get_header() << " " << dist << endl; - auto pr = p->get_header().compare(pts[idx]->get_header()) < 0 ? 
make_pair(p, pts[idx]) : make_pair(pts[idx], p); - buf.push_back(pr); - after_start += after_inc; - } -#pragma omp critical - { - // Adds buffer to total pairs - // for (auto p : buf) { -// pairs.push_back(p); -// } - pairs.insert(std::begin(buf), std::end(buf)); - } -// cout << "added " << pairs.size() - size_before << " pairs" << endl; - } - prog.end(); - if (aerr < 0) { - cerr << "Warning: Alignment may be too small for sampling" << endl; - } else if (aerr > 0) { - cerr << "Warning: Alignment may be too large for sampling" << endl; - } - int i = 0; - for (auto a : pairs) { - cout << "Before Pair: " << a.first->get_header() << ", " << a.second->get_header() << endl; - if (++i == 4) { - break; - } - } - return std::vector*,Point*> >(pairs.begin(), pairs.end()); -} -template -std::pair*, Point*>, double>, - std::map*, Point*>, double> > -Trainer::split_old() { - using train_map = std::map*, Point*>, double>; - std::pair split; - int bandwidth = (1.0 - cutoff) * 10000; - size_t last_cutoff = points.size() / 2; - while (split.first.size() < n_points) { - Point *p = points[last_cutoff]; - std::sort(points.begin(), points.end(), [&](const Point* a, - const Point* b) -> bool { - return a->distance(*p) < b->distance(*p); - }); - int b_cutoff = points.size() / 2; - for (int offset = b_cutoff; offset >= 1; offset /= 2) { - int dist = p->distance(*points[b_cutoff]); - if (dist < bandwidth) { - b_cutoff += offset; - } else if (dist > bandwidth) { - b_cutoff -= offset; - } else { - break; - } - } - size_t cutoff_index = points.size(); - const size_t count = split.first.size(); - - if (b_cutoff >= max_pts_from_one) { - double ratio = (double)b_cutoff / max_pts_from_one; - double sum = 0; - for (size_t q = 0; q < max_pts_from_one; q++) { - size_t i = round(sum); - if (i >= points.size()) { - cerr << "this shouldn't happen" << endl; - throw "this shouldn't happen"; - } - double alignment = align(p, points[i]); - if (alignment < cutoff) { - cutoff_index = i + 10; - break; - } - 
if (split.first.size() < n_points) { - split.first[make_pair(p, points[i])] = alignment; - } - sum += ratio; - } - } else { - for (size_t i = 1; i < cutoff_index; i++) { - double alignment = align(p, points[i]); - if (alignment < cutoff) { - cutoff_index = i + 10; - break; - } - if (split.first.size() < n_points) { - split.first[make_pair(p, points[i])] = alignment; - } - } - } - size_t similar_points_added = split.first.size() - count; - size_t available_points = points.size() - cutoff_index; - if (available_points == 0 || available_points <= similar_points_added) { - cerr << "change cutoff value, points are too similar" << endl; - throw "change cutoff value, points are too similar"; - } - double ratio = (double)(available_points - 1.0) / (double)similar_points_added; - double sum = 0; - for (size_t q = 0; q < similar_points_added; q++) { - size_t i = cutoff_index + round(sum); - if (i >= points.size()) { - break; - } - double alignment = align(p, points[i]); - split.second[make_pair(p, points[i])] = alignment; - sum += ratio; - } - if (split.first.size() != split.second.size()) { - cerr << "something happened"; - throw "something happened"; - } - last_cutoff = cutoff_index; - } - for (auto p : points) { - p->set_data_str(""); - } - return split; -} - - -int gcd(int a, int b) -{ - if (b <= 0) { - return a; - } - return gcd(b, a % b); -} -int gcd_vec(std::vector v) -{ - int ret = v[0]; - for (size_t i = 1; i < v.size(); i++) { - if (v[i] == 0) { - continue; - } - ret = gcd(ret, v[i]); - } - return ret; -} - -inline int sign(double x) { - return (x > 0) - (x < 0); -} -void scale(double (&mat)[4][4], double &sigma, double& epsilon) -{ - double scale_factor = 100000; - std::vector signs, scaled; - signs.push_back(sign(sigma)); - scaled.push_back(round(scale_factor * fabs(sigma))); - signs.push_back(sign(epsilon)); - scaled.push_back(round(scale_factor * fabs(epsilon))); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - signs.push_back(sign(mat[i][j])); 
- scaled.push_back(round(scale_factor * fabs(mat[i][j]))); - } - } - double common_div = gcd_vec(scaled); - sigma = signs[0] * scaled[0] / common_div; - epsilon = signs[1] * scaled[1] / common_div; - int count = 2; - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - mat[i][j] = signs[count] * scaled[count] / common_div; - count++; - } - } -} - -template -void Trainer::init(double (&matrix)[4][4], double sig, double eps) -{ - scale(matrix, sig, eps); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - mat[i][j] = (int)matrix[i][j]; - } - } - sigma = (int)sig; - eps = (int)eps; - // sf.emplace_back([](Point* a, Point *b) { - // return Feature::manhattan(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point *b) { - // return Feature::length_difference(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point *b) { - // return Feature::rree_k_r(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::length_difference(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::intersection(*a, *b); - // }, true); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::jenson_shannon(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::simratio(*a, *b); - // }, true); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::squaredchord(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::manhattan(*a, *b); - // }, false); - // sf.emplace_back([](Point* a, Point* b) { - // return Feature::pearson(*a, *b); - // }, true); - -} -template class Trainer; -template class Trainer; -template class Trainer; -template class Trainer; -template class Trainer; -template class Trainer; diff --git a/src/cluster/src/Trainer.h b/src/cluster/src/Trainer.h deleted file mode 100644 index 8801172..0000000 --- a/src/cluster/src/Trainer.h +++ /dev/null @@ -1,67 +0,0 @@ -/* -*- C++ -*- 
*/ -#ifndef TRAINER_H -#define TRAINER_H - -#include "Point.h" -#include "GLM.h" -#include "Feature.h" -#include "bvec.h" -#include "Center.h" -#include "LogTable.h" -#include - -template -class Trainer { -public: - Trainer(std::vector*> v, size_t num_points, size_t largest_count, double cutoff_, size_t max_pts_from_one_, double (&matrix)[4][4], double sig, double eps, int ksize) : points(v), n_points(num_points), cutoff(cutoff_), max_pts_from_one(max_pts_from_one_), k(ksize) { - init(matrix, sig, eps); - uintmax_t size = 1000 * 1000 * 10; - feat = new Feature(k); - }; - ~Trainer() { delete feat_mat; delete feat; } - std::pair*, Point*>, double>, - std::map*, Point*>, double> > split_old(); - vector*,Point*> > split(); - double train_n(pair*, - Point* - > >, - vector*, - Point*> > > &data, int ncols); - void train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, double acc_cutoff=97.5); - void mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id); - std::tuple*,double,size_t,size_t> get_close(Point*, bvec_iterator istart, bvec_iterator iend, bool& is_min) const; -// vector > get_close(Point*, const vector*,int> > &, bool& is_min) const; - std::pair*,Point*> >, - vector*,Point*> > >, - std::pair*,Point*> >, - vector*,Point*> > > > - new_get_labels(std::vector*> &points, size_t num_sample, double id, uintmax_t &_id); - void filter(Point*, vector*,bool> >&) const; - Point* closest(Point*, vector*,bool> >&) const; - long merge(vector > ¢ers, long current, long begin, long end) const; -// Point* merge(Point*, vector*,double> >&) const; -private: - matrix::GLM glm; - matrix::Matrix weights; - double align(Point* a, Point* b) const; - std::pair generate_feat_mat(pair*, - Point* - > >, - vector*, - Point*> > > &data, int ncols); - void init(double (&matrix)[4][4], double sig, double eps); - pair*, - Point* - > >, - vector*, - Point*> > > get_labels(vector*,Point*> >&, 
double cutoff) const; - Feature *feat; - int mat[4][4]; - int sigma, epsilon; - std::vector*> points; - matrix::Matrix *feat_mat = NULL; - size_t n_points, max_pts_from_one; - double cutoff; - int k; -}; -#endif diff --git a/src/cluster/src/needleman_wunsch.cpp b/src/cluster/src/needleman_wunsch.cpp deleted file mode 100644 index 46d0b5b..0000000 --- a/src/cluster/src/needleman_wunsch.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* -*- C++ -*- - * - * needleman_wunsch.cpp - * - * Author: Benjamin T James - */ -#include "needleman_wunsch.h" - - -//flags that can be combined -#define HORIZ 1 -#define VERT 2 -#define DIAG 4 -void needleman_wunsch::fill(int i, int j) -{ - if (i == 0 || j == 0) { - if (i == j) { - int offset = at(i, j); - score[offset] = 0; - direction[offset] = DIAG; // for backtracking - horiz_gap_len[offset] = 0; - vert_gap_len[offset] = 0; - } else if (i == 0) { - int offset = at(0, j); - int last_offset = at(0, j-1); - score[offset] = score[last_offset] + gap(j); - horiz_gap_len[offset] = 0; - vert_gap_len[offset] = j; - direction[offset] = VERT; - } else { // j == 0 - int offset = at(i, 0); - int last_offset = at(i-1, 0); - score[offset] = score[last_offset] + gap(i); - horiz_gap_len[offset] = i; - vert_gap_len[offset] = 0; - direction[offset] = HORIZ; - } - return; - } - int i_diag = at(i-1, j-1); - int i_horiz = at(i-1, j); - int i_vert = at(i, j-1); - int i_cur = at(i, j); - - int hlen = horiz_gap_len[i_horiz] + 1; - int vlen = vert_gap_len[i_vert] + 1; - - int diag_score = score[i_diag] + match_score(s1[i], s2[j]); - int horiz_score = score[i_horiz] + gap(hlen); - int vert_score = score[i_vert] + gap(vlen); - score[i_cur] = std::max(std::max(diag_score, horiz_score), vert_score); - direction[i_cur] = 0; - - // we could match multiple high scores - if (score[i_cur] == diag_score) { - direction[i_cur] |= DIAG; - } - if (score[i_cur] == vert_score) { - direction[i_cur] |= VERT; - vert_gap_len[i_cur] = vlen; - } else { - vert_gap_len[i_cur] = 0; - } - if 
(score[i_cur] == horiz_score) { - direction[i_cur] |= HORIZ; - horiz_gap_len[i_cur] = hlen; - } else { - horiz_gap_len[i_cur] = 0; - } -} - -std::pair -needleman_wunsch::backtrack() -{ - std::string a1 = "", a2 = ""; - int cur_i = l1 - 1; - int cur_j = l2 - 1; - while (cur_i >= 0 && cur_j >= 0) { - uint8_t dir = direction[at(cur_i, cur_j)]; - if (dir & DIAG) { - a1 += s1[cur_i--]; - a2 += s2[cur_j--]; - } else if (dir & HORIZ) { - a1 += s1[cur_i--]; - a2 += '-'; - } else if (dir & VERT) { - a1 += '-'; - a2 += s2[cur_j--]; - } - } - std::string r1(a1.rbegin(), a1.rend()); - std::string r2(a2.rbegin(), a2.rend()); - return std::make_pair(r1, r2); -} - - -std::pair -needleman_wunsch::align() -{ - for (int i = 0; i < l1; i++) { - for (int j = 0; j < l2; j++) { - fill(i, j); - } - } - return backtrack(); -} -double needleman_wunsch::identity(std::pair alignment) const -{ - int len = alignment.first.length(); - double count = 0; - for (int i = 0; i < len; i++) { - if (alignment.first[i] == alignment.second[i]) { - count++; - } - } - return 1.0 * count / len; -} - -int needleman_wunsch::gap(int gaplen) const -{ - return sigma + (gaplen - 1) * epsilon; -} - -int needleman_wunsch::match_score(char a, char b) const -{ - return a == b ? 
match : mismatch; -} - -needleman_wunsch::needleman_wunsch(const std::string &s1_, const std::string& s2_, int match_, int mismatch_, int sigma_, int epsilon_) -{ - int l1_ = s1_.length(); - int l2_ = s2_.length(); - if (l1_ >= l2_) { - l1 = l1_; - l2 = l2_; - s1 = s1_; - s2 = s2_; - } else { - l1 = l2_; - l2 = l1_; - s1 = s2_; - s2 = s1_; - } - sigma = -sigma_; - epsilon = -epsilon_; - match = match_; - mismatch = mismatch_; - int matlen = l1 * l2; - score = new int[matlen]; - direction = new uint8_t[matlen]; - horiz_gap_len = new int[matlen]; - vert_gap_len = new int[matlen]; -} diff --git a/src/cluster/src/needleman_wunsch.h b/src/cluster/src/needleman_wunsch.h deleted file mode 100644 index 031ea10..0000000 --- a/src/cluster/src/needleman_wunsch.h +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- C++ -*- - * - * needleman_wunsch.h - * - * Author: Benjamin T James - */ - -#ifndef NEEDLEMAN_WUNSCH_H -#define NEEDLEMAN_WUNSCH_H - -#include - -class needleman_wunsch { -public: - needleman_wunsch(const std::string& s1, const std::string& s2, int match_, int mismatch_, int sigma_, int epsilon_); - ~needleman_wunsch() { - delete[] score; - delete[] direction; - delete[] horiz_gap_len; - delete[] vert_gap_len; - } - double identity(std::pair p) const; - std::pair - align(); -private: - int gap(int gap_len) const; - int match_score(char a, char b) const; - inline int at(int a, int b) const { return a * l2 + b; }; - void fill(int,int); - std::pair backtrack(); - int match, mismatch; - int sigma, epsilon; - std::string s1, s2; - int l1, l2; - - int *score; - uint8_t *direction; - int *horiz_gap_len; - int *vert_gap_len; -}; - - -#endif diff --git a/src/clutil/Clock.cpp b/src/clutil/Clock.cpp new file mode 100644 index 0000000..07da83b --- /dev/null +++ b/src/clutil/Clock.cpp @@ -0,0 +1,19 @@ +/* -*- C++ -*- */ +/* + * Clock.cpp + * + * Author: Benjamin T James + */ + +#include "Clock.h" +#include +#include + +static const auto _begin = std::chrono::system_clock::now(); + +void 
Clock::stamp(std::string desc) +{ + auto end = std::chrono::system_clock::now(); + std::chrono::duration diff = end - _begin; + std::cout << "timestamp " << desc << " " << diff.count() << std::endl; +} diff --git a/src/clutil/Clock.h b/src/clutil/Clock.h new file mode 100644 index 0000000..b251d51 --- /dev/null +++ b/src/clutil/Clock.h @@ -0,0 +1,16 @@ +// -*- C++ -*- +/* + * Clock.h + * + * Author: Benjamin T James + */ + +#ifndef CLOCK_H +#define CLOCK_H +#include + +class Clock { +public: + static void stamp(std::string desc); +}; +#endif diff --git a/src/clutil/Datatype.cpp b/src/clutil/Datatype.cpp new file mode 100644 index 0000000..46fc67b --- /dev/null +++ b/src/clutil/Datatype.cpp @@ -0,0 +1,19 @@ +/* -*- C++ -*- */ +/* + * Datatype.cpp + * + * Author: Benjamin T James + */ + +#include "Datatype.h" +std::string _dt_datatype = ""; + +std::string Datatype::get() +{ + return _dt_datatype; +} + +void Datatype::set(std::string s) +{ + _dt_datatype = s; +} diff --git a/src/clutil/Datatype.h b/src/clutil/Datatype.h new file mode 100644 index 0000000..dd4df42 --- /dev/null +++ b/src/clutil/Datatype.h @@ -0,0 +1,17 @@ +// -*- C++ -*_ +/* + * Datatype.h + * + * Author: Benjamin T James + */ + +#ifndef DATATYPE_H +#define DATATYPE_H +#include + +class Datatype { +public: + static std::string get(); + static void set(std::string s); +}; +#endif diff --git a/src/cluster/src/DivergencePoint.cpp b/src/clutil/DivergencePoint.cpp similarity index 98% rename from src/cluster/src/DivergencePoint.cpp rename to src/clutil/DivergencePoint.cpp index 70e4e2d..d62996a 100644 --- a/src/cluster/src/DivergencePoint.cpp +++ b/src/clutil/DivergencePoint.cpp @@ -58,7 +58,7 @@ double DivergencePoint::distance_d(Point& p) const uint64_t dist = 0; uint64_t mag = 0; for (auto i = 0; i < points.size(); i++) { - dist += 2 * min(points[i],(T)c.points[i]); + dist += 2 * min(points[i],(T)round(c.points[i])); mag += points[i] + c.points[i]; } double frac = (double)dist / mag; @@ -99,8 +99,8 @@ 
template DivergencePoint::DivergencePoint(const std::vector& pts, uint64_t len) { mag = 0; - points = pts; for (unsigned int i = 0; i < pts.size(); i++) { + points.push_back(pts.at(i)); mag += pts.at(i); } // display(); diff --git a/src/cluster/src/DivergencePoint.h b/src/clutil/DivergencePoint.h similarity index 98% rename from src/cluster/src/DivergencePoint.h rename to src/clutil/DivergencePoint.h index 087bff1..68d0539 100644 --- a/src/cluster/src/DivergencePoint.h +++ b/src/clutil/DivergencePoint.h @@ -38,6 +38,7 @@ class DivergencePoint : public Point { d->set_id(get_id()); d->set_length(get_length()); d->set_stddev(get_stddev()); + d->set_data_str(Point::get_data_str()); return d; } DivergencePoint* create() const { diff --git a/src/cluster/src/Histogram.cpp b/src/clutil/Histogram.cpp similarity index 100% rename from src/cluster/src/Histogram.cpp rename to src/clutil/Histogram.cpp diff --git a/src/cluster/src/Histogram.h b/src/clutil/Histogram.h similarity index 100% rename from src/cluster/src/Histogram.h rename to src/clutil/Histogram.h diff --git a/src/clutil/LCG.h b/src/clutil/LCG.h new file mode 100644 index 0000000..8725771 --- /dev/null +++ b/src/clutil/LCG.h @@ -0,0 +1,51 @@ +// -*- C++ -*- +/* + * LCG.h + * + * Author: Benjamin T James + */ + +#ifndef LCG_H +#define LCG_H + +#include +#include +#include +#include + +class LCG { +public: + LCG(uint64_t seed_) { + seed = seed_; + } + + template + T randMod(T max) { + if (max == 0) { + return 0; + } else { + uint64_t x = random() % max; + return (T)x; + } + } + + uint64_t nextRandSeed() { + return random(); + } + double rand_between(double id, double range, double low, double high) { + uint64_t rnd = random(); + double res = (double)rnd / std::numeric_limits::max(); + double mn = std::max(id - range, low); + double mx = std::min(id + range, high); + return mn + (mx - mn) * res; + } + uint64_t random() { + // MMIX random, from 
https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use + // Should be thread safe + seed = seed * 6364136223846793005 + 1442695040888963407; + return seed; + } +private: + uint64_t seed; +}; +#endif diff --git a/src/clutil/Loader.cpp b/src/clutil/Loader.cpp new file mode 100644 index 0000000..c15e3b3 --- /dev/null +++ b/src/clutil/Loader.cpp @@ -0,0 +1,223 @@ +/* -*- C++ -*- + * + * Loader.cpp + * + * Author: Benjamin T James + * + * Class which can 'preload' chunks of sequences from a file list, + * and then count the k-mers separately, which can be done in + * multiple threads + */ +#include "Loader.h" +#include "Datatype.h" + +static uint64_t num_overflow = 0; +std::string next_histogram(std::string cur_type) +{ + if (cur_type == "uint8_t") { + return "uint16_t"; + } else if (cur_type == "uint16_t") { + return "uint32_t"; + } else { + return "uint64_t"; + } +} + +template +std::string Loader::get_warning() +{ + if (num_overflow == 0) { + return ""; + } else { + std::ostringstream oss; + oss << "For " << num_overflow << " sequences, the histogram type " << Datatype::get() << " was too small for holding sequences." 
<< endl; + oss << "Performance may be slightly hindered, but can be improved by increasing the integral type (--datatype " << next_histogram(Datatype::get()) << ")" << endl; + return oss.str(); + } +} + + + +template +void Loader::fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values) +{ + const int k = table.getK(); + auto segment = chrom->getSegment(); + const char *seg_bases = chrom->getBase()->c_str(); + for (vector *v : *segment) { + int start = v->at(0); + int end = v->at(1); + + // Hani Z Girgis added the following line + // It is possible + if(end - start + 1 >= k){ + int r = table.wholesaleIncrementNoOverflow(seg_bases, start, end - k + 1); + if (r == -1) { + num_overflow++; + // #pragma omp critical + // { + // std::ostringstream oss; + // oss << "In header \"" << chrom->getHeader() << "\"" << endl; + // oss << "Histogram type " << Runner::get_datatype() << " is too small for holding sequences." << endl; + // oss << "Performance may be slightly hindered, but can be improved by increasing the integral type (--datatype " << next_histogram(Runner::get_datatype()) << ")" << endl; + // _loader_warning = oss.str(); + // cerr << get_warning() << endl; + // } + } + } + } + std::string header = chrom->getHeader(); + header = header.substr(1, header.find(' ')-1); + // Hani Z. Girgis added the following lines on 10/3/2018 + // This should result in significant speed up. + unsigned long tableSize = table.getMaxTableSize(); + values.reserve(values.size() + tableSize); + const V * valueArray = table.getValues(); + + copy(&valueArray[0], &valueArray[tableSize], back_inserter(values)); + + // Commented out by Hani Z. 
Girgis on 10/3/2018 and replaced by the code above + // std::vector *keys = table.getKeys(); + // for (std::string str : *keys) { + // values.push_back(table.valueOf(str.c_str())); + // } + // keys->clear(); + // delete keys; +} + +template +bool Loader::done() const +{ + return file_idx == files.size(); +} + +template +void Loader::preload(int tid) +{ + if (file_idx == files.size()) { + return; + } + for (uint64_t j = 0; j < chunk_size; j++) { + auto chrom = next(); + if (chrom.first == "") { + return; + } + cache_list.at(tid).emplace_back(chrom.first, chrom.second); + } +} + + +// Modified by Hani Z. Girgis on Oct 2, 2018 +template +Point* Loader::get_point(std::string header, const std::string &base, uintmax_t& id, int k, bool set_seq) +{ + ostringstream obase; + for (int i = 0; i < base.length(); i++) { + if (base[i] == 'A' || base[i] == 'C' || + base[i] == 'G' || base[i] == 'T') { + obase << base[i]; + } + } + ChromosomeOneDigit * chrom; + if(Util::isDna){ + chrom = new ChromosomeOneDigitDna(); + }else{ + chrom = new ChromosomeOneDigitProtein(); + } + + chrom->setHeader(header); + chrom->appendToSequence(obase.str()); + chrom->finalize(); + Point *p = Loader::get_point(chrom, id, k, set_seq); + delete chrom; + return p; +} + +// Modified by Hani Z. Girgis on Oct 2, 2018 +template +Point* Loader::get_point(ChromosomeOneDigit* chrom, uintmax_t& id, int k, bool set_seq) +{ + + KmerHashTable table(k, 1); + // Hani Z. 
Girgis changed the following line + // The table_k1 was initialized from 0 now it is 1 + KmerHashTable table_k1(1, 1); + std::vector values; + vector values_k1; + // values.clear(); + + Loader::fill_table(table, chrom, values); + Loader::fill_table(table_k1, chrom, values_k1); +// int tmplate = get_template(chrom->getHeader(), templates); + Point *p = new DivergencePoint(values, chrom->size()); +// cout << "mag: " << ((DivergencePoint*)p)->getPseudoMagnitude() << std::endl; + p->set_1mers(values_k1); + p->set_header(chrom->getHeader()); + p->set_length(chrom->getEffectiveSize()); + if (set_seq) { + p->set_data_str(*chrom->getBase()); + } + // Added by Hani Z. Girgis on Oct 7 2018 + p->setK(k); + DivergencePoint* q = dynamic_cast*>(p); + const auto N = q->points.size(); + double aq = (double) q->getPseudoMagnitude() / N; + double sq = 0; + for (auto i = 0; i < N; i++) { + double qdiff = q->points[i] - aq; + sq += qdiff * qdiff; + } + sq = sqrt(sq / N); + q->set_stddev(sq); + p->set_id(id); + #pragma omp atomic + id++; + + // Clean + + return p; +} + + + + + + + +template +std::vector*> Loader::load_next(int tid) +{ + std::vector*> points; + for (size_t i = 0; i < cache_list.at(tid).size(); i++) { + auto pr = cache_list.at(tid).at(i); + Point* p = get_point(pr.first, *pr.second, id_list.at(tid), k); + points.push_back(p); + delete pr.second; + } + cache_list.at(tid).clear(); + return points; +} + +template +std::pair Loader::next() +{ + auto n = maker->next(); + if (n.first != "") { + return n; + } + delete maker; + maker = NULL; + file_idx++; + if (file_idx >= files.size()) { + return n; + } + maker = new SingleFileLoader(files.at(file_idx)); + return maker->next(); +} + +template class Loader; +template class Loader; +template class Loader; +template class Loader; +template class Loader; +template class Loader; diff --git a/src/cluster/src/Loader.h b/src/clutil/Loader.h similarity index 70% rename from src/cluster/src/Loader.h rename to src/clutil/Loader.h index 
28da845..ec3f569 100644 --- a/src/cluster/src/Loader.h +++ b/src/clutil/Loader.h @@ -11,9 +11,18 @@ #ifndef LOADER_H #define LOADER_H -#include "Point.h" +#include + #include "SingleFileLoader.h" -#include "ClusterFactory.h" +#include "Point.h" +#include "DivergencePoint.h" +#include "../nonltr/KmerHashTable.h" +// Add by Hani Z. Girgis, PhD on Oct 2, 2018 +#include "../nonltr/ChromosomeOneDigit.h" +#include "../nonltr/ChromosomeOneDigitDna.h" +#include "../nonltr/ChromosomeOneDigitProtein.h" + + template class Loader { @@ -41,6 +50,9 @@ class Loader { }; ~Loader() { + if (get_warning() != "") { + cerr << get_warning() << endl; + } cache_list.clear(); id_list.clear(); if (maker != NULL) { @@ -55,7 +67,11 @@ class Loader { // multi-thread accessible std::vector*> load_next(int tid); - static Point* get_point(std::string header, const std::string &base, uintmax_t& id, int k); + static Point* get_point(std::string header, const std::string &base, uintmax_t& id, int k, bool set_seq=true); + static Point* get_point(ChromosomeOneDigit* dna, uintmax_t& id, int k, bool set_seq=true); + + static void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values); + static std::string get_warning(); private: std::pair next(); @@ -69,5 +85,7 @@ class Loader { std::vector files; size_t file_idx = 0; SingleFileLoader *maker = NULL; + }; + #endif diff --git a/src/cluster/src/Point.h b/src/clutil/Point.h similarity index 76% rename from src/cluster/src/Point.h rename to src/clutil/Point.h index a70bc20..4aac8ff 100644 --- a/src/cluster/src/Point.h +++ b/src/clutil/Point.h @@ -13,7 +13,7 @@ #define POINT_H #include -#include "../../nonltr/ChromosomeOneDigit.h" +#include "../nonltr/ChromosomeOneDigit.h" /* * Pure virtual class that defines behavior for @@ -51,33 +51,48 @@ class Point { virtual const vector& get_data() const = 0; - void set_header(const std::string c) { header = c; }; + void set_header(const std::string c) { header = string(c); }; const std::string 
get_header() const { return header; }; void set_data_str(const std::string& c) { data = c; }; const std::string & get_data_str() const { return data; }; void set_1mers(const vector &vec) { - for (auto i = 0; i < 4; i++) { - one_mers[i] = vec[i]; - } + // for (auto i = 0; i < Util::getAlphabetSize(); i++) { + // one_mers[i] = vec[i]; + // } + one_mers = vector(vec); } + vector get_1mers() const { - vector vec; - for (auto i = 0; i < 4; i++) { - vec.push_back(one_mers[i]); - } - return vec; + // vector vec; + // for (auto i = 0; i < Util::getAlphabetSize(); i++) { + // vec.push_back(one_mers[i]); + // } + // return vec; + return one_mers; } virtual unsigned long size() const = 0; virtual void set_id(uintmax_t c_id) = 0;//{ id = c_id; }; virtual const uintmax_t get_id() const = 0;//{ return id; }; virtual void set_length(unsigned long len) = 0; virtual unsigned long get_length() const = 0; + + // Added by Hani Z. Girgis on Oct 7 2018 + int getK(){ + return k; + } + void setK(int k){ + this->k = k; + } + private: - uint64_t one_mers[4]; - std::string header; + vector one_mers; + std::string header; std::string data; + // Added by Hani Z. 
Girgis on Oct 7 2018 + // The k in k-mer used to build the table + int k; }; #endif diff --git a/src/clutil/Progress.cpp b/src/clutil/Progress.cpp new file mode 100644 index 0000000..138763b --- /dev/null +++ b/src/clutil/Progress.cpp @@ -0,0 +1,79 @@ +/* -*- C++ -*- + * + * Progress.cpp + * + * Author: Benjamin T James + * + * Progress bar that uses carriage return '\r' + * to seek to the beginning of a line to redraw + */ +#include "Progress.h" +#include +Progress::Progress(long num, std::string prefix_) +{ + pmax = num; + ended = 0; + pcur = 0; + old_prog = -1; + prefix = prefix_; + barWidth = 70 - (prefix.size()+1); + print(); +} + +void Progress::print() +{ + #ifndef NOPROG + double prog = (double)pcur / pmax; + if (old_prog != int(prog * 100)) { + std::cout << prefix << " ["; + int pos = barWidth * prog; + for (int i = 0; i < barWidth; i++) { + if (i < pos) { + std::cout << "="; + } else if (i == pos) { + std::cout << ">"; + } else { + std::cout << " "; + } + } + std::cout << "] " << int(prog * 100.0) << " %\r"; + std::cout.flush(); + } + old_prog = int(prog * 100); + #endif +} + +void Progress::end() +{ + if (!ended) { + pcur = pmax; + print(); + std::cout << std::endl; + } + ended = true; +} + + +void Progress::set(int num) +{ + pcur = num; + print(); +} + +void Progress::operator++() +{ + pcur++; + print(); +} +void Progress::operator++(int) +{ + print(); + pcur++; +} + + +void Progress::operator+=(size_t num) +{ + pcur += num; + print(); +} diff --git a/src/cluster/src/Progress.h b/src/clutil/Progress.h similarity index 75% rename from src/cluster/src/Progress.h rename to src/clutil/Progress.h index f59d948..fb7424b 100644 --- a/src/cluster/src/Progress.h +++ b/src/clutil/Progress.h @@ -3,6 +3,10 @@ * Progress.h * * Author: Benjamin T James + * + * Progress bar that uses carriage return '\r' + * to seek to the beginning of a line to redraw + * */ #include #ifndef PROGRESS_H @@ -16,14 +20,14 @@ class Progress { void operator++(); void operator++(int); void 
operator+=(size_t); + void set(int); private: void print(); long pmax; long pcur; + long old_prog; bool ended; std::string prefix; int barWidth; - - std::string last; }; #endif diff --git a/src/clutil/Random.h b/src/clutil/Random.h new file mode 100644 index 0000000..52e1274 --- /dev/null +++ b/src/clutil/Random.h @@ -0,0 +1,61 @@ +// -*- C++ -*- +/* + * Random.h + * + * Author: Benjamin T James + */ + +#ifndef RANDOM_H +#define RANDOM_H +#include +#include +class Random { +public: + Random(std::random_device::result_type seed=0xAA) : mt(seed) {} + + template + T randMod(T max) { + T res; +#pragma omp critical + { + if (max == 0) { + res = 0; + } else { + std::uniform_int_distribution distribution(0, max-1); + res = distribution(mt); + } + } + return res; + } + + double random() { + double res = 0; + #pragma omp critical + { + std::uniform_real_distribution distribution(0.0, 1.0); + res = distribution(mt); + } + return res; + } + double rand_between(double id, double range, double low, double high) { + double res = 0; + #pragma omp critical + { + double mn = std::max(id - range, low); + double mx = std::min(id + range, high); + std::uniform_real_distribution distribution(mn, mx); + + res = distribution(mt); + } + return res; + } + std::random_device::result_type nextRandSeed() { + using rt = std::random_device::result_type; + return randMod(std::numeric_limits::max()); + } + std::mt19937& gen() { return mt; } +private: + std::mt19937 mt; + +}; +#endif diff --git a/src/cluster/src/SingleFileLoader.cpp b/src/clutil/SingleFileLoader.cpp similarity index 67% rename from src/cluster/src/SingleFileLoader.cpp rename to src/clutil/SingleFileLoader.cpp index e62715f..9b61024 100644 --- a/src/cluster/src/SingleFileLoader.cpp +++ b/src/clutil/SingleFileLoader.cpp @@ -82,3 +82,42 @@ std::pair SingleFileLoader::next() // std::cout << "next(): " << diff / CLOCKS_PER_SEC << std::endl; return ret; } +ChromosomeOneDigitDna* SingleFileLoader::nextChrom() +{ + ChromosomeOneDigitDna* 
ret = NULL; + if (!in->good()) { + return ret; + } + if (is_first) { + safe_getline(*in, buffer); + is_first = false; + } + do { + if (buffer[0] == '>') { + if (ret != NULL) { + ret->finalize(); + return ret; + } + ret = new ChromosomeOneDigitDna(); + ret->setHeader(buffer); + } else if (buffer[0] == ' ' || buffer[0] == '\t') { + bool all_spaces = true; + for (auto c : buffer) { + if (c != ' ' && c != '\t') { + all_spaces = false; + } + } + if (!all_spaces) { + std::ostringstream oss; + oss << ret->getHeader() << buffer; + std::string new_header = oss.str(); + ret->setHeader(new_header); + } + } else { + ret->appendToSequence(buffer); + } + safe_getline(*in, buffer); + } while (in->good()); + ret->finalize(); + return ret; +} diff --git a/src/cluster/src/SingleFileLoader.h b/src/clutil/SingleFileLoader.h similarity index 85% rename from src/cluster/src/SingleFileLoader.h rename to src/clutil/SingleFileLoader.h index d6b3c5d..d944a30 100644 --- a/src/cluster/src/SingleFileLoader.h +++ b/src/clutil/SingleFileLoader.h @@ -11,7 +11,7 @@ #define SINGLEFILELOADER_H #include - +#include "../nonltr/ChromosomeOneDigitDna.h" class SingleFileLoader { public: SingleFileLoader(std::string file); @@ -21,6 +21,7 @@ class SingleFileLoader { } } std::pair next(); + ChromosomeOneDigitDna* nextChrom(); private: std::ifstream *in; std::string buffer; diff --git a/src/fastcar/FC_Runner.cpp b/src/fastcar/FC_Runner.cpp new file mode 100644 index 0000000..34bd459 --- /dev/null +++ b/src/fastcar/FC_Runner.cpp @@ -0,0 +1,635 @@ +/* -*- C++ -*- + * + * Runner.cpp + * + * Author: Benjamin T James + * + * Runner class that parses options and controls + * the process of the program. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../nonltr/ChromListMaker.h" +#include "../clutil/DivergencePoint.h" +#include "FC_Runner.h" +#include "../predict/Predictor.h" +#include "../clutil/Loader.h" +#include "../clutil/Progress.h" +#include "../clutil/Datatype.h" +#include + + +Runner::Runner(int argc, char **argv) +{ + get_opts(argc, argv); + srand(10); +} + +int parseLine(char* line) { + int i = strlen(line); + const char* p = line; + while (*p < '0' || *p > '9') p++; + line[i-3] = '\0'; + i = atoi(p); + return i; +} + +void mem_used(std::string prefix) +{ + struct sysinfo memInfo; + sysinfo(&memInfo); + FILE* file = fopen("/proc/self/status", "r"); + int result = -1; + char line[128]; + while (fgets(line, 128, file)) { + if (strncmp(line, "VmSize:", 7) == 0) { + result = parseLine(line); + break; + } + } + fclose(file); + cout << prefix << ": used memory: " << result << " KB" << endl; +} + +int Runner::run() +{ + if (pred64) { + k = pred64->get_k(); + } else if (k == -1) { + uintmax_t total_length = 0; + uintmax_t total_num_seq = 0; + largest_count = 0; + Progress progress(files.size(), "Reading in sequences"); + uintmax_t num_seq = 10000; + for (auto i = 0; i < files.size(); i++) { + auto f = files.at(i); + SingleFileLoader maker(f); + + progress++; + uint64_t local_largest_count = 0; + std::pair pr; + while ((pr = maker.next()).first != "" && total_num_seq++ < num_seq) { + total_length += pr.second->length(); + } + } + progress.end(); + double avg_length = (double)total_length / total_num_seq; + k = std::max((int)(ceil(log(avg_length) / log(4)) - 1), 2); + } + cout << "K: " << k << endl; +// #pragma omp parallel for reduction(max:largest_count) +// for (size_t i = 0; i < sequences.size(); i++) { +// std::vector values; +// KmerHashTable table(k, 1); +// ChromosomeOneDigitDna chrom; +// chrom.setSequence(*sequences[i].second); +// chrom.setHeader(sequences[i].first); +// chrom.finalize(); +// 
fill_table(table, &chrom, values); +// uint64_t l_count = 0; +// for (auto elt : values) { +// if (elt > l_count) { +// l_count = elt; +// } +// } +// if (l_count > largest_count) { +// largest_count = l_count; +// } +// values.clear(); +// } +// largest_count *= 2; + uint64_t cap = 10000; + std::vector sequences(cap); + if (pred64 == NULL || Datatype::get() == "") { + uint64_t idx = 0; + Progress progress(cap, "Reading in sequences"); + uint64_t largest_count = 0; + + + for (auto i = 0; i < files.size(); i++) { + auto f = files.at(i); + SingleFileLoader maker(f); + ChromosomeOneDigitDna* chrom = NULL; + while ((chrom = maker.nextChrom()) != NULL && idx < cap) { + + sequences[idx] = chrom; + idx++; + progress++; + } + } + sequences.resize(idx); + +#pragma omp parallel for reduction(max:largest_count) + for (int i = 0; i < sequences.size(); i++) { + auto chrom = sequences[i]; + std::vector values; + KmerHashTable table(k, 1); + Loader::fill_table(table, chrom, values); + uint64_t l_count = *std::max_element(std::begin(values), std::end(values)); + if (l_count > largest_count) { + largest_count = l_count; + } + } + progress.end(); + } else if (pred64 != NULL) { + sequences.clear(); + Datatype::set(pred64->get_datatype()); + similarity = pred64->get_id(); + } + if (Datatype::get() != "") { + std::string type = Datatype::get(); + if (type == "uint8_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint16_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint32_t") { + largest_count = std::numeric_limits::max(); + } else if (type == "uint64_t") { + largest_count = std::numeric_limits::max(); + } + } + if (largest_count <= std::numeric_limits::max()) { + Datatype::set("uint8_t"); + cout << "Using 8 bit histograms" << endl; + return do_run(sequences); + } else if (largest_count <= std::numeric_limits::max()) { + Datatype::set("uint16_t"); + cout << "Using 16 bit histograms" << endl; + return do_run(sequences); + } else if 
(largest_count <= std::numeric_limits::max()){ + Datatype::set("uint32_t"); + cout << "Using 32 bit histograms" << endl; + return do_run(sequences); + } else if (largest_count <= std::numeric_limits::max()) { + Datatype::set("uint64_t"); + cout << "Using 64 bit histograms" << endl; + return do_run(sequences); + } else { + throw "Too big sequence"; + } +} + + +void Runner::usage(std::string progname) const +{ + int num_threads = omp_get_max_threads(); + std::cout << "Usage: " << progname << " *.fasta --query queryFile.fasta --id 0.90 [optional_arguments]" << std::endl << std::endl; + std::cout << "Options: " << std::endl; + std::cout << "\t" << "--id "<<"\t" <<"identityValue" << "\t\t" << "Use this alignment identity (0.0 to 1.0) for classification" << std::endl; + std::cout << "\t" << "-q|--query "<<"\t" <<"queryFile.fasta" << "\t\t" << "Run the database against this query file" << std::endl; + std::cout << "\t" << "-k|--kmer "<<"\t" << "N"<<"\t\t\t" << "Usually calculated by going through the data and finding the ceil(log_4(Length_avg))-1,"<< std::endl; + std::cout << "\t\t\t\t\t\t " << "so if provided, it can save computational time. Increasing the k-mer increases memory usage four-fold."<< std::endl; + std::cout << "\t" << "--datatype "<<"\t" <<"uintX_t" << "\t\t\t" << "If provided, instead of running through the data another time," << std::endl; + std::cout << "\t\t\t\t\t\t " << "provide the maximum data type to not overflow, one of {uint8_t, uint16_t, uint32_t, uint64_t}" << std::endl; + std::cout << "\t" << "-c|--chunk "<<"\t" << chunk_size << "\t\t\t" << "Process N (a positive integer number) sequences at once in the multithreading model." 
<< std::endl; + std::cout << "\t" << "--dump "<<"\t" <<"weights.txt" << "\t\t" << "Instead of running, only train the model(s) and dump the weights" << std::endl; + std::cout << "\t" << "--no-format "<<"\t\t\t\t" << "Print the full header instead of the abbreviated header when printing output" << std::endl; + + + std::cout << "\t" << "-o|--output "<<"\t" <<"output.search" << "\t\t" << "Output file, to which numbers 0 through [num_threads] are appended. Each file contains data computed by each thread." << std::endl; + std::cout << "\t" << "-r|--recover"<<"\t" <<"weights.txt" << "\t\t" << "Instead of training, use a pre-computed weights file to avoid re-training" << std::endl; + std::cout << "\t" << "-f|--feat "<<"\t" <<"fast" << "\t\t\t"<<"Use a small,fast set of possible features (fast) or a larger, slower-to-train set of possible features (slow)"<= 1) { + throw std::invalid_argument(""); + } + } catch(std::exception e) { + cerr << "Similarity must be between 0 and 1" << endl; + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "-c" || arg == "--chunk") && i + 1 < argc) { + chunk_size = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (chunk_size <= 0) { + fprintf(stderr, "Chunk size must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "--dump") && i + 1 < argc) { + dump_str = argv[++i]; + dump = true; + } else if (arg == "--noformat" || arg == "--no-format") { + format = false; + } else if ((arg == "--datatype") && i + 1 < argc) { + std::string val = argv[++i]; + if (val == "uint8_t" || val == "8" || val == "uint8") { + Datatype::set("uint8_t"); + } else if (val == "uint16_t" || val == "16" || val == "uint16") { + Datatype::set("uint16_t"); + } else if (val == "uint32_t" || val == "32" || val == "uint32") { + Datatype::set("uint32_t"); + } else if (val == "uint64_t" || val == "64" || val == "uint64") { + Datatype::set("uint64_t"); + } else { + cerr << "Histogram data type must have a 
valid data type or size: one of 8, 16, 32, 64" << endl; + exit(EXIT_FAILURE); + } + } else if ((arg == "-k" || arg == "--kmer") && i + 1 < argc) { + k = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (k <= 0) { + fprintf(stderr, "K must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + align = false; + i++; + } else if ((arg == "-o" || arg == "--output") && i + 1 < argc) { + output = string(argv[i+1]); + i++; + } else if ((arg == "-q" || arg == "--query") && i + 1 < argc) { + char* qfile = argv[++i]; + struct stat st; + stat(qfile, &st); + if (S_ISREG(st.st_mode)) { + qfiles.emplace_back(qfile); + } else { + usage(*argv); + exit(EXIT_FAILURE); + } + } else if ((arg == "-r" || arg == "--recover") && i + 1 < argc) { + recover = true; + dump_str = argv[++i]; + pred64 = new Predictor(dump_str); + similarity = pred64->get_id(); + k = pred64->get_k(); + } else if ((arg == "-f" || arg == "--feat") && i + 1 < argc) { + std::string val = argv[++i]; + if (val == "fast") { + feats = PRED_FEAT_FAST; + } else if (val == "slow") { + feats = PRED_FEAT_FAST | PRED_FEAT_DIV; + } else { + cerr << "Features must be either \"fast\" or \"slow\"" << endl; + } + } else if ((arg == "-m" || arg == "--mode") && i + 1 < argc) { + std::string val = argv[++i]; + if (val == "c") { + mode |= PRED_MODE_CLASS; + } else if (val == "r") { + mode |= PRED_MODE_REGR; + } else if (val == "cr" || val == "rc") { + mode |= PRED_MODE_CLASS | PRED_MODE_REGR; + } else { + cerr << "Mode must be either c, r, or a combination" << endl; + exit(EXIT_FAILURE); + } + } else if ((arg == "-s" || arg == "--sample") && i + 1 < argc) { + sample_size = strtol(argv[i+1], NULL, 10); + if (errno) { + perror(argv[i+1]); + exit(EXIT_FAILURE); + } else if (sample_size <= 0) { + fprintf(stderr, "Sample size must be greater than 0.\n"); + exit(EXIT_FAILURE); + } + i++; + } else if ((arg == "--mut-type") && i + 1 < argc) { + std::string opt = argv[i+1]; + if (opt == "all") { + 
mut_type = HandleSeq::BOTH | HandleSeq::ATYPICAL; + } else if (opt == "both") { + mut_type = HandleSeq::BOTH; + } else if (opt == "snp" || opt == "single") { + mut_type = HandleSeq::SINGLE; + } else if (opt == "nonsingle-typical") { + mut_type = HandleSeq::NON_SINGLE; + } else if (opt == "nonsingle-all") { + mut_type = HandleSeq::NON_SINGLE | HandleSeq::ATYPICAL; + } else if (opt == "all-but-reversion") { + mut_type = HandleSeq::BOTH | HandleSeq::TRANSLOCATION; + } else if (opt == "all-but-translocation") { + mut_type = HandleSeq::BOTH | HandleSeq::REVERSION; + } else { + cerr << "Options for mutation type are \"single\", \"nonsingle-typical\", \"both\" (for single and nonsingle-typical), \"nonsingle-all\", and \"all\" (single, nonsingle, and atypical nonsingle)." << endl; + exit(1); + } + i++; + } else if ((arg == "-t" || arg == "--threads") && i + 1 < argc) { + try { + std::string opt = argv[i+1]; + int threads = std::stoi(opt); + if (threads <= 0) { + throw std::invalid_argument(""); + } + #ifdef _OPENMP + omp_set_num_threads(threads); + #endif + } catch (std::exception e) { + cerr << "Number of threads must be greater than 0." 
<< endl; + exit(1); + } + + i++; + + } else if ((arg == "-h") || (arg == "--help")) { + usage(*argv); + exit(EXIT_FAILURE); + } else { + struct stat st; + if (stat(argv[i], &st) == 0 && S_ISREG(st.st_mode)) { + files.push_back(argv[i]); + } else { + usage(*argv); + exit(EXIT_FAILURE); + } + } + } + if (files.empty()) { + usage(*argv); + exit(EXIT_FAILURE); + } +} + + +double global_mat[4][4] = {{1, -1, -1, -1}, + {-1, 1, -1, -1}, + {-1, -1, 1, -1}, + {-1, -1, -1, 1}}; +double global_sigma = -2; +double global_epsilon = -1; + +template +long bin_search(const std::vector*> &points, size_t begin, size_t last, size_t length) +{ + if (last < begin) { + return 0; + } + size_t idx = begin + (last - begin) / 2; + if (points.at(idx)->get_length() == length) { + while (idx > 0 && points[idx-1]->get_length() == length) { + idx--; + } + return idx; + } else if (points.at(idx)->get_length() > length) { + if (begin == idx) { return idx; } + return bin_search(points, begin, idx-1, length); + } else { + return bin_search(points, idx+1, last, length); + } +} + +std::string format_header(std::string hdr) +{ + long len = hdr.length(); + long b_idx = 0; + if (hdr[0] == '>') { + b_idx++; + } + for (long i = b_idx; i < len; i++) { + if (hdr[i] == ' ' || hdr[i] == '\t') { + len = i + 1; + break; + } + } + return hdr.substr(b_idx, len - b_idx); +} + +template +void work(const std::vector*> &queries, const std::vector*> &pts, double similarity, Predictor* pred, std::string delim, std::ofstream &out, uintmax_t &num_pred_pos, bool format) +{ + if (pts.empty()) { + return; + } + uint8_t mode = pred->get_mode(); + for (auto query : queries) { + size_t q_len = query->get_length(); + size_t begin_length = q_len * similarity; + size_t end_length = q_len / similarity; + size_t start = bin_search(pts, 0, pts.size()-1, + begin_length); + + for (size_t i = start; + i < pts.size() && pts[i]->get_length() <= end_length; + i++) { + double sim = 0.0; + bool cls = true; + + if (mode & PRED_MODE_CLASS) { + 
cls = pred->close(pts[i], query); + + } + if (!cls) { + continue; + } + num_pred_pos++; + if (mode & PRED_MODE_REGR) { + sim = pred->similarity(pts[i], query); + } else { + sim = 1; + } + if (mode & PRED_MODE_CLASS) { +// sim = (sim > similarity) ? sim : 0; + } + if (sim > 0) { + if (format) { + out << format_header(query->get_header()) << delim << format_header(pts[i]->get_header()) << delim << 100 * sim << endl; + } else { + out << query->get_header() << delim << pts[i]->get_header() << delim << 100 * sim << endl; + } + } + } + } +} + +template +int Runner::do_run(std::vector &seqs) +{ + using DNA=ChromosomeOneDigit; + using pvec = vector *>; + using pmap = map*, pvec*>; + srand(0xFF); + mem_used("before do_run"); + size_t num_points = 0; + uintmax_t _id = 0; + + + + + // Sorting all sequences based on length + std::sort(seqs.begin(), seqs.end(), [](DNA* a, DNA* b) { + return a->getBase()->length() < b->getBase()->length(); + }); + cout << "sample_size: " << sample_size << endl; + double increment = std::max(1.0, (double)seqs.size() / sample_size); + for (double i = 0; round(i) < seqs.size(); i += increment) { + indices.push_back(round(i)); + // cout << "index: " << round(i) << " length: " << seqs.at(round(i)).second->length() << endl; + } + std::vector*> trpoints(indices.size()); + #pragma omp parallel for + for (size_t i = 0; i < indices.size(); i++) { + auto chrom = seqs.at(indices.at(i)); + Point* p = Loader::get_point(chrom, _id, k); + trpoints[i] = p; + } + for (auto p : seqs) { + delete p; + } + seqs.clear(); + + indices.clear(); + mem_used("after selection"); + cout << "TRpoints.size(): " << trpoints.size() << endl; + + // std::sort(trpoints.begin(), trpoints.end(), [](const Point* a, const Point* b) { + // return a->get_length() < b->get_length(); }); + + int n_threads = omp_get_max_threads(); + Predictor *pred = NULL; + if (recover) { + pred = new Predictor(dump_str); + + } else { + if (mode == 0) { + cout << "No mode specified, using regression and 
classification by default" << endl; + mode = PRED_MODE_REGR | PRED_MODE_CLASS; + } + if (feats == 0) { + cout << "No feature set specified, using fast features by default" << endl; + feats = PRED_FEAT_FAST; + } + if ((mode & PRED_MODE_CLASS) == PRED_MODE_CLASS && similarity < 0) { + cout << "Classification specified, but no identity score given. Please supply a cutoff with \"--id\"" << endl; + exit(EXIT_FAILURE); + } else if (similarity < 0) { + similarity = 0.9; + } + + pred = new Predictor(k, similarity, mode, feats, mut_type, 4); + auto before = clock(); + mem_used("before predictor training"); + pred->train(trpoints, _id, 10, sample_size); + + double elapsed = (clock() - before); + elapsed /= CLOCKS_PER_SEC; + cout << "Training time: " << elapsed << endl; + for (auto p : trpoints) { + delete p; + } + trpoints.clear(); + if (dump) { + pred->save(dump_str, Datatype::get()); + exit(0); + } + } + mem_used("after predictor training"); + + std::vector output_list; + for (int i = 0; i < n_threads; i++) { + std::ostringstream oss; + oss << output << i; + output_list.emplace_back(oss.str()); + } + + + string delim = "\t"; + if (!format) { + delim = "!"; + } + uint64_t query_id_start = num_points; + int num_query = num_points; + Loader qloader(qfiles, n_threads * num_points, chunk_size, 1, k, query_id_start); + mem_used("before loop"); + uintmax_t num_pred_pos = 0; + while (!qloader.done()) { + qloader.preload(0); + auto queries = qloader.load_next(0); + Loader loader(files, 0, chunk_size, n_threads, k); + + + while (!loader.done()) { + int n_iter = n_threads; + mem_used("during inner loop"); + for (int h = 0; h < n_iter; h++) { + loader.preload(h); + } + #pragma omp parallel for + for (int h = 0; h < n_iter; h++) { + int tid = omp_get_thread_num(); + auto pts = loader.load_next(tid); + std::sort(std::begin(pts), std::end(pts), [](Point*a, Point*b) { + return a->get_length() < b->get_length(); + }); + work(queries, pts, similarity, pred, delim, output_list[tid], 
num_pred_pos, format); + for (auto p : pts) { + delete p; + } + } + } + + for (auto q : queries) { + delete q; + } + mem_used("mid loop"); + } + mem_used("after loop"); + cout << "# of predicted positive: " << num_pred_pos << endl; + std::string warn = Loader::get_warning(); + if (warn != "") { + cout << warn << endl; + } + return 0; +} + + +template +void Runner::print_output(const map*, vector*>*> &partition) const +{ + cout << "Printing output" << endl; + std::ofstream ofs; + ofs.open(output, std::ofstream::out); + int counter = 0; + for (auto const& kv : partition) { + if (kv.second->size() == 0) { + continue; + } + ofs << ">Cluster " << counter << endl; + int pt = 0; + for (auto p : *kv.second) { + string s = p->get_header(); + ofs << pt << "\t" << p->get_length() << "nt, " << s << "... " << endl; + pt++; + } + counter++; + } + ofs.close(); +} diff --git a/src/fastcar/FC_Runner.h b/src/fastcar/FC_Runner.h new file mode 100644 index 0000000..54b851a --- /dev/null +++ b/src/fastcar/FC_Runner.h @@ -0,0 +1,53 @@ +/* -*- C++ -*- + * + * Runner.h + * + * Author: Benjamin T James + * + * Runner class, sets default params + * and runs program + */ +#ifndef FC_RUNNER_H +#define FC_RUNNER_H + +#include +#include +#include +#include "../clutil/Point.h" +#include "../predict/Predictor.h" +#include "../predict/HandleSeq.h" +#include "../nonltr/ChromosomeOneDigitDna.h" +using namespace std; + +class Runner { +public: + Runner(int argc, char** argv); + ~Runner() { indices.clear(); files.clear(); qfiles.clear(); if (pred64) {delete pred64;}}; + int run(); +private: + void usage(std::string progname) const; + template int do_run(std::vector &sequences); + template void print_output(const map*, vector*>*> &m) const; + int k = -1; + int bandwidth; + double similarity = -1; + long largest_count = 0; + bool align = false; + bool recover = false; + int sample_size = 300; + int mut_type = HandleSeq::SINGLE; + uint8_t mode = 0; + uint64_t feats = 0; + uint64_t chunk_size = 10000; + 
std::vector files, qfiles; + std::vector indices; + bool dump = false; + bool format = true; + string output = "output.search"; + string dump_str = "weights.txt"; + void get_opts(int argc, char** argv); + Predictor *pred64 = NULL; + + +}; +#endif diff --git a/src/fastcar/fastcar.cpp b/src/fastcar/fastcar.cpp new file mode 100644 index 0000000..c4f81fa --- /dev/null +++ b/src/fastcar/fastcar.cpp @@ -0,0 +1,12 @@ +/* -*- C++ -*- + * + * main.cpp + * + * Author: Benjamin T James + */ +#include "FC_Runner.h" +int main(int argc, char **argv) +{ + Runner runner(argc, argv); + return runner.run(); +} diff --git a/src/nonltr/ChromListMaker.cpp b/src/nonltr/ChromListMaker.cpp index e684c3a..5857c07 100644 --- a/src/nonltr/ChromListMaker.cpp +++ b/src/nonltr/ChromListMaker.cpp @@ -9,8 +9,9 @@ namespace nonltr { -ChromListMaker::ChromListMaker(string seqFileIn) { +ChromListMaker::ChromListMaker(string seqFileIn, bool is_oneseq_) { seqFile = seqFileIn; + is_oneseq = is_oneseq_; chromList = new vector(); } @@ -50,36 +51,110 @@ const vector * ChromListMaker::makeChromList() { ifstream in(seqFile.c_str()); bool isFirst = true; Chromosome * chrom; - + vector size_list = getSize(); + uint64_t cur_seq = 0; + if (is_oneseq) { + uint64_t sum = 0; + for (uint64_t len : size_list) { + sum += len + 50; + } + size_list.clear(); + size_list.push_back(sum); + } while (in.good()) { string line; safe_getline(in, line); if (line[0] == '>') { if (!isFirst) { - chrom->finalize(); - chromList->push_back(chrom); + if (is_oneseq) { + std::string interseq(50, 'N'); + // chrom->insert(interseq); + chrom->appendToSequence(interseq); + } else { + chrom->finalize(); + chromList->push_back(chrom); + chrom = new Chromosome(size_list.at(cur_seq++)); + chrom->setHeader(line); + } } else { isFirst = false; + chrom = new Chromosome(size_list.at(cur_seq++)); + chrom->setHeader(line); } + } else if (line[0] == ' ' || line[0] == '\t') { + } else { + // chrom->insert(line); + chrom->appendToSequence(line); + } + 
} + chrom->finalize(); + chromList->push_back(chrom); + in.close(); - chrom = new Chromosome(); - chrom->setHeader(line); + return chromList; +} + +const vector ChromListMaker::getSize() { + ifstream in(seqFile.c_str()); + vector size_list; + uint64_t current_size = 0; + while (in.good()) { + string line; + safe_getline(in, line); + if (line[0] == '>') { + if (current_size > 0) { + size_list.push_back(current_size); + } + current_size = 0; } else if (line[0] == ' ' || line[0] == '\t') { - bool all_spaces = true; - for (auto c : line) { - if (c != ' ' && c != '\t') { - all_spaces = false; + } else { + current_size += line.length(); + } + } + size_list.push_back(current_size); + return size_list; +} +const vector * ChromListMaker::makeChromOneDigitDnaList() { + ifstream in(seqFile.c_str()); + bool isFirst = true; + ChromosomeOneDigitDna * chrom; + vector size_list = getSize(); + uint64_t cur_seq = 0; + if (is_oneseq) { + uint64_t sum = 0; + for (uint64_t len : size_list) { + sum += len + 50; + } + if (sum > 0) { + sum -= 50; + } + size_list.clear(); + size_list.push_back(sum); + } + while (in.good()) { + string line; + safe_getline(in, line); + if (line[0] == '>') { + if (!isFirst) { + if (is_oneseq) { + std::string interseq(50, 'N'); + chrom->insert(interseq); + } else { + chrom->finalize(); + chromList->push_back(chrom); + chrom = new ChromosomeOneDigitDna(size_list.at(cur_seq++)); + chrom->setHeader(line); } + } else { + isFirst = false; + chrom = new ChromosomeOneDigitDna(size_list.at(cur_seq++)); + chrom->setHeader(line); + } - if (all_spaces) { - continue; - } - std::ostringstream oss; - oss << chrom->getHeader() << line; - std::string new_header = oss.str(); - chrom->setHeader(new_header); + } else if (line[0] == ' ' || line[0] == '\t') { } else { - chrom->appendToSequence(line); + chrom->insert(line); +// chrom->appendToSequence(line); } } chrom->finalize(); @@ -89,10 +164,10 @@ const vector * ChromListMaker::makeChromList() { return chromList; } -const 
vector * ChromListMaker::makeChromOneDigitList() { +const vector * ChromListMaker::makeChromOneDigitProteinList() { ifstream in(seqFile.c_str()); bool isFirst = true; - ChromosomeOneDigit * chrom; + ChromosomeOneDigitProtein * chrom; while (in.good()) { string line; @@ -105,7 +180,7 @@ const vector * ChromListMaker::makeChromOneDigitList() { isFirst = false; } - chrom = new ChromosomeOneDigit(); + chrom = new ChromosomeOneDigitProtein(); chrom->setHeader(line); } else { chrom->appendToSequence(line); diff --git a/src/nonltr/ChromListMaker.h b/src/nonltr/ChromListMaker.h index a60fe2f..1a9d771 100644 --- a/src/nonltr/ChromListMaker.h +++ b/src/nonltr/ChromListMaker.h @@ -1,8 +1,9 @@ /* * ChromListMaker.h * - * Created on: Mar 13, 2014 - * Author: Hani Zakaria Girgis, PhD + * Created on: Mar 13, 2014 + * Modified on: Oct 2, 2018 + * Author: Hani Zakaria Girgis, PhD */ #ifndef CHROMLISTMAKER_H_ @@ -12,7 +13,8 @@ #include #include "Chromosome.h" -#include "ChromosomeOneDigit.h" +#include "ChromosomeOneDigitDna.h" +#include "ChromosomeOneDigitProtein.h" #include "../utility/Util.h" @@ -25,12 +27,14 @@ class ChromListMaker { private: vector * chromList; string seqFile; - + bool is_oneseq; public: - ChromListMaker(string); + ChromListMaker(string, bool is_oneseq_=false); virtual ~ChromListMaker(); + const vector getSize(); const vector * makeChromList(); - const vector * makeChromOneDigitList(); + const vector * makeChromOneDigitDnaList(); + const vector * makeChromOneDigitProteinList(); }; diff --git a/src/nonltr/Chromosome.cpp b/src/nonltr/Chromosome.cpp index 2bea802..7a2f53a 100644 --- a/src/nonltr/Chromosome.cpp +++ b/src/nonltr/Chromosome.cpp @@ -5,6 +5,7 @@ * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH */ #include "Chromosome.h" +#include Chromosome::Chromosome() { header = string(""); @@ -14,6 +15,14 @@ Chromosome::Chromosome() { isFinalized = false; } +Chromosome::Chromosome(uint64_t size) { + header = string(""); + base = string(size, ' '); + str_len = 0; + 
isHeaderReady = false; + isBaseReady = false; + isFinalized = false; +} Chromosome::Chromosome(string fileName) { chromFile = fileName; readFasta(); @@ -32,6 +41,12 @@ Chromosome::Chromosome(string fileName, int len) { help(len, true); } +Chromosome::Chromosome(string fileName, int len, int maxLength) { + chromFile = fileName; + readFasta(maxLength); + help(len, true); +} + Chromosome::Chromosome(string &seq, string &info) { header = info; base = seq; @@ -81,6 +96,22 @@ void Chromosome::appendToSequence(const string& line) { } } +void Chromosome::insert(const string& line) { + if (isFinalized) { + string msg("This chromosome has been finalized. "); + msg.append("The sequence cannot be modified."); + throw InvalidOperationException(msg); + } else { + + memcpy((char*)base.c_str() + str_len, + line.c_str(), + line.length()); + str_len += line.length(); + isBaseReady = true; + } +} + + void Chromosome::finalize() { if (isFinalized) { string msg("This chromosome has been already finalized. "); @@ -97,26 +128,52 @@ void Chromosome::finalize() { } void Chromosome::help(int len, bool canMerge) { + canClean = true; + effectiveSize = 0; segLength = len; segment = new vector *>(); -// segment->reserve(100); + //segment->reserve(100); toUpperCase(); - removeN(); - if (canMerge) { + + if(Util::isDna){ + baseCount = new vector(4, 0); + makeBaseCount(); + } + + removeAmbiguous(); + + if (Util::isDna && (canMerge && base.size() > 20)) { mergeSegments(); } + makeSegmentList(); calculateEffectiveSize(); + } Chromosome::~Chromosome() { base.clear(); - Util::deleteInVector(segment); - segment->clear(); - delete segment; + //cerr << "~Chromosome() 1" << endl; + + if (canClean) { + while (!segment->empty()) { + segment->back()->clear(); + delete segment->back(); + segment->pop_back(); + } + segment->clear(); + + // Util::deleteInVector(segment); + delete segment; + if(Util::isDna){ + baseCount->clear(); + delete baseCount; + } + } + //cerr << "~Chromosome() 2" << endl; } void 
Chromosome::readFasta() { @@ -125,6 +182,14 @@ void Chromosome::readFasta() { base = string(""); ifstream in(chromFile.c_str()); + if (in.fail()) { + string msg("Cannot open "); + msg.append(chromFile); + msg.append(". System code is: "); + msg.append(Util::int2string(errno)); + throw InvalidInputException(msg); + } + while (in.good()) { string line; getline(in, line); @@ -147,6 +212,42 @@ void Chromosome::readFasta() { in.close(); } +void Chromosome::readFasta(int maxLength) { + bool isFirst = true; + header = string(""); + base = string(""); + + ifstream in(chromFile.c_str()); + if (in.fail()) { + string msg("Cannot open "); + msg.append(chromFile); + msg.append(". System code is: "); + msg.append(Util::int2string(errno)); + throw InvalidInputException(msg); + } + + while (in.good() && base.size() < maxLength) { + string line; + getline(in, line); + if (line[0] == '>') { + if (!isFirst) { + string msg = "Chromosome file: "; + msg = msg + chromFile; + msg = + msg + + " must have one sequence only. But it has more than one."; + throw InvalidInputException(msg); + } else { + header = line; + isFirst = false; + } + } else { + base.append(line); + } + } + in.close(); +} + /** * Convert alphabet to upper case if it has not been done before **/ @@ -159,20 +260,21 @@ void Chromosome::toUpperCase() { /** * Segment coordinates are inclusive [s,e] **/ -void Chromosome::removeN() { +void Chromosome::removeAmbiguous() { // Store non-N index int start = -1; + char uncertainChar = Util::isDna? 
'N' : 'X'; for (int i = 0; i < base.size(); i++) { - if (base[i] != 'N' && start == -1) { + if (base[i] != uncertainChar && start == -1) { start = i; - } else if (base[i] == 'N' && start != -1) { + } else if (base[i] == uncertainChar && start != -1) { vector * v = new vector(); v->push_back(start); v->push_back(i - 1); segment->push_back(v); start = -1; - } else if (i == base.size() - 1 && base[i] != 'N' && start != -1) { + } else if (i == base.size() - 1 && base[i] != uncertainChar && start != -1) { vector * v = new vector(); v->push_back(start); v->push_back(i); @@ -181,48 +283,73 @@ void Chromosome::removeN() { start = -1; } } + + // Test code + // for(auto seg : *segment){ + // cerr << seg->at(0) << "-" << seg->at(1) << endl; + // } } /** + * Applied to DNA only--not proteins. * If the gap between two consecutive segments is less than 10 bp. * Segments that are shorter than 20 bp are not added. */ void Chromosome::mergeSegments() { - vector *> * mSegment = new vector *>(); - int s = segment->at(0)->at(0); - int e = segment->at(0)->at(1); + // cout << "Segment size is " << segment->size() << endl; + // cout << base << endl; - for (int i = 1; i < segment->size(); i++) { - int s1 = segment->at(i)->at(0); - int e1 = segment->at(i)->at(1); + if (segment->size() > 0) { + vector *> * mSegment = new vector *>(); + int s = segment->at(0)->at(0); + int e = segment->at(0)->at(1); - if (s1 - e < 10) { - e = e1; - } else { - if (e - s + 1 >= 20) { - vector * seg = new vector(); - seg->push_back(s); - seg->push_back(e); - mSegment->push_back(seg); + for (int i = 1; i < segment->size(); i++) { + int s1 = segment->at(i)->at(0); + int e1 = segment->at(i)->at(1); + + /* + if(e1 - s1 + 1 <= 2000){ + cout << "s1:" << s1 << " e1: " << e1 << endl; + } + */ + + if (s1 - e < 10) { + e = e1; + } else { + if (e - s + 1 >= 20) { + vector * seg = new vector(); + seg->push_back(s); + seg->push_back(e); + mSegment->push_back(seg); + } + + // Test start + /* + if (e - s + 1 <= 100) { + cout 
<< "Removing: " << base.substr(s, e - s + 1) << endl; + } + */ + // Test end + s = s1; + e = e1; } + } - s = s1; - e = e1; + // Handle the last index + if (e - s + 1 >= 20) { + vector * seg = new vector(); + seg->push_back(s); + seg->push_back(e); + mSegment->push_back(seg); } - } - // Handle the last index - if (e - s + 1 >= 20) { - vector * seg = new vector(); - seg->push_back(s); - seg->push_back(e); - mSegment->push_back(seg); + Util::deleteInVector(segment); + segment->clear(); + delete segment; + segment = mSegment; } - - Util::deleteInVector(segment); - segment->clear(); - segment = mSegment; } void Chromosome::makeSegmentList() { @@ -261,14 +388,22 @@ const string* Chromosome::getBase() { return &base; } +string& Chromosome::getBaseRef() { + return base; +} + +string& Chromosome::getHeaderRef() { + return header; +} + const vector *> * Chromosome::getSegment() { return segment; } -void Chromosome::printSegmentList(){ +void Chromosome::printSegmentList() { int l = segment->size(); cout << "Segment list size = " << l << endl; - for(int i = 0; i < l; i++){ + for (int i = 0; i < l; i++) { cout << segment->at(i)->at(0) << "\t"; cout << segment->at(i)->at(1) << endl; } @@ -296,6 +431,11 @@ int Chromosome::getEffectiveSize() { } int Chromosome::getGcContent() { + if(!Util::isDna){ + cerr << "Calculating GC content on a protein sequence is not allowed." << endl; + throw std::exception(); + } + int gc = 0; int size = base.size(); for (int i = 0; i < size; i++) { @@ -306,3 +446,37 @@ int Chromosome::getGcContent() { } return gc; } + +void Chromosome::makeBaseCount() { + if(!Util::isDna){ + cerr << "Counting nucleotides in a protein sequence is not allowed." 
<< endl; + throw std::exception(); + } + + int size = base.size(); + for (int i = 0; i < size; i++) { + switch (base.at(i)) { + case 'A': + baseCount->at(0)++; + break; +; case 'C': + baseCount->at(1)++; + break; + case 'G': + baseCount->at(2)++; + break; + case 'T': + baseCount->at(3)++; + break; + } + } +} + +vector * Chromosome::getBaseCount() { + if(!Util::isDna){ + cerr << "Counting nucleotides in a protein sequence is not allowed." << endl; + throw std::exception(); + } + + return baseCount; +} diff --git a/src/nonltr/Chromosome.h b/src/nonltr/Chromosome.h index 0632458..adb42c2 100644 --- a/src/nonltr/Chromosome.h +++ b/src/nonltr/Chromosome.h @@ -27,9 +27,11 @@ namespace nonltr { class Chromosome: public IChromosome { public: Chromosome(); + Chromosome(uint64_t); Chromosome(string); Chromosome(string, bool); Chromosome(string, int); + Chromosome(string, int, int); Chromosome(string &, string&); Chromosome(string &, string&, int); @@ -37,6 +39,9 @@ class Chromosome: public IChromosome { virtual ~Chromosome(); + virtual string& getBaseRef(); + virtual string& getHeaderRef(); + virtual const string* getBase(); virtual const vector *> * getSegment(); virtual void printSegmentList(); @@ -47,19 +52,23 @@ class Chromosome: public IChromosome { virtual void setSequence(string&); virtual void appendToSequence(const string&); virtual void finalize(); - + virtual vector * getBaseCount(); + virtual void insert(const string&); protected: string chromFile; string header; string base; + int str_len; + int effectiveSize; int segLength; vector *> * segment; void readFasta(); + void readFasta(int); void toUpperCase(); - void removeN(); + void removeAmbiguous(); void mergeSegments(); virtual void help(int, bool); void makeSegmentList(); @@ -69,9 +78,11 @@ class Chromosome: public IChromosome { bool isHeaderReady; bool isBaseReady; bool isFinalized; + bool canClean = false; void reverseSegments(); - + void makeBaseCount(); + vector * baseCount; }; } diff --git 
a/src/nonltr/ChromosomeOneDigit.cpp b/src/nonltr/ChromosomeOneDigit.cpp index 9af2c51..2783d7a 100644 --- a/src/nonltr/ChromosomeOneDigit.cpp +++ b/src/nonltr/ChromosomeOneDigit.cpp @@ -3,25 +3,10 @@ * * Created on: Jul 31, 2012 * Author: Hani Zakaria Girgis, PhD at the NCB1/NLM/NIH - * A A - * T T - * G G - * C C - * R G or A - * Y T or C - * M A or C - * K G or T - * S G or C - * W A or T - * H A or C or T - * B G or T or C - * V G or C or A - * D G or T or A - * N G or T or A or C */ #include #include - +#include #include "Chromosome.h" #include "ChromosomeOneDigit.h" #include "../exception/InvalidInputException.h" @@ -32,6 +17,12 @@ namespace nonltr { ChromosomeOneDigit::ChromosomeOneDigit() : Chromosome() { + //cout << "The no args constructor is called" << endl; +} + +ChromosomeOneDigit::ChromosomeOneDigit(uint64_t s) : + Chromosome(s) { + //cout << "The no args constructor is called" << endl; } ChromosomeOneDigit::ChromosomeOneDigit(string fileName) : @@ -39,16 +30,22 @@ ChromosomeOneDigit::ChromosomeOneDigit(string fileName) : help(); } -ChromosomeOneDigit::ChromosomeOneDigit(string seq, string info) : +ChromosomeOneDigit::ChromosomeOneDigit(string fileName, int segmentLength, + int maxLength) : + Chromosome(fileName, segmentLength, maxLength) { + help(); +} + +ChromosomeOneDigit::ChromosomeOneDigit(string& seq, string& info) : Chromosome(seq, info) { + //cout << "Two string constructor is called" << endl; help(); } -void ChromosomeOneDigit::help() { - // Build codes - buildCodes(); - // Modify the sequence in the super class - encodeNucleotides(); +ChromosomeOneDigit::ChromosomeOneDigit(string& seq, string& info, int length) : + Chromosome(seq, info, length) { + //cout << "Two string constructor is called" << endl; + help(); } void ChromosomeOneDigit::finalize() { @@ -56,190 +53,82 @@ void ChromosomeOneDigit::finalize() { help(); } -void ChromosomeOneDigit::buildCodes() { +void ChromosomeOneDigit::help() { + // Can delete the codes + canClean = true; + // 
Make map codes = new map(); - // Certain nucleotides - codes->insert(map::value_type('A', (char) 0)); - codes->insert(map::value_type('C', (char) 1)); - codes->insert(map::value_type('G', (char) 2)); - codes->insert(map::value_type('T', (char) 3)); - - // Common uncertain nucleotide - // codes->insert(map::value_type('N', (char) 4)); - - // Uncertain nucleotides - codes->insert(map::value_type('R', codes->at('G'))); - codes->insert(map::value_type('Y', codes->at('C'))); - codes->insert(map::value_type('M', codes->at('A'))); - codes->insert(map::value_type('K', codes->at('T'))); - codes->insert(map::value_type('S', codes->at('G'))); - codes->insert(map::value_type('W', codes->at('T'))); - codes->insert(map::value_type('H', codes->at('C'))); - codes->insert(map::value_type('B', codes->at('T'))); - codes->insert(map::value_type('V', codes->at('A'))); - codes->insert(map::value_type('D', codes->at('T'))); - codes->insert(map::value_type('N', codes->at('C'))); - codes->insert(map::value_type('X', codes->at('G'))); + // Build codes + buildCodes(); + // Modify the sequence in the super class + encode(); } ChromosomeOneDigit::~ChromosomeOneDigit() { - codes->clear(); - delete codes; -} - -/** - * This method converts nucleotides in the segments to single digit codes - */ -void ChromosomeOneDigit::encodeNucleotides() { - - for (int s = 0; s < segment->size(); s++) { - int segStart = segment->at(s)->at(0); - int segEnd = segment->at(s)->at(1); - for (int i = segStart; i <= segEnd; i++) { - if (codes->count(base[i]) > 0) { - base[i] = codes->at(base[i]); - } else { - string msg = "Invalid nucleotide: "; - msg.append(1, base[i]); - throw InvalidInputException(msg); - } - } - } - - // Digitize skipped segments - int segNum = segment->size(); - if(segNum > 0){ - // The first interval - before the first segment - int segStart = 0; - int segEnd = segment->at(0)->at(0)-1; - - for (int s = 0; s <= segNum; s++) { - for (int i = segStart; i <= segEnd; i++) { - char c = base[i]; - if(c 
!= 'N'){ - if (codes->count(c) > 0) { - base[i] = codes->at(c); - } else { - string msg = "Invalid nucleotide: "; - msg.append(1, c); - throw InvalidInputException(msg); - } + if (canClean) { + codes->clear(); + delete codes; } - } - - // The regular intervals between two segments - if(s < segNum-1){ - segStart = segment->at(s)->at(1)+1; - segEnd = segment->at(s+1)->at(0)-1; - } - // The last interval - after the last segment - else if(s == segNum - 1){ - segStart = segment->at(s)->at(1)+1; - segEnd = base.size()-1; - } - } - } } -/* -void ChromosomeOneDigit::encodeNucleotides() { - int seqLen = base.size(); - - for (int i = 0; i < seqLen; i++) { - if (codes->count(base[i]) > 0) { - base[i] = codes->at(base[i]); - } else { - string msg = "Invalid nucleotide: "; - msg.append(1, base[i]); - throw InvalidInputException(msg); - } - } - -} -*/ - /** - * Cannot be called on already finalized object. - */ -void ChromosomeOneDigit::makeR() { - //cout << "Making reverse ..." << endl; - makeReverse(); - reverseSegments(); -} - -/** - * Cannot be called on already finalized object. + * This method converts nucleotides in the segments to single digit codes */ -void ChromosomeOneDigit::makeRC() { - //cout << "Making reverse complement ..." 
<< endl; - makeComplement(); - makeReverse(); - reverseSegments(); -} - -void ChromosomeOneDigit::makeComplement() { - map complement; - - // Certain nucleotides - complement.insert(map::value_type((char) 0, (char) 3)); - complement.insert(map::value_type((char) 1, (char) 2)); - complement.insert(map::value_type((char) 2, (char) 1)); - complement.insert(map::value_type((char) 3, (char) 0)); - - // Unknown nucleotide - complement.insert(map::value_type('N', 'N')); - // complement.insert(map::value_type((char) 4, (char) 4)); - - // Convert a sequence to its complement - int seqLen = base.size(); - for (int i = 0; i < seqLen; i++) { - if (complement.count(base[i]) > 0) { - base[i] = complement.at(base[i]); - } else { - cerr << "Error: The digit " << (char) base[i]; - cerr << " does not represent a base." << endl; - exit(2); +void ChromosomeOneDigit::encode() { + + for (int s = 0; s < segment->size(); s++) { + int segStart = segment->at(s)->at(0); + int segEnd = segment->at(s)->at(1); + for (int i = segStart; i <= segEnd; i++) { + + if (codes->count(base[i]) > 0) { + base[i] = codes->at(base[i]); + } else { + string msg = "Invalid nucleotide: "; + std::ostringstream oss; + int b_int = base[i]; + oss << msg << b_int; + throw InvalidInputException(oss.str()); + } } } -} - -void ChromosomeOneDigit::makeReverse() { - int last = base.size() - 1; - - // Last index to be switched - int middle = base.size() / 2; - - for (int i = 0; i < middle; i++) { - char temp = base[last - i]; - base[last - i] = base[i]; - base[i] = temp; - } -} -void ChromosomeOneDigit::reverseSegments() { + // Digitize skipped segments + char uncertainChar = Util::isDna? 
'N' : 'X'; int segNum = segment->size(); - int lastBase = size() - 1; - - // Calculate the coordinate on the main strand - for (int i = 0; i < segNum; i++) { - vector * seg = segment->at(i); - - int s = lastBase - seg->at(1); - int e = lastBase - seg->at(0); - seg->clear(); - seg->push_back(s); - seg->push_back(e); - } - - // Reverse the regions within the list - int lastRegion = segNum - 1; - int middle = segNum / 2; - for (int i = 0; i < middle; i++) { - vector * temp = segment->at(lastRegion - i); - (*segment)[lastRegion - i] = segment->at(i); - (*segment)[i] = temp; + if (segNum > 0) { + // The first interval - before the first segment + int segStart = 0; + int segEnd = segment->at(0)->at(0) - 1; + + for (int s = 0; s <= segNum; s++) { + for (int i = segStart; i <= segEnd; i++) { + char c = base[i]; + + if (c != uncertainChar) { + if (codes->count(c) > 0) { + base[i] = codes->at(c); + } else { + string msg = "ChromosomeOneDigit::encode() found invalid letter: "; + msg.append(1, c); + throw InvalidInputException(msg); + } + } + } + + // The regular intervals between two segments + if (s < segNum - 1) { + segStart = segment->at(s)->at(1) + 1; + segEnd = segment->at(s + 1)->at(0) - 1; + } + // The last interval - after the last segment + else if (s == segNum - 1) { + segStart = segment->at(s)->at(1) + 1; + segEnd = base.size() - 1; + } + } } } diff --git a/src/nonltr/ChromosomeOneDigit.h b/src/nonltr/ChromosomeOneDigit.h index 384698f..19875eb 100644 --- a/src/nonltr/ChromosomeOneDigit.h +++ b/src/nonltr/ChromosomeOneDigit.h @@ -15,28 +15,28 @@ namespace nonltr { class ChromosomeOneDigit: public Chromosome { private: - /* Fields */ - map * codes; - - /* Methods */ + void encode(); void help(); - void buildCodes(); - void encodeNucleotides(); - void makeReverse(); - void makeComplement(); - void reverseSegments(); + +protected: + bool canClean = false; + map * codes; + virtual void buildCodes() = 0; + public: /* Methods */ ChromosomeOneDigit(); + 
ChromosomeOneDigit(uint64_t); ChromosomeOneDigit(string); - ChromosomeOneDigit(string, string); + ChromosomeOneDigit(string, int, int); + ChromosomeOneDigit(string&, string&); + ChromosomeOneDigit(string&, string&, int); virtual ~ChromosomeOneDigit(); virtual void finalize(); - void makeR(); - void makeRC(); + }; } diff --git a/src/nonltr/ChromosomeOneDigitDna.cpp b/src/nonltr/ChromosomeOneDigitDna.cpp new file mode 100644 index 0000000..9f8bbf7 --- /dev/null +++ b/src/nonltr/ChromosomeOneDigitDna.cpp @@ -0,0 +1,154 @@ +#include "ChromosomeOneDigitDna.h" + +namespace nonltr{ + +ChromosomeOneDigitDna::ChromosomeOneDigitDna() : ChromosomeOneDigit() {} +ChromosomeOneDigitDna::ChromosomeOneDigitDna(uint64_t s) : ChromosomeOneDigit(s) {} + +ChromosomeOneDigitDna::ChromosomeOneDigitDna(string fileName) : + ChromosomeOneDigit(fileName){ + +} + +ChromosomeOneDigitDna::ChromosomeOneDigitDna(string fileName, int segmentLength, int maxLength) : + ChromosomeOneDigit(fileName, segmentLength, maxLength) { + +} + +ChromosomeOneDigitDna::ChromosomeOneDigitDna(string& seq, string& info) : + ChromosomeOneDigit(seq, info){ + +} + +ChromosomeOneDigitDna::ChromosomeOneDigitDna(string& seq, string& info, int length) : + ChromosomeOneDigit(seq, info, length) { +} + +ChromosomeOneDigitDna::~ChromosomeOneDigitDna(){ + +} + +/** + * A A + * T T + * G G + * C C + * R G or A + * Y T or C + * M A or C + * K G or T + * S G or C + * W A or T + * H A or C or T + * B G or T or C + * V G or C or A + * D G or T or A + * N G or T or A or C + */ +void ChromosomeOneDigitDna::buildCodes() { + // Certain nucleotides + codes->insert(map::value_type('A', (char) 0)); + codes->insert(map::value_type('C', (char) 1)); + codes->insert(map::value_type('G', (char) 2)); + codes->insert(map::value_type('T', (char) 3)); + + // Uncertain nucleotides + codes->insert(map::value_type('R', codes->at('G'))); + codes->insert(map::value_type('Y', codes->at('C'))); + codes->insert(map::value_type('M', codes->at('A'))); + 
codes->insert(map::value_type('K', codes->at('T'))); + codes->insert(map::value_type('S', codes->at('G'))); + codes->insert(map::value_type('W', codes->at('T'))); + codes->insert(map::value_type('H', codes->at('C'))); + codes->insert(map::value_type('B', codes->at('T'))); + codes->insert(map::value_type('V', codes->at('A'))); + codes->insert(map::value_type('D', codes->at('T'))); + codes->insert(map::value_type('N', codes->at('C'))); + codes->insert(map::value_type('X', codes->at('G'))); +} + +/** + * Cannot be called on already finalized object. + */ +void ChromosomeOneDigitDna::makeR() { + //cout << "Making reverse ..." << endl; + makeReverse(); + reverseSegments(); +} + +/** + * Cannot be called on already finalized object. + */ +void ChromosomeOneDigitDna::makeRC() { + //cout << "Making reverse complement ..." << endl; + makeComplement(); + makeReverse(); + reverseSegments(); +} + +void ChromosomeOneDigitDna::makeComplement() { + map complement; + + // Certain nucleotides + complement.insert(map::value_type((char) 0, (char) 3)); + complement.insert(map::value_type((char) 1, (char) 2)); + complement.insert(map::value_type((char) 2, (char) 1)); + complement.insert(map::value_type((char) 3, (char) 0)); + + // Unknown nucleotide + complement.insert(map::value_type('N', 'N')); + // complement.insert(map::value_type((char) 4, (char) 4)); + + // Convert a sequence to its complement + int seqLen = base.size(); + for (int i = 0; i < seqLen; i++) { + if (complement.count(base[i]) > 0) { + base[i] = complement.at(base[i]); + } else { + cerr << "Error: The digit " << (char) base[i]; + cerr << " does not represent a base." 
<< endl; + exit(2); + } + } +} + +void ChromosomeOneDigitDna::makeReverse() { + int last = base.size() - 1; + + // Last index to be switched + int middle = base.size() / 2; + + for (int i = 0; i < middle; i++) { + char temp = base[last - i]; + base[last - i] = base[i]; + base[i] = temp; + } +} + +void ChromosomeOneDigitDna::reverseSegments() { + int segNum = segment->size(); + int lastBase = size() - 1; + + // Calculate the coordinate on the main strand + for (int i = 0; i < segNum; i++) { + vector * seg = segment->at(i); + + int s = lastBase - seg->at(1); + int e = lastBase - seg->at(0); + seg->clear(); + seg->push_back(s); + seg->push_back(e); + } + + // Reverse the regions within the list + int lastRegion = segNum - 1; + int middle = segNum / 2; + for (int i = 0; i < middle; i++) { + vector * temp = segment->at(lastRegion - i); + (*segment)[lastRegion - i] = segment->at(i); + (*segment)[i] = temp; + } +} + + +} diff --git a/src/nonltr/ChromosomeOneDigitDna.h b/src/nonltr/ChromosomeOneDigitDna.h new file mode 100644 index 0000000..7bd9dc7 --- /dev/null +++ b/src/nonltr/ChromosomeOneDigitDna.h @@ -0,0 +1,36 @@ +/* + * ChromosomeOneDigitDna.h + * Created on: September 28, 2018 + * Author: Hani Z. 
Girgis, PhD + */ + + #ifndef HROMOSOMEONEDIGITDNA_H_ + #define HROMOSOMEONEDIGITDNA_H_ + +#include "ChromosomeOneDigit.h" + +namespace nonltr{ + class ChromosomeOneDigitDna: public ChromosomeOneDigit{ + private: + void makeReverse(); + void makeComplement(); + void reverseSegments(); + + protected: + virtual void buildCodes(); + + public: + ChromosomeOneDigitDna(); + ChromosomeOneDigitDna(uint64_t); + ChromosomeOneDigitDna(string); + ChromosomeOneDigitDna(string, int, int); + ChromosomeOneDigitDna(string&, string&); + ChromosomeOneDigitDna(string&, string&, int); + virtual ~ChromosomeOneDigitDna(); + + void makeR(); + void makeRC(); + }; +} + +#endif diff --git a/src/nonltr/ChromosomeOneDigitProtein.cpp b/src/nonltr/ChromosomeOneDigitProtein.cpp new file mode 100644 index 0000000..7add5af --- /dev/null +++ b/src/nonltr/ChromosomeOneDigitProtein.cpp @@ -0,0 +1,64 @@ +#include "ChromosomeOneDigitProtein.h" + +namespace nonltr{ + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein() : + ChromosomeOneDigit() { + +} + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string fileName) : + ChromosomeOneDigit(fileName){ + +} + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string fileName, int segmentLength, int maxLength) : + ChromosomeOneDigit(fileName, segmentLength, maxLength) { + +} + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string& seq, string& info) : + ChromosomeOneDigit(seq, info){ + +} + +ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string& seq, string& info, int length) : + ChromosomeOneDigit(seq, info, length) { +} + +ChromosomeOneDigitProtein::~ChromosomeOneDigitProtein(){ + +} + +void ChromosomeOneDigitProtein::buildCodes() { + // https://en.wikipedia.org/wiki/Proteinogenic_amino_acid + codes->insert(map::value_type('A', (char) 0)); + codes->insert(map::value_type('C', (char) 1)); + codes->insert(map::value_type('D', (char) 2)); + codes->insert(map::value_type('E', (char) 3)); + codes->insert(map::value_type('F', (char) 4)); 
+ codes->insert(map::value_type('G', (char) 5)); + codes->insert(map::value_type('H', (char) 6)); + codes->insert(map::value_type('I', (char) 7)); + codes->insert(map::value_type('K', (char) 8)); + codes->insert(map::value_type('L', (char) 9)); + codes->insert(map::value_type('M', (char) 10)); + codes->insert(map::value_type('N', (char) 11)); + codes->insert(map::value_type('O', (char) 12)); + codes->insert(map::value_type('P', (char) 13)); + codes->insert(map::value_type('Q', (char) 14)); + codes->insert(map::value_type('R', (char) 15)); + codes->insert(map::value_type('S', (char) 16)); + codes->insert(map::value_type('T', (char) 17)); + codes->insert(map::value_type('U', (char) 18)); + codes->insert(map::value_type('V', (char) 19)); + codes->insert(map::value_type('W', (char) 20)); + codes->insert(map::value_type('Y', (char) 21)); + + // Uncertain uncleotides + codes->insert(map::value_type('B', codes->at('D'))); + codes->insert(map::value_type('Z', codes->at('E'))); + codes->insert(map::value_type('J', codes->at('L'))); +} + +}// End namespace \ No newline at end of file diff --git a/src/nonltr/ChromosomeOneDigitProtein.h b/src/nonltr/ChromosomeOneDigitProtein.h new file mode 100644 index 0000000..b5f78ee --- /dev/null +++ b/src/nonltr/ChromosomeOneDigitProtein.h @@ -0,0 +1,28 @@ +/* + * ChromosomeOneDigitProtein.h + * Created on: October 2, 2018 + * Author: Hani Z. 
Girgis, PhD + */ + + #ifndef HROMOSOMEONEDIGITPROTEIN_H_ + #define HROMOSOMEONEDIGITPROTEIN_H_ + +#include "ChromosomeOneDigit.h" + +namespace nonltr{ + class ChromosomeOneDigitProtein: public ChromosomeOneDigit{ + + protected: + virtual void buildCodes(); + + public: + ChromosomeOneDigitProtein(); + ChromosomeOneDigitProtein(string); + ChromosomeOneDigitProtein(string, int, int); + ChromosomeOneDigitProtein(string&, string&); + ChromosomeOneDigitProtein(string&, string&, int); + virtual ~ChromosomeOneDigitProtein(); + }; +} + +#endif \ No newline at end of file diff --git a/src/nonltr/KmerHashTable.cpp b/src/nonltr/KmerHashTable.cpp index dc53505..56fd5cd 100644 --- a/src/nonltr/KmerHashTable.cpp +++ b/src/nonltr/KmerHashTable.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "../utility/Util.h" #include "../exception/InvalidInputException.h" @@ -222,6 +223,38 @@ void KmerHashTable::wholesaleIncrement(const char* sequence, }*/ } + +/** + * Call wholesaleIncrement on the segment itself. + * Then, call it again on the reverse complement of this segment. + * + * sequence: is a long sequence usually a long segment of a chromosome. + * sFirstKmer: is the start index of the first k-mer. + * sLastKmer: is the start index of the last k-mer. + */ +template +int KmerHashTable::wholesaleIncrementNoOverflow(const char* sequence, + int firstKmerStart, int lastKmerStart) { + // Increment k-mer's in the forward strand + vector hashList = vector(); + hash(sequence, firstKmerStart, lastKmerStart, &hashList); + int ret = 0; + int size = hashList.size(); + for (int i = 0; i < size; i++) { + I keyHash = hashList.at(i); + if (keyHash >= maxTableSize) { + cerr << "array out of bounds" << endl; + throw ""; + } + if (values[keyHash] < std::numeric_limits::max()) { + values[keyHash]++; + } else { + ret = -1; + } + } + return ret; +} + /** * Increment the entry associated with the key by one. 
*/ @@ -384,7 +417,7 @@ vector* KmerHashTable::getKeys() { template void KmerHashTable::printTable(string output) { vector keys; -// getKeys(keys); + //getKeys(keys); ofstream out(output.c_str()); diff --git a/src/nonltr/KmerHashTable.h b/src/nonltr/KmerHashTable.h index 7c38e23..cd072af 100644 --- a/src/nonltr/KmerHashTable.h +++ b/src/nonltr/KmerHashTable.h @@ -57,6 +57,7 @@ class KmerHashTable: public ITableView { void increment(const char*); void increment(const char*, int); void wholesaleIncrement(const char*, int, int); + int wholesaleIncrementNoOverflow(const char*, int, int); void addReverseComplement(); I countNonInitialEntries(); diff --git a/src/RepeatsDetector.cpp b/src/nonltr/RepeatsDetector.cpp similarity index 96% rename from src/RepeatsDetector.cpp rename to src/nonltr/RepeatsDetector.cpp index 443cf24..74f525d 100644 --- a/src/RepeatsDetector.cpp +++ b/src/nonltr/RepeatsDetector.cpp @@ -12,13 +12,13 @@ #include #include -#include "nonltr/Trainer.h" -#include "nonltr/KmerHashTable.h" -#include "nonltr/TableBuilder.h" -#include "nonltr/HMM.h" -#include "nonltr/Scanner.h" -#include "nonltr/ChromListMaker.h" -#include "utility/Util.h" +#include "../nonltr/Trainer.h" +#include "../nonltr/KmerHashTable.h" +#include "../nonltr/TableBuilder.h" +#include "../nonltr/HMM.h" +#include "../nonltr/Scanner.h" +#include "../nonltr/ChromListMaker.h" +#include "../utility/Util.h" using namespace std; using namespace nonltr; @@ -67,7 +67,7 @@ void drive(map * const param) { Util::deleteFile(param->at(MSK_PRM)); } } - + if (param->count(RPT_PRM) > 0) { if (param->count(GNM_PRM) > 0) { cout << "Deleting pre-existing files under " << param->at(RPT_PRM); @@ -78,7 +78,7 @@ void drive(map * const param) { Util::deleteFile(param->at(RPT_PRM)); } } - + if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) > 0) { cout << "Deleting pre-existing files under " << param->at(SCO_PRM); cout << endl; @@ -97,21 +97,21 @@ void drive(map * const param) { // Process the input int k = 
atoi(param->at(LEN_PRM).c_str()); - + if (param->count(GNM_PRM) > 0) { string genomeDir = param->at(GNM_PRM); int order = atoi(param->at(ORD_PRM).c_str()); double s = atoi(param->at(GAU_PRM).c_str()); double t = atoi(param->at(THR_PRM).c_str()); int minObs = atoi(param->at(MIN_PRM).c_str()); - + // Adjust the threshold when it is one because of the log base. if (((int) t) == 1) { t = 1.5; cout << "The base of the logarithmic function is adjusted." << endl; } - - + + // This part or the next Trainer * trainer; if (param->count(CND_PRM) > 0) { @@ -119,20 +119,20 @@ void drive(map * const param) { } else { trainer = new Trainer(genomeDir, order, k, s, t, minObs); } - - + + if (param->count(TBL_PRM)) { cout << "Printing the count of the kmer's to: "; cout << param->at(TBL_PRM) << endl; trainer->printTable(param->at(TBL_PRM)); } - + if (param->count(HMO_PRM) > 0) { cout << "Printing the HMM to: " << endl; cout << param->at(HMO_PRM) << endl; trainer->printHmm(param->at(HMO_PRM)); } - + // Stage 3: Scan cout << endl << endl; cout << "Stage 4: Scanning ..." 
<< endl; @@ -141,33 +141,33 @@ void drive(map * const param) { if (param->count(DIR_PRM) > 0) { Util::readChromList(param->at(DIR_PRM), fileList, string("fa")); } - + int chromCount = fileList->size(); for (int i = 0; i < chromCount; i++) { cout << "Scanning: " << fileList->at(i) << endl; - + // Output file name string path(fileList->at(i)); int slashLastIndex = path.find_last_of(Util::fileSeparator); int dotLastIndex = path.find_last_of("."); string nickName = path.substr(slashLastIndex + 1, dotLastIndex - slashLastIndex - 1); - + // Process each sequence with the ith file ChromListMaker * maker = new ChromListMaker(fileList->at(i)); - const vector * chromList = maker->makeChromOneDigitList(); + const vector * chromList = maker->makeChromOneDigitDnaList(); ChromListMaker * oMaker = new ChromListMaker(fileList->at(i)); const vector * oChromList; if (param->count(MSK_PRM) > 0) { oChromList = oMaker->makeChromList(); } - + for (int h = 0; h < chromList->size(); h++) { - ChromosomeOneDigit * chrom = dynamic_cast(chromList->at(h)); - + ChromosomeOneDigitDna * chrom = dynamic_cast(chromList->at(h)); + // Scan the forward strand Scanner * scanner = new Scanner(trainer->getHmm(), k, chrom,trainer->getTable()); - + // Scan the reverse complement chrom->makeRC(); Scanner * scannerRC = new Scanner(trainer->getHmm(), k, chrom, trainer->getTable()); @@ -175,8 +175,8 @@ void drive(map * const param) { scanner->mergeWithOtherRegions(scannerRC->getRegionList()); delete scannerRC; chrom->makeRC(); - - + + // Scan the reverse chrom->makeR(); Scanner * scannerR = new Scanner(trainer->getHmm(), k, chrom, trainer->getTable()); @@ -186,14 +186,14 @@ void drive(map * const param) { //@@ The chromosome now has the sequence of the reverse strand // The actual strand is calculated if the user requested the scores. - + // Print according to the user's requests bool canAppend = (h == 0) ? 
false : true; - + if (param->count(SCO_PRM) > 0) { // Calculate the forward strand from the reverse chrom->makeR(); - + string scoFile = param->at(SCO_PRM) + Util::fileSeparator + nickName + ".scr"; if (!canAppend) { cout << "Printing scores to: " << scoFile << endl; @@ -203,7 +203,7 @@ void drive(map * const param) { scorer->printScores(scoFile, canAppend); delete scorer; } - + if (param->count(RPT_PRM) > 0) { string rptFile = param->at(RPT_PRM) + Util::fileSeparator + nickName + ".rpt"; if (!canAppend) { @@ -211,7 +211,7 @@ void drive(map * const param) { } scanner->printIndex(rptFile, canAppend, atoi(param->at(FRM_PRM).c_str())); } - + if (param->count(MSK_PRM) > 0) { string mskFile = param->at(MSK_PRM) + Util::fileSeparator + nickName + ".msk"; if (!canAppend) { @@ -220,41 +220,41 @@ void drive(map * const param) { Chromosome * oChrom = oChromList->at(h); scanner->printMasked(mskFile, *oChrom, canAppend); } - + // Free memory delete scanner; } - + delete maker; delete oMaker; } - + // Free memory fileList->clear(); delete fileList; delete trainer; } else if (param->count(HMI_PRM) > 0) { HMM * hmm = new HMM(param->at(HMI_PRM)); - + string chromFile = param->at(SEQ_PRM); string scoresFile = param->at(SCI_PRM); - - ChromosomeOneDigit * chrom = new ChromosomeOneDigit(chromFile); + + ChromosomeOneDigitDna * chrom = new ChromosomeOneDigitDna(chromFile); Scanner * scanner = new Scanner(hmm, k, chrom, scoresFile); - + if (param->count(RPT_PRM) > 0) { string rptFile = param->at(RPT_PRM); cout << "Printing locations to: " << rptFile << endl; scanner->printIndex(rptFile, false, atoi(param->at(FRM_PRM).c_str())); } - + if (param->count(MSK_PRM) > 0) { string mskFile = param->at(MSK_PRM); cout << "Printing masked sequence to: " << mskFile << endl; Chromosome oChrom(chromFile); scanner->printMasked(mskFile, oChrom, false); } - + // Free memory delete scanner; delete chrom; @@ -266,7 +266,7 @@ int main(int argc, char * argv[]) { cout << endl << endl; cout << "This is Red 
(REpeat Detector) designed and developed by "; cout << "Hani Zakaria Girgis, PhD." << endl << endl; - + cout << "Version: 05/22/2015" << endl << endl; string message = string("Valid argument pairs:\n"); @@ -278,8 +278,8 @@ int main(int argc, char * argv[]) { message.append("\t\tFiles with \".fa\" extension in this directory are NOT used for completing the table.\n"); message.append("\t\tThese Files MUST have different names from those in the genome directory.\n"); message.append("\t\tThese Files are scanned for repeats.\n"); - - + + message.append("\t-len word length equals k defining the k-mer. The default is floor(log_4(genome size)).\n"); message.append("\t-ord order of the background Markov chain. The default is floor(k/2)-1.\n"); message.append("\t-gau half width of the mask. The default is based on the GC content.\n"); @@ -290,7 +290,7 @@ int main(int argc, char * argv[]) { message.append("\t-tbl file where the table of the adjusted counts is written, optional.\n"); message.append("\t-sco directory where scores are saved, optional.\n"); message.append("\t\tScore files have the \".scr\" extension.\n"); - + message.append("\t-cnd directory where candidate regions are saved, optional.\n"); message.append("\t\tCandidates files have the \".cnd\" extension.\n"); message.append("\t-rpt directory where repeats locations are saved, optional.\n"); @@ -300,7 +300,7 @@ int main(int argc, char * argv[]) { message.append("\t-frm the format of the output: 1 (chrName:start-end) or 2 (chrName\tstart\tend).\n"); message.append("\t\tThe output format are zero based and the end is exclusive.\n"); - + message.append("\t-hmo file where the HMM is saved, optional.\n\n"); message.append("Examples:\n"); @@ -342,11 +342,11 @@ int main(int argc, char * argv[]) { return 1; } } - - + + // Check if the user provided the essential arguments - - + + if (param->count(LEN_PRM) == 0) { if (param->count(GNM_PRM) > 0) { // Calculate the size of the genome @@ -365,9 +365,9 @@ int main(int argc, 
char * argv[]) { } fileList->clear(); delete fileList; - + double temp = log(genomeLength) / log(4.0); - + int k = floor(temp); cout << "The recommended k is " << k << "." << endl; if (k > 15) { @@ -382,17 +382,17 @@ int main(int argc, char * argv[]) { k = 12; } cout << endl; - + string kString = Util::int2string(k); param->insert(map::value_type(LEN_PRM, kString)); - + } else { cerr << "The word length is required." << endl; cerr << message << endl; return 1; } } - + if(param->count(FRM_PRM) == 0){ cout << "Using the default output format chrName:start-end" << endl; param->insert(map::value_type(FRM_PRM, Util::int2string(Scanner::FRMT_POS))); @@ -404,21 +404,21 @@ int main(int argc, char * argv[]) { return 1; } } - + if (param->count(GNM_PRM) > 0) { Util::checkFile(param->at(GNM_PRM)); - + if (param->count(ORD_PRM) == 0) { double k = atoi(param->at(LEN_PRM).c_str()); int o = floor(k / 2.0) - 1; - + cout << "Using the default background order: " << o << "."; cout << endl; - + string oString = Util::int2string(o); param->insert(map::value_type(ORD_PRM, oString)); } - + if (param->count(THR_PRM) == 0) { cout << "Using the default threshold: 2." << endl; param->insert(map::value_type(THR_PRM, string("2"))); @@ -430,7 +430,7 @@ int main(int argc, char * argv[]) { return 1; } } - + if (param->count(MIN_PRM) == 0) { cout << "Using the default minimum of the observed count of k-mers: 3." << endl; param->insert(map::value_type(MIN_PRM, string("3"))); @@ -442,10 +442,10 @@ int main(int argc, char * argv[]) { return 1; } } - + if (param->count(GAU_PRM) == 0) { cout << "Calculating GC content ..." 
<< endl; - + // 1: Count the gc content of the input genome long genomeLength = 0; long genomeGc = 0; @@ -463,7 +463,7 @@ int main(int argc, char * argv[]) { } fileList->clear(); delete fileList; - + // 2: Calculate the gc content of the input genome double gc = 100.00 * genomeGc / genomeLength; int w = 20; @@ -477,7 +477,7 @@ int main(int argc, char * argv[]) { } } else if (param->count(HMI_PRM) > 0) { Util::checkFile(param->at(HMI_PRM)); - + if (param->count(SEQ_PRM) == 0) { cerr << "The sequence file is required."; cerr << endl; @@ -486,7 +486,7 @@ int main(int argc, char * argv[]) { } else { Util::checkFile(param->at(SEQ_PRM)); } - + if (param->count(SCI_PRM) == 0) { cerr << "The scores file is required."; cerr << endl; @@ -495,14 +495,14 @@ int main(int argc, char * argv[]) { } else { Util::checkFile(param->at(SCI_PRM)); } - + } else { cerr << "A mode is required: training and scanning (-gnm) or "; cerr << "scanning only (-hmi)." << endl; cerr << message << endl; return 1; } - + // Check optional parameters if (param->count(TBL_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Printing the k-mer table is optional with -gnm only."; @@ -510,14 +510,14 @@ int main(int argc, char * argv[]) { cerr << message << endl; return 1; } - + if (param->count(HMO_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Printing the HMM is optional with -gnm only."; cerr << endl; cerr << message << endl; return 1; } - + if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Printing the scores is optional with -gnm only."; cerr << endl; @@ -526,7 +526,7 @@ int main(int argc, char * argv[]) { } else if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) > 0) { Util::checkFile(param->at(SCO_PRM)); } - + if (param->count(CND_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Printing candidate regions is optional with -gnm only."; @@ -536,8 +536,8 @@ int main(int argc, char * argv[]) { } else if (param->count(CND_PRM) > 0 && param->count(GNM_PRM) > 0) { 
Util::checkFile(param->at(CND_PRM)); } - - + + if (param->count(DIR_PRM) > 0 && param->count(GNM_PRM) == 0) { cerr << "Processing additional sequences is optional with -gnm only."; cerr << endl; @@ -546,15 +546,15 @@ int main(int argc, char * argv[]) { } else if (param->count(DIR_PRM) > 0 && param->count(GNM_PRM) > 0) { Util::checkFile(param->at(DIR_PRM)); } - + if (param->count(MSK_PRM) > 0 && param->count(GNM_PRM) > 0) { Util::checkFile(param->at(MSK_PRM)); } - + if (param->count(RPT_PRM) > 0 && param->count(GNM_PRM) > 0) { Util::checkFile(param->at(RPT_PRM)); } - + // Print out the parameters table typedef map myMap; myMap::iterator sIter = param->begin(); @@ -565,10 +565,10 @@ int main(int argc, char * argv[]) { sIter++; } cout << endl; - + // Start! drive(param); - + // Clear parameters when done. param->clear(); delete param; @@ -577,7 +577,7 @@ int main(int argc, char * argv[]) { cerr << endl; cerr << message << endl; } - + //return EXIT_SUCCESS; return 0; } diff --git a/src/nonltr/TableBuilder.cpp b/src/nonltr/TableBuilder.cpp index 32733a9..d038aab 100644 --- a/src/nonltr/TableBuilder.cpp +++ b/src/nonltr/TableBuilder.cpp @@ -31,7 +31,7 @@ void TableBuilder::buildTable() { for (int i = 0; i < fileList->size(); i++) { cout << "Counting k-mers in " << fileList->at(i) << " ..." 
<< endl; ChromListMaker * maker = new ChromListMaker(fileList->at(i)); - const vector * chromList = maker->makeChromOneDigitList(); + const vector * chromList = maker->makeChromOneDigitDnaList(); for (int h = 0; h < chromList->size(); h++) { ChromosomeOneDigit * chrom = diff --git a/src/nonltr/Trainer.cpp b/src/nonltr/Trainer.cpp index 3e8865f..1a86a8f 100644 --- a/src/nonltr/Trainer.cpp +++ b/src/nonltr/Trainer.cpp @@ -106,7 +106,7 @@ void Trainer::stage2() { cout << "Calculating the percentage in: " << fileList->at(i) << " ..."; cout << endl; ChromListMaker * maker = new ChromListMaker(fileList->at(i)); - const vector * chromList = maker->makeChromOneDigitList(); + const vector * chromList = maker->makeChromOneDigitDnaList(); for (int h = 0; h < chromList->size(); h++) { ChromosomeOneDigit * chrom = @@ -190,7 +190,7 @@ void Trainer::stage3() { // Read sequences in the file ChromListMaker * maker = new ChromListMaker(fileList->at(i)); - const vector * chromList = maker->makeChromOneDigitList(); + const vector * chromList = maker->makeChromOneDigitDnaList(); for (int h = 0; h < chromList->size(); h++) { ChromosomeOneDigit * chrom = dynamic_cast(chromList->at(h)); @@ -216,7 +216,7 @@ void Trainer::stage3() { } trainingRegionList = detector->getRegionList(); - + } if (isCON && isConRepAvailable) { @@ -225,7 +225,7 @@ void Trainer::stage3() { locList->mergeWithAnotherList(detector->getRegionList()); } trainingRegionList = locList->getList(); - + } // The candidate regions are already copied to the location list @@ -236,7 +236,7 @@ void Trainer::stage3() { // Train the HMM if(isCND || (isCON && isConRepAvailable)){ - + scorer->takeLog(t); scoreList = scorer->getScores(); hmm->train(scoreList, chrom->getSegment(), trainingRegionList); diff --git a/src/predict/BestFirstSelector.cpp b/src/predict/BestFirstSelector.cpp new file mode 100644 index 0000000..c28d39b --- /dev/null +++ b/src/predict/BestFirstSelector.cpp @@ -0,0 +1,258 @@ +// -*- C++ -*- +/* + * 
BestFirstSelector.cpp + * + * Author: Benjamin T James + */ + +#include "BestFirstSelector.h" +#include "../clutil/Progress.h" +#include +#include + +template +pair*,matrix::GLM> BestFirstSelector::train_regression(Feature* feat, const vector > &training,const vector > &testing) +{ + matrix::GLM glm; + return {NULL,glm}; +} + +using FeatPair = std::pair; + +struct Compare { + bool operator()(const std::pair,double> &a, const std::pair,double> &b) { + return a.second < b.second; + } +}; + +using pqueue = std::priority_queue,double>, std::vector,double> >, Compare>; + +vector > children_of(set feat, + const vector& all_feats, + const set >& closed_list, + + const set > &open_list) +{ + vector > out; + for (auto fp : all_feats) { + set temp = feat; + auto pos = std::find(temp.begin(), temp.end(), fp); + if (pos == temp.end()) { + temp.insert(fp); + } else { + temp.erase(pos); + } + auto pos_bad = std::find(closed_list.begin(), closed_list.end(), temp); + if (!temp.empty() && pos_bad == closed_list.end()) { + auto pos_good = std::find(open_list.begin(), open_list.end(), temp); + if (pos_good == open_list.end()) { + out.push_back(temp); + } + } + } + return out; +} +template +std::string feat_name(Feature* feat) +{ + std::ostringstream oss; + auto feat_names = feat->feat_names(); + for (int i = 0; i < feat_names.size(); i++) { + oss << feat_names[i]; + if (i < feat_names.size() - 1) { + oss << " + "; + } + } + return oss.str(); +} +template +std::string feature_name(const set& feat_list, int k) +{ + Feature feat(k); + for (auto fpair : feat_list) { + feat.add_feature(fpair.first, fpair.second); + } + return feat_name(&feat); +} + +template +Feature* load_feat(Feature* old_feat, const set& feat_list, const vector > &training) +{ + + Feature* feat = new Feature(*old_feat); + feat->set_save(true); + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + set single_feats; + for (FeatPair fp : feat_list) { + for 
(uint64_t i = 1; i <= fp.first; i *= 2) { + if (i & fp.first) { + single_feats.insert(i); + } + } + feat->add_feature(fp.first, fp.second); + } + for (uint64_t i : single_feats) { + auto minmax = old_feat->get_normal(i); + // #pragma omp critical + // { + // cout << "Feature " << Feature::log2(i) << " min: " << minmax.first << " max: " << minmax.second << endl; + // } + feat->set_normal(i, minmax.first, minmax.second); + } + +// feat->normalize(training); + feat->finalize(); + + return feat; +} + +template +void calculate_table(Feature* feat, vector possible_feats, const vector > &training,const vector > &testing) +{ + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + for (FeatPair fp : possible_feats) { + feat->add_feature(fp.first, fp.second); + } + feat->set_save(true); + feat->normalize(training); + feat->finalize(); + for (auto pr : testing) { + feat->compute(*pr.first, *pr.second); + } +} +template +pair feature_accuracy(const set& feat_set, Feature* old_feat, const vector > &training,const vector > &testing, double id) +{ + Feature* feat = NULL; + feat = load_feat(old_feat, feat_set, training); +// cout << "Considering " << name << " "; + auto pr = FeatureSelector::class_train(training, *feat, id); + auto class_ac = FeatureSelector::class_test(testing, *feat, pr.second, id); + double class_accuracy = get<0>(class_ac); +// cout << "Accuracy: " << class_accuracy << endl; + std::string name = feat_name(feat); + delete feat; + return make_pair(name, class_accuracy); +} + +pair feat_list_sizes(const set >& flist) +{ + int minimum = flist.begin()->size(); + int maximum = 0; + for (auto item : flist) { + if (item.size() < minimum) { + minimum = item.size(); + } + if (item.size() > maximum) { + maximum = item.size(); + } + } + return make_pair(minimum, maximum); +} +template +void evaluate(const vector > &item_list, set > &open_map, pqueue &open_heap, Feature* feat, const vector > &training,const vector > 
&testing, double id, int max_num_feat) +{ + std::ostringstream oss; +// of size " << item_list[0].size(); + + +// Progress prog(item_list.size(), oss.str()); + auto minmax = feat_list_sizes(open_map); + oss << "Evaluating features " << minmax.second << "/" << max_num_feat; + Progress prog(item_list.size(), oss.str()); + #pragma omp parallel for + for (int i = 0; i < item_list.size(); i++) { + const set& item = item_list[i]; + auto feat_acc = feature_accuracy(item, feat, training, testing, id); + double acc = feat_acc.second; + string name = feat_acc.first; + #pragma omp critical + { + prog++; +// cout << name << ": " << acc << endl; + open_map.insert(item); + open_heap.push(std::make_pair(item, acc)); + } + } + prog.end(); +} + +template +std::pair*,matrix::GLM> BestFirstSelector::train_class(Feature* feat, const vector > &training,const vector > &testing, double id) +{ + set feat_set, best_feat_set; + set > closed_list, open_list; + pqueue open_heap; + + int last_best_changed = 0; + double best_acc = -100; + const double eps = 0; + + cout << "Calculating all features" << endl; + calculate_table(feat, possible_feats, training, testing); + // prime the open_map + vector > children = children_of(feat_set, possible_feats, closed_list, open_list); + evaluate(children, open_list, open_heap, feat, training, testing, id, max_num_feat); + for (int iteration = 0; !open_list.empty(); iteration++) { + + auto minmax = feat_list_sizes(open_list); + + // stopping criteria: if we have already met the maximum number of features + // or if no features changed in the last 3 iterations of having a minimum number of features + if (minmax.second > max_num_feat || (iteration - last_best_changed >= 3 && minmax.second > min_num_feat)) { + break; + } + //cout << "Features: " << minmax.first << " to " << minmax.second << endl; + + // Peek at the maximum-accuracy feature + auto ptr = open_heap.top(); + feat_set = ptr.first; + double acc = ptr.second; + + // Remove the best item from the open 
list/heap and add to the closed list + open_heap.pop(); + open_list.erase(feat_set); + closed_list.insert(feat_set); + + if (acc - eps > best_acc && feat_set.size() >= min_num_feat && feat_set.size() <= max_num_feat) { + //cout << "New Best feature: " << feature_name(feat_set, feat->get_k()) << endl; + best_feat_set = feat_set; + best_acc = acc; + last_best_changed = iteration; + } + + vector > children = children_of(feat_set, possible_feats, closed_list, open_list); + evaluate(children, open_list, open_heap, feat, training, testing, id, max_num_feat); + }// while (iteration++ - last_best_changed < 2); + + Feature* feat_c = load_feat(feat, best_feat_set, training); + feat_c->set_save(false); + auto pr = FeatureSelector::class_train(training, *feat_c, id); + matrix::GLM c_glm = pr.second; + + auto train_results = FeatureSelector::class_test(training, *feat_c, c_glm, id);//, "train"); + cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl; + auto test_results = FeatureSelector::class_test(testing, *feat_c, c_glm, id);//, "test"); + double class_acc = get<0>(test_results); + cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl; + + cout << "Features: "<< endl; + for (auto line : feat_c->feat_names()) { + cout << "\t" << line << endl; + } + return std::make_pair(feat_c, c_glm); +} + + +template class BestFirstSelector; +template class BestFirstSelector; +template class BestFirstSelector; +template class BestFirstSelector; +template class BestFirstSelector; +template class BestFirstSelector; diff --git a/src/predict/BestFirstSelector.h b/src/predict/BestFirstSelector.h new file mode 100644 index 0000000..b969b09 --- /dev/null +++ b/src/predict/BestFirstSelector.h @@ -0,0 +1,25 @@ +// -*- C++ -*- +/* + * BestFirstSelector.h + * + * Author: Benjamin T James + */ + +#ifndef BEST_FIRST_SELECTOR_H +#define BEST_FIRST_SELECTOR_H +#include 
"FeatureSelector.h" +#include +template +class BestFirstSelector : public FeatureSelector { +public: + BestFirstSelector(vector > possible_feats_, int min_n_feat, int max_n_feat) : possible_feats(possible_feats_), min_num_feat(min_n_feat), max_num_feat(max_n_feat) {} + ~BestFirstSelector() {} + + pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing); + pair*,matrix::GLM> train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id); + +private: + int max_num_feat, min_num_feat; + vector > possible_feats; +}; +#endif diff --git a/src/cluster/src/Feature.cpp b/src/predict/Feature.cpp similarity index 81% rename from src/cluster/src/Feature.cpp rename to src/predict/Feature.cpp index 67baf50..74147f5 100644 --- a/src/cluster/src/Feature.cpp +++ b/src/predict/Feature.cpp @@ -9,14 +9,13 @@ * exist because I was lazy and couldn't get * anonymous functions to work with the hashing */ -#include "Feature.h" -#include "DivergencePoint.h" -#include -#include + +#include +#include #include -#include -#include "../../utility/GlobAlignE.h" +#include "Feature.h" +using namespace std; template Feature::Feature(const Feature& feat_) : k(feat_.get_k()) @@ -28,6 +27,7 @@ Feature::Feature(const Feature& feat_) : k(feat_.get_k()) combos = feat_.get_combos(); lookup = feat_.get_lookup(); is_finalized = feat_.get_finalized(); + ltable = feat_.get_ltable(); do_save = false; auto freverse = [](int idx, int k) { int sum = 0; @@ -102,7 +102,7 @@ Feature Feature::operator=(const Feature& feat_) template void Feature::add_feature(uint64_t f_flags, Combo combo) { -// cout << "Adding combo " << f_flags << endl; + // cout << "Adding combo " << f_flags << endl; if (combo != Combo::xy && combo != Combo::x2y && combo != Combo::xy2 && combo != Combo::x2y2) { throw "invalid combo"; } @@ -138,6 +138,13 @@ void Feature::normalize_cache(vector &cache) const { for (size_t i = 0; i < lookup.size(); i++) { double val = (cache[i] - 
mins[i]) / (maxs[i] - mins[i]); + + // Hani Z. Girgis added this test + if(isnan(val)){ + cerr << "Got NAN from max " << maxs[i] << " min " << mins[i] << endl; + throw std::exception(); + } + if (is_sims[i]) { cache[i] = val; } else { @@ -172,6 +179,39 @@ void Feature::set_normal(uint64_t single_flag, double min_, double max_) is_finalized.at(idx) = true; } +template +pair Feature::get_normal(uint64_t single_flag) const +{ + int idx = index_of(single_flag); + return make_pair(mins.at(idx), maxs.at(idx)); +} + +/* +template +vector Feature::get_raw(const vector*,Point*> > &vec, int index) const +{ + std::vector results(vec.size(), 0); + auto func = raw_funcs[index]; + + #pragma omp parallel for + for (size_t i = 0; i < vec.size(); i++) { + results[i] = func(*vec[i].first, *vec[i].second); + } + + double vmin, vmax; + auto mm = std::minmax_element(results.begin(), results.end()); + vmin = *(mm.first); + vmax = *(mm.second); + for (auto &v : results) { + v = (v - vmin) / (vmax - vmin); + if (! is_sims[index]) { + v = 1 - v; + } + } + return results; +} +*/ + template void Feature::normalize(const vector > &pairs) { @@ -203,6 +243,27 @@ void Feature::normalize(const vector > &pairs) mins[i] = small; maxs[i] = big; + + // Hani Z. Girgis added this tests + if(abs(maxs[i] - mins[i]) <= 0.000000001){ + cerr << "Error of feature: " << feat_names().at(i) << ". "; + cerr << "The maximum distance cannot be zero."; + cerr << endl; + throw std::exception(); + } + + if(isinf(maxs[i])){ + cerr << "Error of feature: " << feat_names().at(i) << ". "; + cerr << "Maximum is " << maxs[i] << endl; + throw std::exception(); + } + + if(isinf(mins[i])){ + cerr << "Error of feature: " << feat_names().at(i) << ". "; + cerr << "Minimum is " << mins[i] << endl; + throw std::exception(); + } + } }; @@ -548,7 +609,8 @@ bool Feature::feat_is_sim(uint64_t single_flag) const is_sim = false; break; case FEAT_SPEARMAN: - is_sim = true; + is_sim = false; // Hani Z. 
Girgis modified the boolean + //is_sim = true; break; case FEAT_JACCARD: is_sim = true; @@ -710,6 +772,7 @@ double Feature::intersection(Point &a, Point &b) for (auto i = 0; i < N; i++) { dist += 2 * std::min(p.points[i], q.points[i]); } + return (double)dist / (double)mag; } @@ -744,7 +807,7 @@ double Feature::pearson(Point &a, Point &b) nq += dq * dq; dot += dp * dq; } - return dot / sqrt(std::max(np * nq, 0.5)); + return dot / sqrt(np * nq); } template @@ -874,6 +937,11 @@ double Feature::c_n2rrc(Point& a, Point& b) { template double Feature::n2rrc(Point& a, Point& b) const { + if(!Util::isDna){ + cerr << "n2rrc cannot be calculated on protein sequences." << endl; + throw std::exception(); + } + const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); @@ -921,7 +989,7 @@ double Feature::jensen_shannon(Point &a, Point &b) const uint64_t mq = q.getPseudoMagnitude(); double sum = 0; const auto N = p.points.size(); - #pragma omp simd reduction(+:sum) + #pragma omp simd reduction(+:sum) for (auto i = 0; i < N; i++) { double pp = (double)p.points[i] / mp; double pq = (double)q.points[i] / mq; @@ -955,22 +1023,27 @@ double Feature::c_rre_k_r(Point& a, Point& b) { } } +// This statistics uses conditional probability +// Modified by Hani Z. 
Girgis on Oct 7 2018 to enable processing protein sequences template double Feature::rre_k_r(Point& a, Point& b) { const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); + const auto A = Util::getAlphabetSize(); + double op = 0, oq = 0; - const double l4 = log(4); + const double l4 = log(A); uint64_t sum4_p = 0, sum4_q = 0; + for (auto i = 0; i < N; i++) { sum4_p += p.points[i]; sum4_q += q.points[i]; - if (i % 4 == 3) { + if (i % A == (A-1)) { double inner_sum_p = 0; double inner_sum_q = 0; - for (auto j = i - 3; j <= i; j++) { + for (auto j = i - (A-1); j <= i; j++) { double conditional_p = (double)p.points[j] / sum4_p; double conditional_q = (double)q.points[j] / sum4_q; double avg = 0.5 * (conditional_p + conditional_q); @@ -985,7 +1058,8 @@ double Feature::rre_k_r(Point& a, Point& b) sum4_q = 0; } } - double val = 0.5 * (op + oq); + + double val = 0.5 * (op + oq); return val; } @@ -1165,7 +1239,24 @@ double Feature::jefferey_divergence(Point& a, Point& b) for (auto i = 0; i < N; i++) { double pp = (double)p.points[i] / mp; double pq = (double)q.points[i] / mq; + // if (q.points[i] == 0) { + // cout << "Error for sequence " << q.get_header() << endl; + // for (int j = 0; j < q.points.size(); j++) { + // cout << q.points.at(j) << " "; + // } + // cout << endl; + // exit(1); + // } + // if (p.points[i] == 0) { + // cout << "Error for sequence " << p.get_header() << endl; + // for (int j = 0; j < p.points.size(); j++) { + // cout << (int)p.points.at(j) << " "; + // } + // cout << endl; + // exit(1); + // } double diff = pp - pq; + // cout << "pp: " << pp << " pq: " << pq << " pp/pq: " << pp / pq << endl; sum += diff * log(pp / pq); } return sum; @@ -1219,21 +1310,26 @@ double Feature::c_kl_conditional(Point& a, Point& b) { } } +// Modified by Hani Z Girgis on Oct 7 2018. 
template double Feature::kl_conditional(Point& a, Point& b) { const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); - uint64_t sum4_p = 0, sum4_q = 0; // Sum for every 4 nucleotides + uint64_t sum4_p = 0, sum4_q = 0; // Sum for every 4 nucleotides or 22 a.a. double outer_sum_p = 0, outer_sum_q = 0; // Prior K-mer sum + const auto N = p.points.size(); + const auto A = Util::getAlphabetSize(); + for (auto i = 0; i < N; i++) { sum4_p += p.points[i]; sum4_q += q.points[i]; - if (i % 4 == 3) { //finished counting word, now compute probabilities + + if (i % A == A-1) { //finished counting word, now compute probabilities double inner_sum_p = 0; // Sum of p(X|Y) * log(p(X|Y) / q(X|Y)) double inner_sum_q = 0; // Sum of q(X|Y) * log(q(X|Y) / p(X|Y)) - for (auto j = i - 3; j <= i; j++) { + for (auto j = i - (A-1); j <= i; j++) { double conditional_p = (double)p.points[j] / sum4_p; double conditional_q = (double)q.points[j] / sum4_q; double lg = log(conditional_p / conditional_q); @@ -1273,20 +1369,26 @@ double Feature::markov(Point& a, Point& b) const DivergencePoint& q = dynamic_cast&>(a); const DivergencePoint& p = dynamic_cast&>(b); double total = 0; // Prior K-mer sum + + // Hani Z. 
Girgis modified this code on Oct 2 2018 + // to adapt this feature to proteins const auto N = p.points.size(); - for (auto i = 0; i < N; i += 4) { + const auto A = Util::getAlphabetSize(); + + for (auto i = 0; i < N; i += A) { uint64_t psum = 0, qsum = 0; - for (auto j = 0; j < 4; j++) { + for (auto j = 0; j < A; j++) { psum += p.points[i+j]; qsum += q.points[i+j]; } double lpsum = log(psum); double lqsum = log(qsum); - for (auto j = 0; j < 4; j++) { + for (auto j = 0; j < A; j++) { total += (q.points[i+j]-1) * (log(p.points[i+j]) - lpsum); total += (p.points[i+j]-1) * (log(q.points[i+j]) - lqsum); } - } + } + return total / 2; } @@ -1319,7 +1421,7 @@ double Feature::d2z(Point& a, Point& b) double pz = (p.points[i] - ap) / sp; double qz = (q.points[i] - aq) / sq; sum += pz * qz; - } + } return sum; } @@ -1415,16 +1517,74 @@ double Feature::emd(Point& a, Point& b) return (double)dist; } -template -std::vector tiedrank(const Point& a) -{ +// Commented by Hani Z. Girgis +// template +// std::vector tiedrank(const Point& a) +// { +// const DivergencePoint& p = dynamic_cast&>(a); +// const auto N = p.points.size(); +// vector ip(N, 0); +// std::iota(std::begin(ip), std::end(ip), 0); +// std::sort(std::begin(ip), std::end(ip), [&](size_t x, size_t y) { +// return p.points[x] < p.points[y]; +// }); + +// for(auto elm : ip){ +// cerr << elm << endl; +// } +// exit(9); +// return ip; +// } + +// Added by Hani Z. 
Girgis +template +std::vector tiedrank(const Point& a){ + // Initialize multimap const DivergencePoint& p = dynamic_cast&>(a); - const auto N = p.points.size(); - vector ip(N, 0); - std::iota(std::begin(ip), std::end(ip), 0); - std::sort(std::begin(ip), std::end(ip), [&](size_t x, size_t y) { - return p.points[x] < p.points[y]; - }); + unsigned int n = p.points.size(); + + std::multimap mmap; + for(unsigned i = 0; i < n; i++){ + mmap.insert(pair( p.points[i] , i)); + } + + // Set ranks without ties + int lastRank = 0; + // std::multimap::iterator + for (auto it=mmap.begin(); it!=mmap.end(); ++it){ + (*it).second = ++lastRank; + } + + for (auto it=mmap.begin(); it!=mmap.end(); it=mmap.upper_bound((*it).first)){ + auto ret = mmap.equal_range((*it).first); + + // Calculate the average rank + double rankTotal = 0; + double count = 0; + for (auto it1=ret.first; it1 != ret.second; ++it1){ + count++; + rankTotal += (*it1).second; + } + + // Assign the average rank + double meanRank = rankTotal / count; + for (auto it1=ret.first; it1 != ret.second; ++it1){ + (*it1).second = meanRank; + // cout << (*it).first << " => " << (*it1).second << endl; + } + } + + std::vector r(n, 0); + for(unsigned int i = 0; i < n; i++){ + r[i] = mmap.find(p.points[i])->second; + } + + // For testing + // for(unsigned int i = 0; i < n; i++){ + // cout << r[i] << endl; + // } + + return r; } template @@ -1442,6 +1602,7 @@ double Feature::c_spearman(Point& a, Point& b) { } } +/* template double Feature::spearman(Point& a, Point& b) { @@ -1455,9 +1616,7 @@ double Feature::spearman(Point& a, Point& b) std::sort(std::begin(ip), std::end(ip), [&](size_t x, size_t y) { return p.points[x] < p.points[y]; }); - std::sort(std::begin(iq), std::end(iq), [&](size_t x, size_t y) { - return q.points[x] < q.points[y]; - }); + double expected = (N+1) / 2.0; double cov = 0; double sp = 0; @@ -1466,8 +1625,41 @@ double Feature::spearman(Point& a, Point& b) cov += (ip[i] - expected) * (iq[i] - expected); sp += (ip[i] 
- expected) * (ip[i] - expected); sq += (iq[i] - expected) * (iq[i] - expected); - } - return (N * cov) / (sp * sq); + } + + cout << "N: " << N << endl; + cout << "Cov: " << cov << endl; + cout << "Sp: " << sp << endl; + cout << "Sq: " << sq << endl; + + double results = (N * cov) / (sp * sq); + + return log(results); +} +*/ + + + +template +double Feature::spearman(Point& a, Point& b) +{ + vector ip = tiedrank(a); + vector iq = tiedrank(b); + const auto N = iq.size(); + + double expected = (N+1) / 2.0; + double cov = 0; + double sp = 0; + double sq = 0; + for (auto i = 0; i < N; i++) { + cov += (ip[i] - expected) * (iq[i] - expected); + sp += (ip[i] - expected) * (ip[i] - expected); + sq += (iq[i] - expected) * (iq[i] - expected); + } + + double result = 1 - ( cov / ( sqrt(sp) * sqrt(sq) )); + // cerr << result << endl; + return result; } template @@ -1515,13 +1707,25 @@ double Feature::c_d2s(Point& a, Point& b) { } } +// Modified by Hani Z. Girgis on Oct 07 2018 to enable comparing protein sequences +// Note: This feature cannot be used if k is 1. template double Feature::d2s(Point& a, Point& b) { const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); - const int k = (int)(log(N) / log(4)); + const auto A = Util::getAlphabetSize(); + + // Commented out by Hani Z Girgis and replaced by the line next to it. 
+ // const int k = (int)(log(N) / log(4)); + int k = a.getK(); + if(k==1){ + cerr << "D2s is skipped because it cannot be applied when k is 1."; + cerr << endl; + throw std::exception(); + } + const auto p1 = p.get_1mers(); const auto q1 = q.get_1mers(); const double pmag = p.getPseudoMagnitude(); @@ -1529,23 +1733,129 @@ double Feature::d2s(Point& a, Point& b) double sum = 0; for (size_t i = 0; i < N; i++) { double p1i = 1; - double q1i = 1; - size_t idx = i; + double q1i = 1; + size_t idx = i; for (int j = 0; j < k; j++) { - int i1 = idx % 4; - idx /= 4; + int i1 = idx % A; + idx /= A; p1i *= (double)p1[i1] / pmag; q1i *= (double)q1[i1] / qmag; } - double hp = p.points[i] - pmag * p1i; - double hq = q.points[i] - qmag * q1i; - if (hp != 0 && hq != 0) { - sum += hp * hq / hypot(hp, hq); + + // Post conditions the probabilities + if(p1i > 1 || p1i < 0){ + cerr << "p1i is too big or too small." << endl; + throw std::exception(); + } + if(q1i > 1 || q1i < 0){ + cerr << "pq1i is too big or too small." << endl; + throw std::exception(); + } + + //double hp = p.points[i] - pmag * p1i; + //double hq = q.points[i] - qmag * q1i; + double hp = p.points[i] - (p.getRealMagnitude() * p1i + 1); + double hq = q.points[i] - (q.getRealMagnitude() * q1i + 1); + double denom = hypot(hp, hq); + if (denom != 0 ) { + sum += (hp * hq) / denom; } } return sum; } +template +double Feature::c_d2_star(Point& a, Point& b) { + + auto aid = a.get_id(); + auto bid = b.get_id(); + auto tup = std::tuple(aid, bid, Feature::log2(FEAT_D2_star)); + if (ltable.find(tup) == ltable.end()) { + double val = d2_star(a, b); + ltable.insert({tup, val}); + return val; + } else { + return ltable.at(tup); + } +} + +// Modified by Hani Z. Girgis on Oct 07 2018 to enable comparing protein sequences +// This method is rewriten based on the d2s code. +// Note: This feature cannot be used if k is 1. 
+template +double Feature::d2_star(Point& a, Point& b) +{ + const DivergencePoint& p = dynamic_cast&>(a); + const DivergencePoint& q = dynamic_cast&>(b); + const auto N = p.points.size(); + const auto A = Util::getAlphabetSize(); + + // Commented out by Hani Z Girgis and replaced by the line next to it. + // const int k = (int)(log(N) / log(4)); + int k = a.getK(); + if(k==1){ + cerr << "D2_star cannot be applied when k is 1."; + cerr << endl; + throw std::exception(); + } + + const auto p1 = p.get_1mers(); + const auto q1 = q.get_1mers(); + const double pmag = p.getPseudoMagnitude(); + const double qmag = q.getPseudoMagnitude(); + const double pq_len = sqrt(p.getRealMagnitude() * q.getRealMagnitude()); + + double sum = 0; + for (size_t i = 0; i < N; i++) { + double p1i = 1; + double q1i = 1; + double pq1i = 1; + size_t idx = i; + for (int j = 0; j < k; j++) { + int i1 = idx % A; + idx /= A; + p1i *= (double) p1.at(i1) / pmag; + q1i *= (double) q1.at(i1) / qmag; + pq1i *= ((double) p1.at(i1) + q1.at(i1)) / (pmag + qmag); + } + + // Post conditions the probabilities + if(p1i > 1 || p1i < 0){ + cerr << "p1i is too big or too small." << endl; + throw std::exception(); + } + if(q1i > 1 || q1i < 0){ + cerr << "pq1i is too big or too small." << endl; + throw std::exception(); + } + if(pq1i > 1 || pq1i < 0){ + cerr << "pq1i is too big or too small." << endl; + throw std::exception(); + } + + double hp = p.points[i] - (p.getRealMagnitude() * p1i + 1); + double hq = q.points[i] - (q.getRealMagnitude() * q1i + 1); + double e = (p.getRealMagnitude() + q.getRealMagnitude()) * pq1i + 1; + + // Post conditions on the expected value + if(e > p.getRealMagnitude() + q.getRealMagnitude()){ + cerr << "E is too big." << endl; + throw std::exception(); + } + if(e < 0){ + cerr << "E is too small." 
<< endl; + throw std::exception(); + } + + double denom = e * pq_len; + if (denom > 0) { + sum += hp * hq / denom; + } + } + + return sum; +} + template double Feature::c_afd(Point& a, Point& b) { @@ -1561,37 +1871,54 @@ double Feature::c_afd(Point& a, Point& b) { } } +// Modified by Hani Z. Girgis to enable processing protein sequences on Oct 9 2018. +// Must be used when k = 2; otherwise, an exception is thrown. template double Feature::afd(Point& a, Point& b) { const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); - const int k = (int)(log(N) / log(4)); + const auto A = Util::getAlphabetSize(); + + const int k = a.getK(); + if(k != 2){ + cerr << "AFD cannot be calculated for k other than 2: Received: " << k << endl; + throw std::exception(); + } + const auto p1 = p.get_1mers(); const auto q1 = q.get_1mers(); const auto pmag = p.getPseudoMagnitude(); const auto qmag = q.getPseudoMagnitude(); + double sum = 0; - const auto nMinusOne = N / 4; - const auto nMinusTwo = nMinusOne / 4; + const auto nMinusOne = N / A; + const auto nMinusTwo = nMinusOne / A; int first_i = 0; for (auto i = 0; i < N; i += nMinusTwo) { -// 16 iterations total, iterating through all 2-mers + // 16 iterations total, iterating through all 2-mers uint64_t psum = 0, qsum = 0; for (auto j = i; j < i + nMinusTwo; j++) { - psum += p.points[j]; - qsum += q.points[j]; + psum += p.points.at(j); + qsum += q.points.at(j); } - double x = (double)psum / p1[first_i / 4]; - double y = (double)qsum / q1[first_i / 4]; + double x = (double)psum / p1.at(first_i / A); + double y = (double)qsum / q1.at(first_i / A); first_i++; + double diff = abs(x - y); + double unsquared = (diff * pow(1+diff, -14)); + // Hani Z. 
Girgis modified this line + // double unsquared = (diff * pow(1+diff, -2)); - - double diff = x - y; - double unsquared = (diff * pow(1+diff, -14)); sum += unsquared * unsquared; + + if(isinf(sum)){ + cerr << x << " " << y << " " << diff << " " << unsquared << endl; + throw std::exception(); + } } + return sum; } @@ -1685,57 +2012,62 @@ double Feature::kulczynski1(Point &a, Point &b) return sum; } -template -double Feature::c_d2_star(Point& a, Point& b) { - auto aid = a.get_id(); - auto bid = b.get_id(); - auto tup = std::tuple(aid, bid, Feature::log2(FEAT_D2_star)); - if (ltable.find(tup) == ltable.end()) { - double val = d2_star(a, b); - ltable.insert({tup, val}); - return val; - } else { - return ltable.at(tup); - } -} -template -double Feature::d2_star(Point& a, Point& b) -{ - const DivergencePoint& p = dynamic_cast&>(a); - const DivergencePoint& q = dynamic_cast&>(b); - const auto N = p.points.size(); - const int k = (int)(log(N) / log(4)); - const auto p1 = p.get_1mers(); - const auto q1 = q.get_1mers(); +// // Modified by Hani Z. Girgis on Oct 7 2018 to enable processing protine sequence. +// // Failed——needs understanding of the implementation. 
+// template +// double Feature::d2_star(Point& a, Point& b) +// { +// const DivergencePoint& p = dynamic_cast&>(a); +// const DivergencePoint& q = dynamic_cast&>(b); +// const auto N = p.points.size(); + +// // const int k = (int)(log(N) / log(4)); +// int k = a.getK(); +// if(k==1){ +// cerr << "D2s is skipped because it cannot be applied when k is 1."; +// cerr << endl; +// throw std::exception(); +// } +// const int Alpha = Util::getAlphabetSize(); + +// const auto p1 = p.get_1mers(); +// const auto q1 = q.get_1mers(); + +// const auto pmag = p.getPseudoMagnitude(); +// const auto qmag = q.getPseudoMagnitude(); +// double sum = 0; + +// vector tilde(Alpha, 0); +// for (int i = 0; i < Alpha; i++) { +// tilde[i] = (double)(p1[i] + q1[i]) / (pmag + qmag); +// cerr << "tilde[i]: " << tilde[i] << endl; +// } +// const double L = sqrt(pmag * qmag); +// for (auto i = 0; i < N; i++) { +// double p1i = 1; +// double q1i = 1; +// double tilde_i = 1; +// auto idx = i; +// for (int j = 0; j < k; j++) { +// auto i1 = idx % Alpha; +// idx /= Alpha; +// p1i *= (double)p1[i1] / pmag; +// q1i *= (double)q1[i1] / qmag; +// tilde_i *= tilde[i1]; +// } +// double hp = p.points[i] - pmag * p1i; +// double hq = q.points[i] - qmag * q1i; +// sum += hp * hq / (L * tilde_i); +// } + +// cerr << "L: " << L << endl; + +// return sum; +// } + - const auto pmag = p.getPseudoMagnitude(); - const auto qmag = q.getPseudoMagnitude(); - double sum = 0; - vector tilde(4, 0); - for (int i = 0; i < 4; i++) { - tilde[i] = (double)(p1[i] + q1[i]) / (pmag + qmag); - } - const double L = sqrt(pmag * qmag); - for (auto i = 0; i < N; i++) { - double p1i = 1; - double q1i = 1; - double tilde_i = 1; - auto idx = i; - for (int j = 0; j < k; j++) { - auto i1 = idx % 4; - idx /= 4; - p1i *= (double)p1[i1] / pmag; - q1i *= (double)q1[i1] / qmag; - tilde_i *= tilde[i1]; - } - double hp = p.points[i] - pmag * p1i; - double hq = q.points[i] - qmag * q1i; - sum += hp * hq / (L * tilde_i); - } - return sum; -} 
template double Feature::c_n2r(Point& a, Point& b) { @@ -1794,6 +2126,11 @@ double Feature::c_n2rc(Point& a, Point& b) { template double Feature::n2rc(Point& a, Point& b) const { + if(!Util::isDna){ + cerr << "n2rc cannot be calculated on protein sequences." << endl; + throw std::exception(); + } + const DivergencePoint& p = dynamic_cast&>(a); const DivergencePoint& q = dynamic_cast&>(b); const auto N = p.points.size(); @@ -1815,6 +2152,14 @@ double Feature::n2rc(Point& a, Point& b) const return total; } +// template +// void Feature::safe_insert(std::tuple k, double v){ +// # pragma omp critical +// { +// ltable.insert({k, v}); +// } +// } + template class Feature; template class Feature; template class Feature; diff --git a/src/cluster/src/Feature.h b/src/predict/Feature.h similarity index 90% rename from src/cluster/src/Feature.h rename to src/predict/Feature.h index ba7f73e..ed18f17 100644 --- a/src/cluster/src/Feature.h +++ b/src/predict/Feature.h @@ -8,13 +8,25 @@ * shared indivual features can be shared through hashing if sequence * id's are set. */ -#ifndef FEATURES_H -#define FEATURES_H +#ifndef FEATURE_H +#define FEATURE_H + +// #include "SingleFeature.h" -#include "SingleFeature.h" #include #include #include +#include +#include +#include +#include +#include +#include + +#include "../clutil/DivergencePoint.h" +#include "../utility/GlobAlignE.h" + +using namespace std; #define FEAT_ALIGN (1UL << 0) #define FEAT_HELLINGER (1UL << 1) @@ -98,16 +110,19 @@ class Feature { Feature operator=(const Feature& feat_); Feature(const int k_) : k(k_) { flags = 0; + + // Modified by Hani Z. 
Girgis on Oct 9 2018 to enable processing protein auto freverse = [](int idx, int k) { int sum = 0; + const auto A = Util::getAlphabetSize(); for (int i = 0; i < k; i++) { - int rem = idx % 4; - idx /= 4; - sum = 4 * sum + rem; - + int rem = idx % A; + idx /= A; + sum = A * sum + rem; } return sum; }; + auto freverse_complement = [](int idx, int k) { std::vector v; for (int i = 0; i < k; i++) { @@ -121,13 +136,19 @@ class Feature { return sum; }; - uint64_t k4 = 1; + uint64_t k4_22 = 1; for (int i = 0; i < k; i++) { - k4 *= 4; + k4_22 *= Util::getAlphabetSize(); } - for (int i = 0; i < k4; i++) { + + for (int i = 0; i < k4_22; i++) { reverse.push_back(freverse(i, k)); - reverse_complement.push_back(freverse_complement(i, k)); + } + + if(Util::isDna){ + for (int i = 0; i < k4_22; i++) { + reverse_complement.push_back(freverse_complement(i, k)); + } } } void add_feature(uint64_t f_flags, Combo combo=Combo::xy); @@ -135,7 +156,7 @@ class Feature { vector feat_names(); static std::string feat_name(uint64_t single); void finalize(); - + // std::vector get_raw(const vector*,Point*> >&, int index) const; void remove_feature() { // Tear down features SPECIFIC to last pairing // auto indices_to_rm = combos.back().second; // combos.pop_back(); @@ -171,11 +192,16 @@ class Feature { } void normalize(const vector > &pairs); void set_normal(uint64_t single_flag, double min, double max); + pair get_normal(uint64_t single_flag) const; + vector compute(Point& p, Point& q) { vector cache = compute_all_raw(p, q); normalize_cache(cache); return cache; }; + + // This should be called on the singles, which can be calculated + // using the compute method double operator()(int col, const vector& cache) const { auto pr = combos.at(col); Combo combo = pr.first; @@ -306,6 +332,7 @@ class Feature { std::vector get_mins() const { return mins; }; std::vector get_maxs() const { return maxs; }; std::vector get_lookup() const { return lookup; }; + int get_k() const { return k; }; private: vector 
compute_all_raw(Point& p, Point& q); @@ -333,11 +360,8 @@ class Feature { std::vector get_sims() const { return is_sims; }; std::vector get_finalized() const { return is_finalized; }; + int k; - - - - int k; int get_k() const { return k; }; uint64_t flags; bool do_save; std::vector, double> atable; std::map, double> ltable; + const std::map, double>& get_ltable() const { return ltable; } + // Added by Hani Z. Girgis + // std::vector tiedrank(const Point& a); // std::map, double> * get_table() const { return ltable; } }; @@ -377,4 +404,7 @@ class Feature { // vector > features; // std::function)> combo; // }; + +//#include "Feature.cpp" + #endif diff --git a/src/predict/FeatureSelector.cpp b/src/predict/FeatureSelector.cpp new file mode 100644 index 0000000..01455a7 --- /dev/null +++ b/src/predict/FeatureSelector.cpp @@ -0,0 +1,110 @@ +// -*- C++ -*- +/* + * FeatureSelector.cpp + * + * Author: Benjamin T James + */ + +#include "FeatureSelector.h" +template +std::pair FeatureSelector::generate_feat_mat(const vector > &data, Feature& feat, double cutoff) +{ + bool classify = (cutoff > 0); + int nrows = data.size(); + int ncols = feat.size()+1; + matrix::Matrix feat_mat(nrows, ncols); + matrix::Matrix labels(nrows, 1); +// #pragma omp parallel for + for (int row = 0; row < data.size(); row++) { + auto kv = data.at(row); + vector cache; + // #pragma omp critical + // { + cache = feat.compute(*kv.first, *kv.second); + // } + feat_mat.set(row, 0, 1); + if (classify) { + labels.set(row, 0, kv.val >= cutoff ? 
1 : -1); + } else { + labels.set(row, 0, kv.val); + // labels.set(row, 0, (kv.val - smin) / (smax - smin)); + } + for (int col = 1; col < ncols; col++) { + double val = feat(col-1, cache); + feat_mat.set(row, col, val); + } + } + return std::make_pair(feat_mat, labels); +} + + +template +std::pair FeatureSelector::regression_train(const vector > &data, Feature& feat) +{ + auto pr = generate_feat_mat(data, feat, -1); + matrix::GLM glm; + glm.train(pr.first, pr.second); + auto result1 = pr.first * glm.get_weights(); + auto diff1 = result1 - pr.second; + double sum = 0; + for (int i = 0; i < diff1.getNumRow(); i++) { + sum += fabs(diff1.get(i, 0)); + } + sum /= diff1.getNumRow(); + return {sum, glm}; +} + +template +std::pair FeatureSelector::class_train(const vector > &data, Feature& feat, double cutoff) +{ + auto pr = generate_feat_mat(data, feat, cutoff); + matrix::GLM glm; + glm.train(pr.first, pr.second); + matrix::Matrix p = glm.predict(pr.first); + for (int row = 0; row < p.getNumRow(); row++) { + if (p.get(row, 0) == 0) { + p.set(row, 0, -1); + } + } + auto tup = glm.accuracy(pr.second, p); + double acc = get<0>(tup); + double sens = get<1>(tup); + double spec = get<2>(tup); + return {acc, glm}; +} + +template +double FeatureSelector::regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm) +{ + auto pr = generate_feat_mat(data, feat, -1); + auto result1 = pr.first * glm.get_weights(); + auto diff1 = result1 - pr.second; + double sum = 0; + for (int i = 0; i < diff1.getNumRow(); i++) { + sum += fabs(diff1.get(i, 0)); + } + sum /= diff1.getNumRow(); + return sum; +} + +template +tuple FeatureSelector::class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff) +{ + auto pr = generate_feat_mat(data, feat, cutoff); + matrix::Matrix p = glm.predict(pr.first); + for (int row = 0; row < p.getNumRow(); row++) { + if (p.get(row, 0) == 0) { + p.set(row, 0, -1); + } + } + auto tup = glm.accuracy(pr.second, p); + return tup; 
+ +} + +template class FeatureSelector; +template class FeatureSelector; +template class FeatureSelector; +template class FeatureSelector; +template class FeatureSelector; +template class FeatureSelector; diff --git a/src/predict/FeatureSelector.h b/src/predict/FeatureSelector.h new file mode 100644 index 0000000..1d96de4 --- /dev/null +++ b/src/predict/FeatureSelector.h @@ -0,0 +1,27 @@ +// -*- C++ -*- +/* + * FeatureSelector.h + * + * Author: Benjamin T James + */ + +#ifndef FEATURE_SELECTOR_H +#define FEATURE_SELECTOR_H + +#include "GLM.h" +#include "Feature.h" + +template +class FeatureSelector { +public: + virtual ~FeatureSelector() {}; + static std::pair generate_feat_mat(const vector > &data, Feature& feat, double cutoff); + static std::pair class_train(const vector > &data, Feature& feat, double cutoff); + static std::pair regression_train(const vector > &data, Feature& feat); + static double regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm); + static tuple class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff); + + virtual pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing) = 0; + virtual pair*,matrix::GLM> train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id) = 0; +}; +#endif diff --git a/src/cluster/src/GLM.cpp b/src/predict/GLM.cpp similarity index 89% rename from src/cluster/src/GLM.cpp rename to src/predict/GLM.cpp index f5ef4ba..d4f37d1 100644 --- a/src/cluster/src/GLM.cpp +++ b/src/predict/GLM.cpp @@ -22,13 +22,18 @@ void GLM::train(Matrix& features, Matrix& labels){ weights = weights.pseudoInverse() * features.transpose() * labels; } + +double GLM::logistic(double x) +{ + return 1.0 / (1 + exp(-x)); +} Matrix GLM::predict(Matrix& features) const { Matrix labels; labels = features * weights; double log; for(int i = 0; i < labels.getNumRow(); i++){ - log = round(1/(1 + exp(-(labels.get(i,0))))); - 
labels.set(i,0, log); + //log = round(1/(1 + exp(-(labels.get(i,0)))) + 0.1); + labels.set(i,0, round(logistic(labels.get(i, 0)))); } return labels; } diff --git a/src/cluster/src/GLM.h b/src/predict/GLM.h similarity index 91% rename from src/cluster/src/GLM.h rename to src/predict/GLM.h index d9e150b..868dc84 100644 --- a/src/cluster/src/GLM.h +++ b/src/predict/GLM.h @@ -22,6 +22,8 @@ class GLM { void load(Matrix weights_) { weights = weights_; } void train(matrix::Matrix& features, matrix::Matrix& labels); Matrix predict(matrix::Matrix& features) const; + static double logistic(double x); + static double linear(double x); std::tuple accuracy(matrix::Matrix& oLabels, matrix::Matrix& pLabels) const; const Matrix& get_weights() const { return weights; }; }; diff --git a/src/predict/GreedySelector.cpp b/src/predict/GreedySelector.cpp new file mode 100644 index 0000000..7ec73df --- /dev/null +++ b/src/predict/GreedySelector.cpp @@ -0,0 +1,154 @@ +/* -*- C++ -*- */ +/* + * GreedySelector.cpp + * + * Author: Benjamin T James + */ +#include "GreedySelector.h" +#include "../clutil/Progress.h" + +template +pair*,matrix::GLM> GreedySelector::train_regression(Feature* feat, const vector > &training,const vector > &testing) +{ + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + vector used_list; + double abs_best_regr = 1000000; +// Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); + for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { + double best_regr_err = abs_best_regr; + uintmax_t best_idx = -1, cur_idx = 1; + auto best_regr_feat = possible_feats.front(); + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { + continue; + } + auto rfeat = possible_feats[i]; + feat->add_feature(rfeat.first, rfeat.second); + feat->normalize(training); + feat->finalize(); + auto pr = 
FeatureSelector::regression_train(training, *feat); + auto name = feat->feat_names().back(); + double regr_mse = FeatureSelector::regression_test(testing, *feat, pr.second); + feat->remove_feature(); + // prog++; + //cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " err: " << regr_mse << endl; + if (regr_mse < best_regr_err) { + best_regr_err = regr_mse; + best_regr_feat = rfeat; + best_idx = i; + } + } + if (best_regr_err < abs_best_regr) { + feat->add_feature(best_regr_feat.first, best_regr_feat.second); + feat->normalize(training); + feat->finalize(); + abs_best_regr = best_regr_err; + used_list.push_back(best_idx); + //possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_regr_feat), possible_feats.end()); + } + } +// prog.end(); + + Feature* feat_r = new Feature(*feat); + feat_r->set_save(false); + auto pr = FeatureSelector::regression_train(training, *feat_r); + matrix::GLM r_glm = pr.second; + double tr_regr_mse = FeatureSelector::regression_test(testing, *feat_r, r_glm); // "training" + cout << "Training Mean Error: " << pr.first << endl; + double regr_mse = FeatureSelector::regression_test(testing, *feat_r, r_glm);//, "testing"); + cout << "Testing Mean Error: " << regr_mse << endl; + cout << "Features: "<< endl; + for (auto line : feat_r->feat_names()) { + cout << "\t" << line << endl; + } + auto w = r_glm.get_weights(); + for (int r = 0; r < w.getNumRow(); r++) { + cout << "weight: "; + for (int c = 0; c < w.getNumCol(); c++) { + cout << w.get(r, c) << " "; + } + cout << endl; + } + +} + +template +std::pair*,matrix::GLM> GreedySelector::train_class(Feature* feat, const vector > &training,const vector > &testing, double id) +{ + auto c_size = feat->get_combos().size(); + for (int i = 0; i < c_size; i++) { + feat->remove_feature(); + } + vector used_list; + double abs_best_acc = 0; +// cout << "possible feats at one step: " << 
possible_feats.size() << endl; + Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); + + std::ostringstream oss; + for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { + double best_class_acc = abs_best_acc; + uintmax_t best_idx = -1, cur_idx = 1; + auto best_class_feat = possible_feats.front(); + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { + continue; + } + auto rfeat = possible_feats[i]; + feat->add_feature(rfeat.first, rfeat.second); + feat->normalize(training); + feat->finalize(); + auto name = feat->feat_names().back(); + auto pr = FeatureSelector::class_train(training, *feat, id); + auto class_ac = FeatureSelector::class_test(testing, *feat, pr.second, id); + double class_accuracy = get<0>(class_ac);//sqrt(get<1>(class_ac) * get<2>(class_ac)); + feat->remove_feature(); + prog++; +// cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " acc: " << get<0>(class_ac) << " sens: " << get<1>(class_ac) << " spec: " << get<2>(class_ac) << endl; + if (class_accuracy > best_class_acc) { + best_class_acc = class_accuracy; + best_class_feat = rfeat; + best_idx = i; + } + } + /* accept the feature if either 1. we don't have enough features + * or 2. 
it improves accuracy by over 0.5% + */ + if (best_class_acc > abs_best_acc || num_feat <= min_num_feat) { + feat->add_feature(best_class_feat.first, best_class_feat.second); + feat->normalize(training); + feat->finalize(); + abs_best_acc = best_class_acc; + used_list.push_back(best_idx); + oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl; + oss << "Accuracy: " << best_class_acc << endl; + possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end()); + } + } + prog.end(); + cout << oss.str(); + Feature* feat_c = new Feature(*feat); + feat_c->set_save(false); + auto pr = FeatureSelector::class_train(training, *feat_c, id); + matrix::GLM c_glm = pr.second; + auto train_results = FeatureSelector::class_test(training, *feat_c, c_glm, id);//, "train"); + cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl; + auto test_results = FeatureSelector::class_test(testing, *feat_c, c_glm, id);//, "test"); + double class_acc = get<0>(test_results); + cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl; + + cout << "Features: "<< endl; + for (auto line : feat_c->feat_names()) { + cout << "\t" << line << endl; + } + return std::make_pair(feat_c, c_glm); +} + +template class GreedySelector; +template class GreedySelector; +template class GreedySelector; +template class GreedySelector; +template class GreedySelector; +template class GreedySelector; diff --git a/src/predict/GreedySelector.h b/src/predict/GreedySelector.h new file mode 100644 index 0000000..5d4bc2f --- /dev/null +++ b/src/predict/GreedySelector.h @@ -0,0 +1,23 @@ +/* -*- C++ -*- */ +/* + * GreedySelector.h + * + * Author: Benjamin T James + */ + +#ifndef GREEDY_SELECTOR_H +#define GREEDY_SELECTOR_H +#include "FeatureSelector.h" + +template +class GreedySelector : public FeatureSelector { 
+public: + GreedySelector(vector > possible_feats_, int min_n_feat, int max_n_feat) : possible_feats(possible_feats_), min_num_feat(min_n_feat), max_num_feat(max_n_feat) {} + ~GreedySelector() {} + pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing); + pair*,matrix::GLM> train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id); +private: + int max_num_feat, min_num_feat; + vector > possible_feats; +}; +#endif diff --git a/src/cluster/src/HandleSeq.cpp b/src/predict/HandleSeq.cpp similarity index 89% rename from src/cluster/src/HandleSeq.cpp rename to src/predict/HandleSeq.cpp index 041c22a..f9e7f2f 100644 --- a/src/cluster/src/HandleSeq.cpp +++ b/src/predict/HandleSeq.cpp @@ -1,5 +1,6 @@ /** * Author: Alex Baumgartner + * Modified by Benjamin T James * The Bioinformatics Toolsmith Laboratory, the University of Tulsa * 5/15/2018 * @@ -11,11 +12,12 @@ #include "HandleSeq.h" #include // d -HandleSeq::HandleSeq(int m) { +HandleSeq::HandleSeq(int m, std::random_device::result_type rnd) { mode = m & HandleSeq::BOTH; enableTrans = m & HandleSeq::TRANSLOCATION; enableRev = m & HandleSeq::REVERSION; + random = new LCG(rnd); // disable = (m & HandleSeq::ATYPICAL) > 0 ? 
0 : 1; } @@ -71,7 +73,7 @@ pair, vector> HandleSeq::parseFile(string fileName) { } } -pair HandleSeq::mutate(string sequence, int muteRate) { +pair HandleSeq::mutate(string sequence, int muteRate, int split) { percMute = muteRate; if (muteRate == 0) { return std::make_pair(1, sequence); @@ -99,12 +101,13 @@ pair HandleSeq::mutate(string sequence, int muteRate) { } //Otherwise, assing a random percentage to both else { - percMulti = rand() % percMute; + percMulti = split; +// percMulti = random.randMod(percMute); percSing = percMute - percMulti; } //Define a new multiple mutation MultiMute multi(percAs, percCs, percGs, percTs, - percMulti, enableTrans, enableRev); + percMulti, enableTrans, enableRev, random->nextRandSeed()); //Run the multiple mutations, //get back its vector of what is valid to mutate and what isn't vector mutes = multi.genMulti(seq); @@ -112,9 +115,12 @@ pair HandleSeq::mutate(string sequence, int muteRate) { for (bool b : mutes) { cnt += b ? 1 : 0; } - + if (mutes.size() != seq->length()) { + cerr << "mutation size is not matching the multi-sequence" << endl; + throw 100; + } SingMute sing(percAs, percCs, percGs, percTs, - percSing, seq, mutes); + percSing, seq, mutes, random->nextRandSeed()); float alignmentLength = multi.getAlignmentLength() + sing.getAlignmentLength() + length; // cout << "alignLength: " << alignmentLength << endl; float IBP = length - multi.getIBP() - sing.getIBP(); @@ -130,7 +136,7 @@ pair HandleSeq::mutate(string sequence, int muteRate) { return make_pair(alignment, outseq); } -vector HandleSeq::countNucl(string sequence) { +vector HandleSeq::countNucl(const string& sequence) { int a = 0; int c = 0; int g = 0; diff --git a/src/cluster/src/HandleSeq.h b/src/predict/HandleSeq.h similarity index 83% rename from src/cluster/src/HandleSeq.h rename to src/predict/HandleSeq.h index 95a7718..f75ac0f 100644 --- a/src/cluster/src/HandleSeq.h +++ b/src/predict/HandleSeq.h @@ -1,5 +1,6 @@ /** * Author: Alex Baumgartner + * Modified by 
Benjamin T James * The Bioinformatics Toolsmith Laboratory, the University of Tulsa * 5/15/2018 * @@ -14,6 +15,7 @@ #include #include #include +#include "LCG.h" #include "MultiMute.h" #include "SingMute.h" @@ -39,7 +41,8 @@ class HandleSeq { int: the mode of the program (Single only = 1, nonsingle only = 2, both = 3) */ - HandleSeq(int); + HandleSeq(int, std::random_device::result_type seed); + ~HandleSeq() { if (random != NULL) { delete random; }} /* returns a vector of all sequences in a file inputted @@ -55,11 +58,15 @@ class HandleSeq { Mutates a sequence based on parameters inputted in constructor, and returns the mutated sequence */ - pair mutate(string, int); + pair mutate(string, int, int); + + uint32_t getSeed() const { return seed; } private: + uint32_t seed; int mode; int percMute; bool enableTrans, enableRev; + LCG *random = NULL; /* Counts the nucleotides in a file, and returns a vector corresponding to their values {A, C, G, T} @@ -70,7 +77,7 @@ class HandleSeq { @return: std::vector: vector containing ints of each nucleotide count */ - vector countNucl(string); + vector countNucl(const string&); }; diff --git a/src/cluster/src/Matrix.cpp b/src/predict/Matrix.cpp similarity index 97% rename from src/cluster/src/Matrix.cpp rename to src/predict/Matrix.cpp index 997d1c7..f1055c7 100644 --- a/src/cluster/src/Matrix.cpp +++ b/src/predict/Matrix.cpp @@ -20,6 +20,13 @@ using namespace std; namespace matrix { +Matrix::Matrix(vector vec) : numRow(1), numCol(vec.size()) { + m.at(0) = vector(vec.size()); + for (int i = 0; i < vec.size(); i++) { + set(0, i, vec[i]); + } +} + Matrix::Matrix(int r, int c) : numRow(r), numCol(c) { m.resize(r); @@ -142,7 +149,6 @@ Matrix Matrix::gaussJordanInverse() { } } else {//If it cannot perform a type 1 row swap with a non zero pivot value, the Inverse does not exist. 
cout << "Inverse does not exist\n"; - throw 0; m = temp.m; return temp; } @@ -183,13 +189,11 @@ Matrix Matrix::gaussJordanInverse() { for (int j = 0; j < numCol; j++) { if (i == j && get(i, j) != 1) { cout << "Inverse does not exist\n"; - throw 0; m = temp.m; return temp; } if (i != j && get(i, j) != 0) { cout << "Inverse does not exist\n"; - throw 0; m = temp.m; return temp; } diff --git a/src/cluster/src/Matrix.h b/src/predict/Matrix.h similarity index 90% rename from src/cluster/src/Matrix.h rename to src/predict/Matrix.h index 46a73a6..6aaffa1 100644 --- a/src/cluster/src/Matrix.h +++ b/src/predict/Matrix.h @@ -3,6 +3,7 @@ * * Created on: May 10, 2017 * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa + * Modified by Benjamin T James */ @@ -23,7 +24,7 @@ class Matrix public: - + Matrix(std::vector m); Matrix(int r, int c); Matrix(); ~Matrix(); @@ -47,6 +48,7 @@ class Matrix void rowToVector(int, std::vector&); void colToVector(int, std::vector&); int getNumRow() const; + int getNumCol() const { return numCol; }; }; } #endif /* MATRIX_H_ */ diff --git a/src/cluster/src/MultiMute.cpp b/src/predict/MultiMute.cpp similarity index 90% rename from src/cluster/src/MultiMute.cpp rename to src/predict/MultiMute.cpp index 73ee242..5acef58 100644 --- a/src/cluster/src/MultiMute.cpp +++ b/src/predict/MultiMute.cpp @@ -1,5 +1,6 @@ /** * Author: Alex Baumgartner + * Modified by Benjamin T James * The Bioinformatics Toolsmith Laboratory, the University of Tulsa * 5/15/2018 * @@ -11,7 +12,7 @@ #include #include "Random.h" -MultiMute::MultiMute(int a, int c, int g, int t, int alloc, bool enableTrans, bool enableRev) +MultiMute::MultiMute(int a, int c, int g, int t, int alloc, bool enableTrans, bool enableRev, std::random_device::result_type seed) : rng(seed) { percAs = a; percCs = c; @@ -38,22 +39,26 @@ MultiMute::MultiMute(int a, int c, int g, int t, int alloc, bool enableTrans, bo maxDel = 0; } else if (enableTrans) { if (alloc > 1) { 
- maxTrans = rand() % alloc; + maxTrans = rng.randMod(alloc); +// maxTrans = rand() % alloc; alloc -= maxTrans; } } else if (enableRev) { if (alloc > 1) { - maxReverse = rand() % alloc; + maxReverse = rng.randMod(alloc); +// maxReverse = rand() % alloc; alloc -= maxReverse; } } if (alloc > 1) { - maxDel = (rand() % alloc); + maxDel = rng.randMod(alloc); +// maxDel = (rand() % alloc); alloc -= maxDel; } if (alloc > 0) { - maxDup = rand() % alloc; + maxDup = rng.randMod(alloc); +// maxDup = rand() % alloc; alloc -= maxDup; } else { maxDup = 0; @@ -167,14 +172,15 @@ vector MultiMute::genMulti(string * sequence) void MultiMute::reverse(vector * toAddTo) { //Keep forming strings until the allocation of reverse is used up - int size; + int64_t size; //cout << "maxReverse: " << maxReverse << endl; while (maxReverse > 0) { //Automatically make it 2 to avoid modulus error - if (maxReverse == 2) { - size = 2; + if (maxReverse <= 2) { + size = maxReverse; } else { - size = (rand() % (maxReverse - 2)) + 2; + size = rng.randMod(maxReverse - 2) + 2; +// size = (rand() % (maxReverse - 2)) + 2; //Add 1 to size if the remaining reverse allocation would be 1 if (maxReverse - size == 1) { size++; @@ -190,12 +196,12 @@ void MultiMute::reverse(vector * toAddTo) void MultiMute::translocate(vector * toAddTo) { - int size; + int size = 0; //Keep forming strings until the allocation of Translocate is used up while (maxTrans > 0) { //Automatically make it 2 to avoid modulus error - if (maxTrans == 2) { - size = 2; + if (maxTrans <= 2) { + size = maxTrans; } else { size = rng.randMod(std::min(max_block_size, maxTrans - 2)) + 2; //Add 1 to size if the remaining reverse allocation would be 1 @@ -214,12 +220,13 @@ void MultiMute::translocate(vector * toAddTo) void MultiMute::insert(vector * toAddTo) { - int size; + long size = 0; + const int initial_maxInsert = maxInsert; //Keep forming strings until the allocation of insert is used up while (maxInsert > 0) { //Automatically make it 2 to avoid 
modulus error - if (maxInsert == 2) { - size = 2; + if (maxInsert <= 2) { + size = maxInsert; } else { // size = (rand() % (maxInsert - 2)) + 2; size = rng.randMod(std::min(max_block_size, maxInsert - 2)) + 2; @@ -231,6 +238,10 @@ void MultiMute::insert(vector * toAddTo) // cout << "maxInsert=" << maxInsert << " insert " << size << endl; //Add an I for where to insert, and add a generated string to the insetions vector toAddTo->push_back("I"); + if (size < 0) { + cerr << "insert is " << size << endl; + throw std::exception(); + } insertions->push_back(genInsert(size)); maxInsert -= size; } @@ -238,12 +249,12 @@ void MultiMute::insert(vector * toAddTo) void MultiMute::deleteNucl(vector * toAddTo) { - int size; + int size = 0; //Keep forming strings until the allocation of deletion is used up while (maxDel > 0) { //Automatically make it 2 to avoid modulus error - if (maxDel == 2) { - size = 2; + if (maxDel <= 2) { + size = maxDel; } else { size = rng.randMod(std::min(max_block_size, maxDel - 2)) + 2; //size = (rand() % (maxDel - 2)) + 2; @@ -262,12 +273,12 @@ void MultiMute::deleteNucl(vector * toAddTo) void MultiMute::duplicate(vector * toAddTo) { - int size; + int size = 0; //Keep forming strings until the allocation of duplicate is used up while (maxDup > 0) { //Automatically make it 2 to avoid modulus error - if (maxDup == 2) { - size = 2; + if (maxDup <= 2) { + size = maxDup; } else { size = rng.randMod(std::min(max_block_size, maxDup - 2)) + 2; // size = (rand() % (maxDup - 2)) + 2; @@ -302,7 +313,8 @@ string MultiMute::genInsert(int size) int value; //Keep adding characters based on the original distribution of nucleotides for (int i = 0; i < size; i++) { - value = rand() % (percAs + percCs + percGs + percTs); + value = rng.randMod(percAs + percCs + percGs + percTs); +// value = rand() % (percAs + percCs + percGs + percTs); if (value < percAs) { toInsert.push_back('A'); } else if (value < percAs + percCs) { @@ -340,7 +352,11 @@ vector 
MultiMute::formatString(int maxSize, vector * mutationsChars) //If it is an I, get the next insertion string and append it to the back of the mutaton string, as long as the insertion vector still has stuffing else if (mutationsChars->at(j) == 'I') { if (insertions->size() > 0) { - temp.append(insertions->back()); + std::string ins = insertions->back(); + temp.append(ins); + for (char c : ins) { + validCharacters.push_back(false); + } insertions->pop_back(); } //Increment only the char vector @@ -358,9 +374,12 @@ vector MultiMute::formatString(int maxSize, vector * mutationsChars) } //I and J are not incremented because they are incremented in the loop temp.append(temp2); - } + // } else if (mutationChars->at(j) == 'X') { + // i++; + // j++; + // } //Otherwise, skip over the nuleotide - else { + } else { i++; j++; } @@ -449,7 +468,8 @@ void MultiMute::checkForAllPalindromes(vector * toParseFrom) { } //Insert enough I's randomly for the amount of transversals that replaced reversals for (int i = 0; i < insertionChanges; i++) { - int index = rand() % toParseFrom->size(); + int index = rng.randMod(toParseFrom->size()); +// int index = rand() % toParseFrom->size(); toParseFrom->insert(toParseFrom->begin() + index, "I"); } } diff --git a/src/cluster/src/MultiMute.h b/src/predict/MultiMute.h similarity index 95% rename from src/cluster/src/MultiMute.h rename to src/predict/MultiMute.h index 8d27d6e..1e5dc4b 100644 --- a/src/cluster/src/MultiMute.h +++ b/src/predict/MultiMute.h @@ -1,5 +1,6 @@ /** * Author: Alex Baumgartner + * Modified by Benjamin T James * The Bioinformatics Toolsmith Laboratory, the University of Tulsa * 5/15/2018 * @@ -8,7 +9,7 @@ */ #ifndef MULTIMUTE_H -#define MULTIMUTE_H +#define MULTIMUTE_H #include #include @@ -16,7 +17,7 @@ #include #include #include "Random.h" - +#include "LCG.h" using namespace std; class MultiMute { @@ -33,7 +34,7 @@ class MultiMute { int: The total allocation for non-single mutations int: bool to exclude Translocate and 
reverse, 1 for disable, any other umber for include */ - MultiMute(int, int, int, int, int, bool, bool); + MultiMute(int, int, int, int, int, bool, bool, std::random_device::result_type); /* Takes in a string pointer, and mutates it based on the allocation given to the constructor. @@ -64,7 +65,7 @@ class MultiMute { int64_t alignmentLength; int64_t IBP; int64_t total_alloc; - Random rng; + LCG rng; int64_t max_block_size; std::vector * insertions; diff --git a/src/predict/Predictor.cpp b/src/predict/Predictor.cpp new file mode 100644 index 0000000..79d99d6 --- /dev/null +++ b/src/predict/Predictor.cpp @@ -0,0 +1,992 @@ +/* -*- C++ -*- + * + * Predictor.cpp + * + * Author: Benjamin T James + * + * Predictor implementation class + * train(vector<>...) is entry point, generates "semi-synthetic" sequences + * train() actually trains applicable GLM's. + * close() and similarity() are callable once trained + */ +#include "Predictor.h" +#include "../clutil/LCG.h" +#include "../clutil/Loader.h" +#include "Matrix.h" +#include "HandleSeq.h" +#include "../clutil/Progress.h" +#include "../clutil/Random.h" +#include "../clutil/Clock.h" +#include "../clutil/Datatype.h" +#include +#include +#include "FeatureSelector.h" +#include "BestFirstSelector.h" +#include "GreedySelector.h" + +template +void Predictor::save(std::string file, std::string datatype) +{ + std::ofstream out(file); + out << "k: " << k << endl; + out << "mode: " << (unsigned int)mode << endl; + out << "max_features: " << max_num_feat << endl; + out << "ID: " << id << endl; + out << "Datatype: " << datatype << endl; + out << "feature_set: " << feats64 << endl; + if (mode & PRED_MODE_CLASS) { + write_to(out, feat_c, c_glm); + } + if (mode & PRED_MODE_REGR) { + write_to(out, feat_r, r_glm); + } + +} + +template +Predictor::Predictor(const std::string filename) +{ + std::ifstream in(filename); + std::string buf; + unsigned mode_ = 0; + in >> buf >> k; + //cout << buf << k << endl; + in >> buf >> mode_; + mode = mode_; 
+// cout << buf << mode << endl; + in >> buf >> max_num_feat; +// cout << buf << max_num_feat << endl; + in >> buf >> id; +// cout << buf << id << endl; + in >> buf >> datatype; +// cout << buf << datatype << endl; + in >> buf >> feats64; +// cout << buf << feats64 << endl; + + is_trained = true; + is_training = false; + if (mode & PRED_MODE_CLASS) { + auto pr = read_from(in, k); + c_glm = pr.first; + feat_c = pr.second; + } + if (mode & PRED_MODE_REGR) { + auto pr = read_from(in, k); + r_glm = pr.first; + feat_r = pr.second; + } + Datatype::set(datatype); +} + +template +void Predictor::write_to(std::ofstream &out, Feature* feat, matrix::GLM glm) +{ + auto combos = feat->get_combos(); + auto lookup = feat->get_lookup(); + auto mins = feat->get_mins(); + auto maxs = feat->get_maxs(); + out << std::endl << "n_combos: " << combos.size() << std::endl; + out << std::setprecision(std::numeric_limits::digits10) << glm.get_weights().get(0, 0) << endl; + for (int j = 0; j < combos.size(); j++) { + auto cmb = combos[j]; + unsigned int val = 0; + uint64_t flags = 0; + for (auto i : cmb.second) { + flags |= lookup[i]; + } + switch (cmb.first) { + case Combo::xy: + val = 0; + break; + case Combo::xy2: + val = 1; + break; + case Combo::x2y: + val = 2; + break; + case Combo::x2y2: + val = 3; + break; + } + out << val << " "; + out << flags << " "; + out << std::setprecision(std::numeric_limits::digits10) << glm.get_weights().get(j+1, 0) << std::endl; + } + out << std::endl << "n_singles: " << lookup.size() << std::endl; + for (int j = 0; j < lookup.size(); j++) { + out << lookup[j] << " "; + out << std::setprecision(std::numeric_limits::digits10) << mins[j] << " "; + out << std::setprecision(std::numeric_limits::digits10) << maxs[j] << std::endl; + } +} + + +template +pair*> Predictor::read_from(std::ifstream& in, int k_) +{ + matrix::GLM glm; + int c_num_raw_feat, c_num_combos; + Feature *feat = new Feature(k_); + std::string buf; + in >> buf >> c_num_combos; +// cout << buf << 
"\"" << c_num_combos << "\"" << endl; + matrix::Matrix weights(c_num_combos+1, 1); + double d_; + in >> d_; + weights.set(0, 0, d_); + for (int i = 0; i < c_num_combos; i++) { + int cmb; + in >> cmb; + // cout << (int)cmb << endl; + uint64_t flags; + in >> flags; +// cout << flags << endl; + double d; + in >> d; +// cout << "[" << 0 << "," << i << "] " << d << endl; + weights.set(i+1, 0, d);//push_back(d); + Combo cmb_ = Combo::xy; + switch (cmb) { + case 0: + cmb_ = Combo::xy; + break; + case 1: + cmb_ = Combo::xy2; + break; + case 2: + cmb_ = Combo::x2y; + break; + case 3: + cmb_ = Combo::x2y2; + break; + default: + cerr << "error reading weights file" << endl; + break; + } + feat->add_feature(flags, cmb_); + } + + in >> buf >> c_num_raw_feat; +// cout << buf << "\"" << c_num_raw_feat << "\"" << endl; + for (int i = 0; i < c_num_raw_feat; i++) { + uint64_t single_flag; + double min_, max_; + in >> single_flag; +// cout << single_flag << endl; + in >> min_; +// cout << min_ << endl; + in >> max_; +// cout << max_ << endl; + feat->set_normal(single_flag, min_, max_); + } + feat->finalize(); + glm.load(weights); + return {glm, feat}; +} + +void identities_for_gen(double id_begin, double id_end, int num_seq, LCG& rnd, vector &to_ret) +{ + double inc = (id_end - id_begin) / num_seq; + for (size_t i = 0; i < num_seq; i++) { + double iter_id = id_begin + inc * (i + 0.5); + double actual_id = rnd.rand_between(iter_id, inc, id_begin, id_end); + int mut = round(100 - actual_id); + mut = (mut == 0) ? 
1 : mut; + to_ret.push_back(mut); + } +} + +template +void Predictor::add_feats(std::vector >& vec, uint64_t feat_flags) +{ + for (uint64_t i = 1; i <= feat_flags; i *= 2) { + if ((i & feat_flags) == 0) { + continue; + } + for (uint64_t j = 1; j <= i; j *= 2) { + if ((j & feat_flags) == 0) { + continue; + } + vec.emplace_back(i | j, Combo::xy); + vec.emplace_back(i | j, Combo::x2y2); + if (i != j) { + vec.emplace_back(i | j, Combo::x2y); + vec.emplace_back(i | j, Combo::xy2); + } + } + } +} +template +void Predictor::check() +{ + // if (!is_trained && training.size() >= threshold && !is_training) { + // omp_set_lock(&lock); + // is_training = true; + // train(); + // is_training = false; + // omp_unset_lock(&lock); + // } +} +template +double Predictor::similarity(Point* a, Point* b) +{ + if (!is_trained) { +// double d = Selector::align(a, b); + cerr << "alignment: we don't do that here" << endl; + throw "Bad"; + // return d; + // if (!is_training) { + // omp_set_lock(&lock); + // if (training.size() < testing.size() && training.size() < threshold) { + // training.push_back(pra(a, b, d)); + // } else if (training.size() >= testing.size() && testing.size() < threshold) { + // testing.push_back(pra(a, b, d)); + // } + // omp_unset_lock(&lock); + // } + return 0; + + } else { + return predict(a, b); + } +} + +template +bool Predictor::close(Point *a, Point *b) +{ + if (!is_trained) { +// double d = Selector::align(a, b); + cerr << "alignment shouldn't be used here" << endl; + throw "bad"; + // if (!is_training) { + // omp_set_lock(&lock); + // if (training.size() < testing.size() && training.size() < threshold) { + // training.push_back(pra(a, b, d)); + // } else if (training.size() >= testing.size() && testing.size() < threshold) { + // testing.push_back(pra(a, b, d)); + // } + // omp_unset_lock(&lock); + // } +// return d > id; + return false; + } + bool val = p_close(a, b); + if ((mode & PRED_MODE_REGR) && val) { + // val = p_predict(a, b) > id; + // if (!val) { + 
// cout << "FIXED" << endl; + // } + } + return val; +} + +template +double Predictor::p_predict(Point* a, Point* b) +{ + auto cache = feat_r->compute(*a, *b); + auto weights = r_glm.get_weights(); + double sum = weights.get(0, 0); + for (int col = 0; col < feat_r->size(); col++) { + double val = (*feat_r)(col, cache); + sum += weights.get(col+1, 0) * val; + } +// sum = scale_min + (scale_max - scale_min) * sum; + if (sum < 0) { + sum = 0; + } else if (sum > 1) { + sum = 1; + } + return sum; +} +template +double Predictor::predict(Point* a, Point* b) +{ + return p_predict(a, b); +} + +double _bias = 0; +//double _bias = 0; + +template +void Predictor::set_bias(double b) +{ + _bias = b; +} +template +double Predictor::classify_sum(double sum) +{ +// cout << "Bias is " << _bias << endl; + return matrix::GLM::logistic(sum) + _bias; +} + +template +bool Predictor::p_close(Point* a, Point* b) +{ + auto weights = c_glm.get_weights(); + double sum = weights.get(0, 0); + auto cache = feat_c->compute(*a, *b); + for (int col = 1; col < weights.getNumRow(); col++) { + double d = (*feat_c)(col-1, cache); + sum += weights.get(col, 0) * d; + } + return round(classify_sum(sum)) > 0; +} + + +template +std::pair generate_feat_mat(const vector > &data, Feature& feat, double cutoff, bool do_print=false)//bool classify, double cutoff, double smin, double smax) +{ + bool classify = (cutoff > 0); + int nrows = data.size(); + int ncols = feat.size()+1; + matrix::Matrix feat_mat(nrows, ncols); + matrix::Matrix labels(nrows, 1); + #pragma omp parallel for + for (int row = 0; row < data.size(); row++) { + auto kv = data.at(row); + vector cache; + // #pragma omp critical + // { + cache = feat.compute(*kv.first, *kv.second); + // } + feat_mat.set(row, 0, 1); + if (classify) { + labels.set(row, 0, kv.val >= cutoff ? 
1 : -1); + } else { + labels.set(row, 0, kv.val); + // labels.set(row, 0, (kv.val - smin) / (smax - smin)); + } + for (int col = 1; col < ncols; col++) { + double val = feat(col-1, cache); + feat_mat.set(row, col, val); + } + } + if (do_print) { + for (int row = 0; row < data.size(); row++) { + cout << "FM " << labels.get(row, 0) << " "; + for (int col = 0; col < ncols; col++) { + auto val = feat_mat.get(row, col); + cout << val << " "; + } + cout << endl; + } + cout << endl; + } + return std::make_pair(feat_mat, labels); +} + +std::string bin2acgt(const std::string& input) +{ + std::string out = ""; + for (char c : input) { + switch (c) { + case 0: + out += 'A'; + break; + case 1: + out += 'C'; + break; + case 2: + out += 'G'; + break; + case 3: + out += 'T'; + break; + default: + out += "ERR"; + } + } + return out; +} + +std::string uniqheader(std::string hdr) +{ + std::string out = ""; + bool reached_space = false; + for (char c : hdr) { + if (c == ' ') { + break; + } + out += c; + } + auto ptr = hdr.find("_mut"); + if (ptr != std::string::npos) { + return out + hdr.substr(ptr); + } else { + return out; + } + +} + +template +size_t remove_uniform(std::vector > &vec, size_t trim_size, std::vector > &out_vec) +{ + size_t N = vec.size(); + double inc = (double)N / trim_size; + if (inc <= 1) { + inc = 1; + } + size_t output_size = 0; + double i_keep = 0; + for (size_t i = 0; i < N; i++) { + if (i == round(i_keep)) { + output_size++; + out_vec.push_back(vec[i]); + i_keep += inc; + } else { + delete vec[i].second; + } + } + return output_size; +} + +template +size_t remove_uniform_old(std::vector > &vec, size_t trim_size, std::vector > &out_vec) +{ + size_t N = vec.size(); + size_t inc = N - trim_size; + if (inc <= 0) { // no removal so make sure it is never equal + inc = N; + } + size_t i_rm = N % inc; // shift off to remove ending points instead of first point + size_t output_size = 0; + for (size_t i = 0; i < N; i++) { + if (i == i_rm) { + /* dont do anything but 
set the next bad index */ + i_rm += inc; + delete vec[i].second; + } else { + output_size++; + out_vec.push_back(vec[i]); + } + } + return output_size; +} + +template +void remove_boundary(std::vector > &vec, size_t trim_size, std::vector > &out_vec, bool left_rm = false) +{ + size_t N = vec.size(); + size_t to_rm = N - trim_size; + for (size_t i = 0; i < N; i++) { + if ((!left_rm || i >= to_rm) && (left_rm || i < trim_size)) { + out_vec.push_back(vec[i].deep_clone()); + } else { + cout << "Removing point " << vec[i].val << endl; + } + delete vec[i].first; + delete vec[i].second; + } +} + + +template +void remove_random(std::vector > &vec, size_t trim_size, std::vector > &out_vec, Random& random) +{ + std::shuffle(vec.begin(), vec.end(), random.gen()); + for (size_t i = 0; i < vec.size(); i++) { + if (i < trim_size) { + out_vec.push_back(vec[i].deep_clone()); + } + delete vec[i].first; + delete vec[i].second; + } +} +template +size_t split_thd_data(std::vector > >& vec, double id, std::vector >& pos, std::vector >& neg) +{ + for (int i = 0; i < vec.size(); i++) { + for (auto pr : vec[i]) { + if (pr.val > id) { + uint64_t len = pr.first->get_length(); + uint64_t min_len = len * id; + uint64_t max_len = len / id; + uint64_t second_len = pr.second->get_length(); + if (second_len >= min_len && second_len <= max_len) { + pos.push_back(pr); + } else { + cout << "Bad generated point " << len << " " << second_len << endl; + } + } else { + neg.push_back(pr); + } + } + vec[i].clear(); + } + return min(pos.size(), neg.size()); +} +template +void Predictor::train(const vector *> &points, uintmax_t &_id, size_t total_num_samples, size_t num_templates) +{ + if (is_trained) { return; } + + // for (auto p : points) { + // cout << "H: " << p->get_header() << endl; + // } + cout << "params: total_samples: " << total_num_samples << " num_templates: " << num_templates << endl; + num_templates = min(num_templates, points.size()); + vector*> f_points_tr, f_points_test; + size_t 
total_size = points.size();// + queries.size(); + for (int i = 0; i < num_templates; i++) { + int i1 = floor((double)i * total_size / (2 * num_templates)); + int i2 = floor((i + 1) * (double)total_size / (2 * num_templates)); + f_points_tr.push_back(points.at(i1)); + f_points_test.push_back(points.at(i2)); + } + cout << "# of templates: " << num_templates << " train: " << f_points_tr.size() << " test: " << f_points_test.size() << endl; + const double pts_per_mut = (double)total_num_samples / num_templates; + // size_t q_sample = min(num_sample / 10, queries.size()); + // while (10 * f_points_tr.size() <= 11 * num_sample) { + // for (int i = 0; i < q_sample; i++) { + // int i1 = floor((double)i * queries.size() / (2 * q_sample)); + // int i2 = floor((i + 1) * (double)queries.size() / (2 * q_sample)); + // f_points_tr.push_back(queries.at(i1)); + // f_points_test.push_back(queries.at(i2)); + // } + // } + training.clear(); + testing.clear(); + if (mode & PRED_MODE_CLASS) { + vector train_seeds, test_seeds; + for (size_t i = 0; i < f_points_tr.size(); i++) { + train_seeds.push_back(random.nextRandSeed()); + } + for (size_t i = 0; i < f_points_test.size(); i++) { + test_seeds.push_back(random.nextRandSeed()); + } + std::vector > pos_buf, neg_buf; + std::vector > > thd_data(f_points_tr.size()); + cout << "mutating sequences" << endl; + int n_mut = 15; + int n_pos = 10; + int n_neg = 10; + if (1) { + auto p = f_points_tr[0]; + vector mut_rates; + std::random_device::result_type seed = random.nextRandSeed(); + LCG rnd(seed); + identities_for_gen(100 * id, 100, n_mut, rnd, mut_rates); + identities_for_gen(min_id, 100 * id, 2 * n_mut, rnd, mut_rates); + std::vector out_mut(3 * n_mut); + std::string bin_seq = p->get_data_str(); + std::string seq; + for (auto c : bin_seq) { + switch (c) { + case 0: + seq += 'A'; + break; + case 1: + seq += 'C'; + break; + case 2: + seq += 'G'; + break; + case 3: + seq += 'T'; + break; + case 'N': + seq += 'C'; + break; + default: + cout << 
"Invalid character " << c << endl; + cout << "from sequence " << bin_seq << endl; + throw 3; + } + } + #pragma omp parallel for + for (int i = 0; i < mut_rates.size(); i++) { + int mut_rate = mut_rates[i]; + HandleSeq hs(mut_type, seed); + LCG lcg(seed); + int spt = lcg.randMod(mut_rate); + auto newseq = hs.mutate(seq, mut_rate, spt); + out_mut[i] = newseq.first; + } + double P = 0; + double N = 0; + for (double val : out_mut) { + if (val > id) { + P++; + } else { + N++; + } + } + cout << "pts_per_mut: " << pts_per_mut << " / " << " P: " << P << " N: " << N << endl; + + // Avoid singular solution + P = std::max(1.0, P); + N = std::max(1.0, N); + /* Equation for solving number of pos and neg + 2 * P + N = pts_per_mut + P * n_mut_pos + N * n_mut_neg = n_mut_pos + n_mut_neg + + solved: + */ + + double nd_pos = pts_per_mut / (1 + 4*P/N); + double nd_neg = pts_per_mut / (1 + N/(P*4)); + n_pos = ceil(nd_pos); + n_neg = ceil(nd_neg); + cout << "found: " << (int)P << ", " << (int)N << " -> " << nd_pos << ", " << nd_neg << " -> " << n_pos << ", " << n_neg << endl; + cout << "final +: " << n_pos << " -: " << n_neg << endl; + // n_pos = max(n_pos, n_neg); + // n_neg = max(n_pos, n_neg); + } + + Progress prog1(f_points_tr.size(), "Generating training"); +#pragma omp parallel for + for (size_t i = 0; i < f_points_tr.size(); i++) { + auto p = f_points_tr[i]; + mutate_seqs(p, n_pos, thd_data[i], 100 * id, 100, _id, train_seeds[i]); + mutate_seqs(p, n_neg, thd_data[i], min_id, 100 * id, _id, train_seeds[i]); + #pragma omp critical + prog1++; + } + prog1.end(); + size_t buf_size = split_thd_data(thd_data, id, pos_buf, neg_buf); + cout << "training +: " << pos_buf.size() << endl; + cout << "training -: " << neg_buf.size() << endl; + auto pra_cmp = [&](const pra &a, const pra &b) { + // int fc = a.first->get_header().compare(b.first->get_header()); + // int sc = a.second->get_header().compare(b.second->get_header()); +// return fc < 0 || (fc == 0 && sc < 0); + return fabs(a.val - id) 
< fabs(b.val - id); + }; + std::sort(pos_buf.begin(), pos_buf.end(), pra_cmp); + std::sort(neg_buf.begin(), neg_buf.end(), pra_cmp); + + size_t num_pos = buf_size; + size_t num_neg = 2 * buf_size; + // remove_random(pos_buf, num_pos, training, random); + // remove_random(neg_buf, num_neg, training, random); + num_pos = remove_uniform(pos_buf, num_pos, training); + num_neg = remove_uniform(neg_buf, num_neg, training); + // remove_boundary(pos_buf, num_pos, training); + // remove_boundary(neg_buf, num_neg, training); + cout << "Training final #: +: " << num_pos << " -: " << num_neg << endl; + + + + pos_buf.clear(); + neg_buf.clear(); + thd_data.resize(f_points_test.size()); + Progress prog2(f_points_test.size(), "Generating testing"); + #pragma omp parallel for + for (size_t i = 0; i < f_points_test.size(); i++) { + auto p = f_points_test[i]; + mutate_seqs(p, n_pos, thd_data[i], 100 * id, 100, _id, test_seeds[i]); + mutate_seqs(p, n_neg, thd_data[i], min_id, 100 * id, _id, test_seeds[i]); +#pragma omp critical + prog2++; + } + prog2.end(); + buf_size = split_thd_data(thd_data, id, pos_buf, neg_buf); + cout << "testing +: " << pos_buf.size() << endl; + cout << "testing -: " << neg_buf.size() << endl; + std::sort(pos_buf.begin(), pos_buf.end(), pra_cmp); + std::sort(neg_buf.begin(), neg_buf.end(), pra_cmp); + + // std::shuffle(pos_buf.begin(), pos_buf.end(), random.gen()); + // std::shuffle(neg_buf.begin(), neg_buf.end(), random.gen()); + num_pos = buf_size; + num_neg = 2 * buf_size; + num_pos = remove_uniform(pos_buf, num_pos, testing); + num_neg = remove_uniform(neg_buf, num_neg, testing); + // remove_boundary(pos_buf, num_pos, testing); + // remove_boundary(neg_buf, num_neg, testing); + // remove_random(pos_buf, num_pos, testing, random); + // remove_random(neg_buf, num_neg, testing, random); + cout << "Testing final #: +: " << num_pos << " -: " << num_neg << endl; + Clock::stamp("data_generation"); + } else { + for (auto p : f_points_tr) { + mutate_seqs(p, 5, 
training, training, min_id, 100, _id, random.nextRandSeed()); + } + for (auto p : f_points_test) { + mutate_seqs(p, 5, testing, testing, min_id, 100, _id, random.nextRandSeed()); + } + } + train(); +} + + +template +void Predictor::filter(std::vector > &vec, std::string prefix) +{ + std::vector > > bins; + std::vector limits; + size_t num_bins = 10; + size_t smallest_bin_size = vec.size(); + for (size_t i = 0; i < num_bins; i++) { + limits.push_back(id + i * (1 - id) / num_bins); + bins.push_back(std::vector >()); + } + limits.push_back(1); + for (auto p : vec) { + for (size_t i = 1; i < limits.size(); i++) { + if (p.val <= limits[i] && p.val > limits[i-1]) { + bins[i-1].push_back(p); + break; + } + } + } + size_t bin_size = 0; + for (auto &v : bins) { + bin_size += v.size(); + // smallest_bin_size = std::min(smallest_bin_size, v.size()); + std::shuffle(v.begin(), v.end(), random.gen()); + } + smallest_bin_size = bin_size / bins.size(); + vec.clear(); + + for (auto &v : bins) { + for (size_t i = 0; i < std::min(v.size(), smallest_bin_size); i++) { + vec.push_back(v[i]); + if (prefix != "") { + cout << prefix << " bin " << i - 1 << " " << v[i].val << endl; + } + } + } + cout << "new vector size: " << vec.size() << " divided into " << bins.size() << " equal parts" << endl; +} + + +template +void Predictor::mutate_seqs(Point* p, size_t num_seq, vector > &thd_buf, double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed) +{ + LCG newRand(seed); + HandleSeq h(mut_type, newRand.nextRandSeed()); + + std::string bin_seq = p->get_data_str(); + std::string seq; + for (auto c : bin_seq) { + switch (c) { + case 0: + seq += 'A'; + break; + case 1: + seq += 'C'; + break; + case 2: + seq += 'G'; + break; + case 3: + seq += 'T'; + break; + case 'N': + seq += 'C'; + break; + default: + cout << "Invalid character " << c << endl; + cout << "from sequence " << bin_seq << endl; + throw 3; + } + } + + double inc = (id_end - id_begin) / num_seq; + for (size_t 
i = 0; i < num_seq; i++) { + double iter_id = id_begin + inc * (i + 0.5); + double actual_id = newRand.rand_between(iter_id, inc, id_begin, id_end); +// double actual_id = rand_between(iter_id, inc, id_begin, id_end); + int mut = round(100 - actual_id); + mut = (mut == 0) ? 1 : mut; + int spt = newRand.randMod(mut); + auto newseq = h.mutate(seq, mut, spt); + std::string chrom; + std::ostringstream oss; + oss << p->get_header() << "_mut" << mut << "_" << spt << "_" << i; + std::string header = oss.str(); + Point* new_pt = Loader::get_point(header, newseq.second, _id, k, false); + pra pr; + //pr.first = p->clone(); + pr.first = p; +// pr.first->set_data_str(""); +// pr.first->set_data_str(bin_seq); + pr.second = new_pt; + pr.second->set_data_str(""); +// pr.second->set_data_str(newseq.second); + pr.val = newseq.first; + thd_buf.push_back(pr); + } +} +template +void Predictor::mutate_seqs(Point* p, size_t num_seq, vector > &pos_buf, vector > &neg_buf, double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed) +{ + + LCG newRand(seed); + HandleSeq h(mut_type, newRand.nextRandSeed()); + + std::string bin_seq = p->get_data_str(); + std::string seq; + for (auto c : bin_seq) { + switch (c) { + case 0: + seq += 'A'; + break; + case 1: + seq += 'C'; + break; + case 2: + seq += 'G'; + break; + case 3: + seq += 'T'; + break; + case 'N': + seq += 'C'; + break; + default: + cout << "Invalid character " << c << endl; + cout << "from sequence " << bin_seq << endl; + throw 3; + } + } + + double inc = (id_end - id_begin) / num_seq; + for (size_t i = 0; i < num_seq; i++) { + double iter_id = id_begin + inc * (i + 0.5); + double actual_id = newRand.rand_between(iter_id, inc, id_begin, id_end); +// double actual_id = rand_between(iter_id, inc, id_begin, id_end); + int mut = round(100 - actual_id); + mut = (mut == 0) ? 
1 : mut; + int spt = newRand.randMod(mut); + auto newseq = h.mutate(seq, mut, spt); + std::string chrom; + std::ostringstream oss; + oss << p->get_header() << "_mut" << mut << "_" << spt << "_" << i; + std::string header = oss.str(); + Point* new_pt = Loader::get_point(header, newseq.second, _id, k); + pra pr; + pr.first = p->clone(); + pr.first->set_data_str(bin_seq); + pr.second = new_pt; + pr.second->set_data_str(newseq.second); + pr.val = newseq.first; +#pragma omp critical + { + if (pr.val > id) { + pos_buf.push_back(pr); + } else { + neg_buf.push_back(pr); + } + } + } +} +template +void Predictor::train() +{ + Feature feat(k); + feat.set_save(true); + + uint64_t max_feat = 0; + for (uint64_t i = 0; i < possible_feats.size(); i++) { + if (possible_feats.at(i).first > max_feat) { + max_feat |= possible_feats.at(i).first; + } + } + for (uint64_t i = 1; i <= max_feat; i *= 2) { + if (i & max_feat) { + feat.add_feature(i, Combo::xy); + } + } + feat.normalize(training); + feat.normalize(testing); + feat.finalize(); + + + + // cout << "Class Training:" << endl; + // for (auto p : training) { + // cout << p.val << " "; + // } + // cout << "Class Testing:" << endl; + // for (auto p : testing) { + // cout << p.val << " "; + // } + if (mode & PRED_MODE_CLASS) { + train_class(&feat); + if (mode & PRED_MODE_REGR) { + // vector*> f_points_tr, f_points_test; + // for (int i = 0; i < 10; i++) { + // f_points_tr.push_back(training[rand()%training.size()].first); + // f_points_test.push_back(training[rand()%training.size()].first); + // } + // training.clear(); + // testing.clear(); + // for (auto p : f_points_tr) { + // mutate_seqs(p, 50, training, 100 * id, 100); + // mutate_seqs(p, 50, training, 60, 100 * id); + // } + // for (auto p : f_points_test) { + // mutate_seqs(p, 50, testing, 100 * id, 100); + // mutate_seqs(p, 50, testing, 60, 100 * id); + // } + // filter(); + auto func = [&](pra pr) { + return pr.val <= id; + }; + training.erase(std::remove_if(training.begin(), 
training.end(), func), training.end()); + testing.erase(std::remove_if(testing.begin(), testing.end(), func), testing.end()); + filter(training);//, "training"); + filter(testing);//, "testing"); + + } + } + if (mode & PRED_MODE_REGR) { + train_regr(&feat); + } + cout << "Training size: " << training.size() << endl; + cout << "Testing size: " << testing.size() << endl; + for (auto p : training) { +// delete p.first; + delete p.second; + } + for (auto p : testing) { +// delete p.first; + delete p.second; + } + cout << endl; + feat.set_save(false); + training.clear(); + testing.clear(); + possible_feats.clear(); + is_trained = true; + // save("weights.txt"); + // exit(100); + Clock::stamp("GLM"); +} + +template +void Predictor::train_class(Feature* feat) +{ + // std::vector > bf_feats; + // for (int i = 0; bf_feats.size() < 2; i++) { + // if (possible_feats[i].second == Combo::xy) { + // bf_feats.push_back(possible_feats[i]); + // } + // } + // bf_feats.push_back(std::make_pair(FEAT_INTERSECTION, Combo::xy)); + // bf_feats.push_back(std::make_pair(FEAT_NORMALIZED_VECTORS, Combo::xy)); + FeatureSelector *fs = new BestFirstSelector(possible_feats, min_num_feat, max_num_feat); +// FeatureSelector *fs = new GreedySelector(possible_feats, min_num_feat, max_num_feat); + auto pr = fs->train_class(feat, training, testing, id); + delete fs; + feat_c = pr.first; + c_glm = pr.second; +} +template +void Predictor::train_regr(Feature* feat) +{ + FeatureSelector *fs = new GreedySelector(possible_feats, min_num_feat, max_num_feat); + auto pr = fs->train_regression(feat, training, testing); + delete fs; + feat_r = pr.first; + r_glm = pr.second; +} + +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; +template class Predictor; diff --git a/src/cluster/src/Predictor.h b/src/predict/Predictor.h similarity index 79% rename from src/cluster/src/Predictor.h rename to src/predict/Predictor.h index 
bf35036..cda6b08 100644 --- a/src/cluster/src/Predictor.h +++ b/src/predict/Predictor.h @@ -15,6 +15,7 @@ #include "Point.h" #include "Feature.h" #include +#include "Random.h" #include #define PRED_MODE_CLASS 1 #define PRED_MODE_REGR 2 @@ -26,7 +27,7 @@ template class Predictor { public: - Predictor(int k_, double id_, uint8_t mode_, uint64_t feats, int mut_type_, int min_num_feat_=3, int max_num_feat_=5, double min_id_=0.35) : k(k_), id(id_), is_trained(false), is_training(false), mode(mode_), max_num_feat(max_num_feat_), mut_type(mut_type_), min_num_feat(min_num_feat_), min_id(min_id_ * 100) { + Predictor(int k_, double id_, uint8_t mode_, uint64_t feats, int mut_type_, int min_num_feat_=3, int max_num_feat_=5, double min_id_=0.35) : k(k_), id(id_), is_trained(false), is_training(false), mode(mode_), max_num_feat(max_num_feat_), mut_type(mut_type_), min_num_feat(min_num_feat_), min_id(min_id_ * 100), feats64(feats) { add_feats(possible_feats, feats); feat_c = NULL; feat_r = NULL; @@ -45,13 +46,20 @@ class Predictor { training.clear(); testing.clear(); } - void train(const std::vector* >& vec, const std::vector* >& vecq, uintmax_t& _id, size_t num_sample); + static double classify_sum(double sum); + static void set_bias(double bias); + void train(const std::vector* >& vec, uintmax_t& _id, size_t num_sample, size_t n_templates); double similarity(Point* a, Point* b); bool close(Point* a, Point* b); - void save(std::string file); + void save(std::string file, std::string datatype); void check(); uint8_t get_mode() const { return mode; } pair*, matrix::GLM> get_class() { return std::make_pair(new Feature(*feat_c), c_glm); } + void mutate_seqs(Point* p, size_t num_seq, vector > &,vector > & , double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed); + void mutate_seqs(Point* p, size_t num_seq,vector > &,double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed); + std::string get_datatype() const { return 
datatype; } + int get_k() const { return k; } + double get_id() const { return id; } private: static void add_feats(std::vector >& vec, uint64_t flags); static pair*> read_from(std::ifstream &in, int k_); @@ -64,7 +72,7 @@ class Predictor { double predict(Point* a, Point* b); bool p_close(Point* a, Point* b); double p_predict(Point* a, Point* b); - void mutate_seqs(Point* p, size_t num_seq, vector > &,vector > & , double id_begin, double id_end, uintmax_t& _id); + Feature *feat_c, *feat_r; matrix::GLM c_glm, r_glm; vector > training, testing; @@ -74,5 +82,10 @@ class Predictor { double id, min_id; vector > possible_feats; omp_lock_t lock; + Random random; + uint64_t feats64; + std::string datatype; + double scale_min = 1000; + double scale_max = -1000; }; #endif diff --git a/src/predict/SingMute.cpp b/src/predict/SingMute.cpp new file mode 100644 index 0000000..3772e08 --- /dev/null +++ b/src/predict/SingMute.cpp @@ -0,0 +1,162 @@ +/* -*- C++ -*- */ +/* + * SingMute.cpp + * + * Original Author: Alexander Baumgartner + * Modified by Benjamin T James + */ +#include "SingMute.h" +#include +#include +#include +#include + +#ifdef MUTDEBUG +static const std::string INSERT_BEGIN = "["; +static const std::string INSERT_END = "]"; +static const std::string SWITCH_BEGIN = "("; +static const std::string SWITCH_END = ")"; +static const std::string DEL = "-"; +#else +static const std::string INSERT_BEGIN = ""; +static const std::string INSERT_END = ""; +static const std::string SWITCH_BEGIN = ""; +static const std::string SWITCH_END = ""; +static const std::string DEL = ""; +#endif + + +char SingMute::randNucl() +{ + char character; + int value = rng.randMod(percAs + percCs + percGs + percTs); +// int value = 40436 % (percAs + percCs + percGs + percTs); + if (value < percAs) { + character = 'A'; + } else if (value < percAs + percCs) { + character = 'C'; + } else if (value < percAs + percCs + percGs) { + character = 'G'; + } else { + character = 'T'; + } + return character; +} 
+void SingMute::init(const std::vector &valid) +{ + maxInsert = 0; + maxDel = 0; + maxSwitch = 0; + if (num_mut == 0) { + out_seq = std::string(*seq); + IBP = 0; + alignmentLength = 0; + return; + } else if (num_mut == 1) { + maxInsert = 1; + maxDel = 0; + maxSwitch = 0; + } else { + maxSwitch = rng.randMod(num_mut); + num_mut -= maxSwitch; + + if (maxSwitch % 2 == 1 && num_mut >= 1) { + maxSwitch++; + num_mut--; + } else if (num_mut == 0) { + maxSwitch--; + num_mut++; + } + if (num_mut > 1) { + maxInsert = rng.randMod(num_mut); + num_mut -= maxInsert; + } else { + maxInsert = num_mut; + num_mut -= maxInsert; + } + maxDel = num_mut; + } + size_t seq_len = seq->length(); + + maxDel *= seq_len / 100.0; + maxInsert *= seq_len / 100.0; + maxSwitch *= seq_len / 100.0; + alignmentLength = maxInsert; + IBP = maxDel + maxSwitch; + + + std::vector command_str(seq_len, 'S'); + long idx = 0; + long nons_len = maxInsert + maxDel + maxSwitch; + for (long i = 0; i < maxInsert; i++) { + command_str[idx++] = 'I'; + } + for (long i = 0; i < maxDel; i++) { + command_str[idx++] = 'D'; + } + for (long i = 0; i < maxSwitch; i++) { + command_str[idx++] = 'W'; + } + //std::shuffle(command_str.begin(), command_str.end(), rng.gen()); + std::shuffle(command_str.begin(), command_str.end(), std::minstd_rand0(rng.nextRandSeed())); + std::vector valid_indices; + long repl = command_str.size() - 1; + for (long i = 0; i < command_str.size(); i++) { + if (command_str[i] != 'S' && !valid[i]) { + if (!valid_indices.empty()) { + repl = valid_indices.back(); + valid_indices.pop_back(); + } else { + for (; repl > 0; repl--) { + if (valid[repl]) { + break; + } + } + } + std::swap(command_str[i], command_str[repl]); + } else if (command_str[i] == 'S' + && valid[i] + && valid_indices.size() < nons_len) { + + valid_indices.push_back(i); + } + } + // std::set s_ins, s_del, s_switch; + // generate_unique_set(command_str.size(), s_ins, maxInsert, s_del, s_switch, valid); + // 
generate_unique_set(command_str.size(), s_del, maxDel, s_ins, s_switch, valid); + // generate_unique_set(command_str.size(), s_switch, maxSwitch, s_ins, s_del, valid); + // for (auto idx : s_ins) { + // command_str[idx] = 'I'; + // } + // for (auto idx : s_del) { + // command_str[idx] = 'D'; + // } + // for (auto idx : s_switch) { + // command_str[idx] = 'W'; + // } + out_seq = ""; + out_seq.reserve(maxInsert + seq_len - maxDel + 1); + + for (long i = 0; i < seq_len; i++) { + auto cmd = command_str.at(i); + switch (cmd) { + case 'I': { + out_seq += INSERT_BEGIN + randNucl() + INSERT_END; + out_seq += seq->at(i); + break; + } + case 'S': { + out_seq += seq->at(i); + break; + } + case 'D': { + out_seq += DEL; + break; + } + case 'W': { + out_seq += SWITCH_BEGIN + randNucl() + SWITCH_END; + break; + } + } + } +} diff --git a/src/cluster/src/SingMute.h b/src/predict/SingMute.h similarity index 72% rename from src/cluster/src/SingMute.h rename to src/predict/SingMute.h index c659afd..bb97d06 100644 --- a/src/cluster/src/SingMute.h +++ b/src/predict/SingMute.h @@ -1,3 +1,10 @@ +/* -*- C++ -*- */ +/* + * SingMute.h + * + * Original Author: Alexander Baumgartner + * Modified by Benjamin T James + */ #ifndef SINGMUTE_H #define SINGMUTE_H @@ -5,6 +12,7 @@ #include #include #include "Random.h" +#include "LCG.h" class SingMute { public: @@ -19,8 +27,8 @@ class SingMute { int: percentage of T's int: The total allocation for non-single mutations */ - SingMute(int pa, int pc, int pg, int pt, uintmax_t tt, const std::string* s, const std::vector &valid_) : percAs(pa), - percCs(pc), percGs(pg), percTs(pt), num_mut(tt), seq(s) { + SingMute(int pa, int pc, int pg, int pt, uintmax_t tt, const std::string* s, const std::vector &valid_, std::random_device::result_type seed) : percAs(pa), + percCs(pc), percGs(pg), percTs(pt), num_mut(tt), seq(s), rng(seed) { init(valid_); } long getAlignmentLength() { return alignmentLength; } @@ -43,6 +51,6 @@ class SingMute { const std::string * seq; 
std::string out_seq; char randNucl(); - Random rng; + LCG rng; }; #endif diff --git a/src/utility/AffineId.cpp b/src/utility/AffineId.cpp deleted file mode 100644 index 484a5bd..0000000 --- a/src/utility/AffineId.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * AffineId.cpp - * - * Created on: Dec 6, 2012 - * Modified on: Nov 6, 2017 - * Author: Hani Zakaria Girgis, PhD - */ - -// ToDo: -// 1. Add pre-conditions after testing -#include "AffineId.h" - -#include "Util.h" -#include "../exception/InvalidInputException.h" - -#include -#include -using namespace std; -//using namespace exception; - -namespace utility { - -AffineId::AffineId(const char * seq1In, int start1In, int end1In, - const char * seq2In, int start2In, int end2In) { - - // The shorter of the two sequences is seq2 - seq1 = seq1In; - start1 = start1In; - end1 = end1In; - - seq2 = seq2In; - start2 = start2In; - end2 = end2In; - - if (end1 - start1 < end2 - start2) { - seq1 = seq2In; - start1 = start2In; - end1 = end2In; - - seq2 = seq1In; - start2 = start1In; - end2 = end1In; - } - - /* if (start1 < 0 || end1 < 0 || start1 > end1) { - string msg("Invalid Input. Start1 is "); - msg.append(Util::int2string(start1)); - msg.append(". End 1 is "); - msg.append(Util::int2string(end1)); - msg.append("."); - //throw InvalidInputException(msg); - - cerr << msg << endl; - throw exception(); - } - - if (start2 < 0 || end2 < 0 || start2 > end2) { - string msg("Invalid Input. Start2 is "); - msg.append(Util::int2string(start2)); - msg.append(". 
End2 is "); - msg.append(Util::int2string(end2)); - msg.append("."); - //throw InvalidInputException(msg); - - cerr << msg << endl; - throw exception(); - }*/ - - // Validate input - // cout << start1 << " " << end1 << endl; - // cout << start2 << " " << end2 << endl; - - len1 = end1 - start1 + 2; - len2 = end2 - start2 + 2; - - align(); -} - -AffineId::~AffineId() { -} - -void AffineId::align() { - // Initialize needed arrays - auto m = new int[len2][2](); // Middle level array - auto u = new int[len2][2](); // Upper level array - auto mId = new int[len2][2](); // Array storing number of matches in the middle array - auto uId = new int[len2][2](); // Array storing number of matches in the upper array - auto mPath = new int[len2][2](); // Array storing number of steps in the middle array - auto uPath = new int[len2][2](); // Array storing number of steps in the upper array - - // Apply the DP - // The i index is only used to get a character from the first sequence - // It is not used for filling the DP matrix - for (int i = 1; i < len1; i++) { - char base1 = seq1[start1 + i - 1]; - int lower = 0; - int lowerId = 0; - int lowerPath = 0; - - // j is the row. 
There are only two columns 0 and 1 - for (int j = 1; j < len2; j++) { - // Update the lower value - int extLower = lower + EXT; - int openLower = m[j - 1][0] + OPEN; - if (extLower > openLower) { - lower = extLower; - lowerPath++; - } else { - lower = openLower; - lowerId = mId[j - 1][0]; - lowerPath = mPath[j - 1][0] + 1; - } - - // Fill the array of the upper level - int extUpper = u[j][0] + EXT; - int openUpper = m[j][0] + OPEN; - if (extUpper > openUpper) { - u[j][1] = extUpper; - uId[j][1] = uId[j][0]; - uPath[j][1] = uPath[j][0] + 1; - } else { - u[j][1] = openUpper; - uId[j][1] = mId[j][0]; - uPath[j][1] = mPath[j][0] + 1; - } - - // Fill the array of the middle level - int matchOrMis; - if (base1 == seq2[start2 + j - 1]) { - matchOrMis = m[j - 1][0] + MATCH; - } else { - matchOrMis = m[j - 1][0] + MIS; - } - - int lowerOrUpper; - if (lower > u[j][1]) { - lowerOrUpper = lower; - } else { - lowerOrUpper = u[j][1]; - } - - if (matchOrMis > lowerOrUpper) { - m[j][1] = matchOrMis; - mPath[j][1] = mPath[j - 1][0] + 1; - if (base1 == seq2[start2 + j - 1]) { - mId[j][1] = mId[j - 1][0] + 1; - } else { - mId[j][1] = mId[j - 1][0]; - } - } else { - m[j][1] = lowerOrUpper; - if (lower > u[j][1]) { - mId[j][1] = lowerId; - mPath[j][1] = lowerPath; - } else { - mId[j][1] = uId[j][1]; - mPath[j][1] = uPath[j][1]; - } - } - } - - // // Test - // for (int h = 0; h < len2; h++) { - // cout << m[h][0] << "\t" << m[h][1] << "----" << mId[h][0] << "\t" - // << mId[h][1] << endl; - // } - // cout << "---------------------------------------------------" << endl; - // // End of test - - // Copy the second column to the first one - if (i != len1 - 1) { - for (int h = 0; h < len2; h++) { - m[h][0] = m[h][1]; - u[h][0] = u[h][1]; - mId[h][0] = mId[h][1]; - uId[h][0] = uId[h][1]; - mPath[h][0] = mPath[h][1]; - uPath[h][0] = uPath[h][1]; - } - } - } - - lenCS = mId[len2 - 1][1]; - lenPath = mPath[len2 - 1][1]; - //cout << "Alignment length = " << lenPath << endl; - delete[] u; - 
delete[] m; - delete[] mId; - delete[] uId; - delete[] mPath; - delete[] uPath; -} - -double AffineId::getAlign() { - double amt = lenCS; - return amt / (double)lenPath; -} - -} -/* namespace utility */ - -// // Testing code -// int main() { -// string s1("GATCTCAG"); -// string s2("GACAG"); - -// utility::AffineId id(s1.c_str(), 0, s1.length() - 1, s2.c_str(), 0, -// s2.length() - 1); -// cout << "Length = " << id.getLenCS() << endl; - -// return 0; -// } diff --git a/src/utility/AffineId.h b/src/utility/AffineId.h deleted file mode 100644 index 61173e7..0000000 --- a/src/utility/AffineId.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * AffineId.h - * - * Created on: Dec 6, 2012 - * Modified on: Nov 6, 2017 - * Author: Hani Zakaria Girgis, PhD - */ - -#ifndef AFFINEID_H_ -#define AFFINEID_H_ - -namespace utility { - -class AffineId { -private: - const char * seq1; - int start1; - int end1; - const char * seq2; - int start2; - int end2; - - int len1; - int len2; - //int lenTotal; - int lenCS; - int lenPath; - int * m; // Middle level - //int * l; // Lower level - int * u; // Upper level - - // const int MATCH = 4; // Score of a match - // const int MIS = -4; // Score of a mismatch - // const int OPEN = -2; // Score of a gap opening - // const int EXT = -1; // Score of a gap extension - - const int MATCH = 1; - const int MIS = -1; - const int OPEN = -2; - const int EXT = -1; - void align(); - -public: - AffineId(const char *, int, int, const char *, int, int); - virtual ~AffineId(); - double getAlign(); -}; - -} /* namespace utility */ -#endif /* AFFINEID_H_ */ diff --git a/src/utility/Util.cpp b/src/utility/Util.cpp index 4a6d4c1..c778d02 100644 --- a/src/utility/Util.cpp +++ b/src/utility/Util.cpp @@ -18,7 +18,13 @@ Util::~Util() { string Util::fileSeparator("/"); -//string * Util::emptyString = new string(""); +string * Util::emptyString = new string(""); + +bool Util::isDna = true; + +const int Util::getAlphabetSize(){ + return Util::isDna? 
4 : 22; +} void Util::readFasta(string seqFile, vector * infoList, vector * seqList, bool canCheckFormat) { diff --git a/src/utility/Util.h b/src/utility/Util.h index a9ed695..b63277a 100644 --- a/src/utility/Util.h +++ b/src/utility/Util.h @@ -33,6 +33,7 @@ class Util { public: static string * emptyString; static string fileSeparator; + static bool isDna; static void readFasta(string, vector *, vector *, bool); static void readFasta(string, vector *, vector *); static void readCoordinates(string, vector *); @@ -53,6 +54,9 @@ class Util { static int sumTotalLength(const vector *); + // Added on Oct 6 2018 + static const int getAlphabetSize(); + /** * Delete the objects pointed to by pointers in a vector. * It does not delete the vector itself.