diff --git a/.drone/build.sh b/.drone/build.sh
index a2fa58785..cd90c4652 100755
--- a/.drone/build.sh
+++ b/.drone/build.sh
@@ -12,7 +12,7 @@ cd build

 echo "[Drone build] cmake configuration"

-cmake -DDO_QUIET_MAKE=TRUE -DBOOST_ROOT=/usr -DNO_IPO=TRUE ..
+cmake -DDO_QUIET_MAKE=TRUE -DBOOST_ROOT=/usr -DNO_IPO=TRUE -DCMAKE_BUILD_TYPE=RELEASE ..

 echo "[Drone build] making salmon and installing locally (this could take a while)"
diff --git a/.travis.yml b/.travis.yml
index 9e6c65edf..91bb7d206 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -45,6 +45,7 @@ script:
 # VERBOSE=1 to show the input commands in generated Makefile for debug.
 - |
   cmake \
+    -DCMAKE_BUILD_TYPE=RELEASE \
     -DFETCH_BOOST=TRUE \
     -DNO_RTM=TRUE \
     .. && \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb2bcf4ea..7d180079a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,8 +42,8 @@ message("version: ${CPACK_PACKAGE_VERSION}")
 set(PROJECT_VERSION ${CPACK_PACKAGE_VERSION})
 set(CPACK_GENERATOR "TGZ")
 set(CPACK_SOURCE_GENERATOR "TGZ")
-set(CPACK_PACKAGE_VENDOR "Stony Brook University")
-set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Salmon - Wicked-fast RNA-seq isoform quantification using lightweight mapping")
+set(CPACK_PACKAGE_VENDOR "University of Maryland")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Salmon - Wicked-fast RNA-seq isoform quantification using selective alignment")
 set(CPACK_PACKAGE_NAME
   "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
 set(CPACK_SOURCE_PACKAGE_FILE_NAME
@@ -59,10 +59,12 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING
       "Choose the type of build." FORCE)
   # Set the possible values of build type for cmake-gui
-  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
-    "Debug" "Release")
+  #set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+  #  "Debug" "Release")
 endif()

+message(STATUS "CMAKE_BUILD_TYPE = ${CMAKE_BUILD_TYPE}")
+
 ## Set the standard required compile flags
 # Nov 18th --- removed -DHAVE_CONFIG_H
 set(REMOVE_WARNING_FLAGS "-Wno-unused-function;-Wno-unused-local-typedefs")
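The build-type hunks above lean on CMake's usual default-build-type fallback idiom. A minimal standalone sketch of the pattern as it now behaves (the project name and default value here are illustrative, not salmon's exact file):

    cmake_minimum_required(VERSION 3.9)
    project(example)

    # Fall back to a default build type only when the user supplied none;
    # FORCE writes the cached value so later configure runs see a concrete type.
    set(default_build_type "Release")
    if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
      set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING
          "Choose the type of build." FORCE)
    endif()

    # Mirrors the message(STATUS ...) added above: the chosen type is
    # always echoed at configure time.
    message(STATUS "CMAKE_BUILD_TYPE = ${CMAKE_BUILD_TYPE}")

With the set_property(... PROPERTY STRINGS ...) call commented out, cmake-gui no longer restricts the value to Debug/Release, consistent with the hunk above.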
@@ -288,8 +290,6 @@ if(NOT FETCHED_PUFFERFISH)
   set(FETCHED_PUFFERFISH TRUE CACHE BOOL "Has pufferfish been fetched?"
       FORCE)
 endif()
-
-
 ##
 # Super-secret override
 ##
@@ -459,23 +459,25 @@ elseif(FETCH_BOOST)
     --with-timer)
   set(BOOST_WILL_RECONFIGURE TRUE)
   set(FETCH_BOOST FALSE)
+  set(BOOST_FETCHED_VERSION "1_72_0")
   message("Build system will fetch and build Boost")
   message("==================================================================")
   externalproject_add(libboost
     DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-    DOWNLOAD_COMMAND curl -k -L https://dl.bintray.com/boostorg/release/1.71.0/source/boost_1_71_0.tar.gz -o boost_1_71_0.tar.gz &&
-    ${SHASUM} 96b34f7468f26a141f6020efb813f1a2f3dfb9797ecf76a7d7cbd843cc95f5bd boost_1_71_0.tar.gz &&
-    tar xzf boost_1_71_0.tar.gz
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0
+    DOWNLOAD_COMMAND curl -k -L https://sourceforge.net/projects/boost/files/boost/1.72.0/boost_1_72_0.tar.gz/download -o boost_1_72_0.tar.gz &&
+    #${SHASUM} 96b34f7468f26a141f6020efb813f1a2f3dfb9797ecf76a7d7cbd843cc95f5bd boost_1_71_0.tar.gz &&
+    ${SHASUM} c66e88d5786f2ca4dbebb14e06b566fb642a1a6947ad8cc9091f9f445134143f boost_${BOOST_FETCHED_VERSION}.tar.gz &&
+    tar xzf boost_${BOOST_FETCHED_VERSION}.tar.gz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     #PATCH_COMMAND patch -p2 < ${CMAKE_CURRENT_SOURCE_DIR}/external/boost156.patch
-    CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/bootstrap.sh ${BOOST_CONFIGURE_TOOLSET} ${BOOST_BUILD_LIBS} --prefix=
+    CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/bootstrap.sh ${BOOST_CONFIGURE_TOOLSET} ${BOOST_BUILD_LIBS} --prefix=
     add_custom_command(
-      OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/tools/build/src/user-config.jam
+      OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/tools/build/src/user-config.jam
       PRE_BUILD
       COMMAND echo "using gcc : ${CC_VERSION} : ${CMAKE_CXX_COMPILER} ;"
     )
-    BUILD_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/b2 -d0 -j${BOOST_BUILD_THREADS} ${BOOST_LIB_SUBSET} toolset=${BOOST_TOOLSET} ${BOOST_EXTRA_FLAGS} cxxflags=${BOOST_CXX_FLAGS} link=static install
+    BUILD_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/b2 -d0 -j${BOOST_BUILD_THREADS} ${BOOST_LIB_SUBSET} toolset=${BOOST_TOOLSET} ${BOOST_EXTRA_FLAGS} cxxflags=${BOOST_CXX_FLAGS} link=static install
     BUILD_IN_SOURCE 1
     INSTALL_COMMAND ""
   )
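The new DOWNLOAD_COMMAND can be sanity-checked outside of CMake. A sketch using the URL and SHA-256 from the hunk above (this assumes a GNU coreutils sha256sum; the build itself uses whatever ${SHASUM} resolves to):

    curl -k -L https://sourceforge.net/projects/boost/files/boost/1.72.0/boost_1_72_0.tar.gz/download \
         -o boost_1_72_0.tar.gz
    # sha256sum -c expects two spaces between the hash and the filename.
    echo "c66e88d5786f2ca4dbebb14e06b566fb642a1a6947ad8cc9091f9f445134143f  boost_1_72_0.tar.gz" \
      | sha256sum -c -
    tar xzf boost_1_72_0.tar.gz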
@@ -610,7 +612,7 @@ endif()
 message("Build system will fetch and build Intel Threading Building Blocks")
 message("==================================================================")
 # These are useful for the custom install step we'll do later
-set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb-2019_U8)
+set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/oneTBB-2019_U8)
 set(TBB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install)

 if("${TBB_COMPILER}" STREQUAL "gcc")
@@ -624,9 +626,9 @@ set(TBB_CXXFLAGS "${TBB_CXXFLAGS} ${CXXSTDFLAG}")
 externalproject_add(libtbb
   DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
   DOWNLOAD_COMMAND curl -k -L https://github.com/intel/tbb/archive/2019_U8.tar.gz -o tbb-2019_U8.tgz &&
-  ${SHASUM} 7b1fd8caea14be72ae4175896510bf99c809cd7031306a1917565e6de7382fba tbb-2019_U8.tgz &&
+  ${SHASUM} 6b540118cbc79f9cbc06a35033c18156c21b84ab7b6cf56d773b168ad2b68566 tbb-2019_U8.tgz &&
   tar -xzvf tbb-2019_U8.tgz
-  SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb-2019_U8
+  SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/oneTBB-2019_U8
   INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
   PATCH_COMMAND "${TBB_PATCH_STEP}"
   CONFIGURE_COMMAND ""
diff --git a/doc/source/salmon.rst b/doc/source/salmon.rst
index d17a4d685..e30ac0443 100644
--- a/doc/source/salmon.rst
+++ b/doc/source/salmon.rst
@@ -16,33 +16,32 @@
 step, obviously, is specific to the set of RNA-seq reads and is thus run more
 frequently. For a more complete description of all available options in Salmon,
 see below.

-.. note:: Mapping validation in mapping-based mode
-
-   Selective alignment, enabled by the ``--validateMappings`` flag, is a major
-   feature enhancement introduced in recent versions of salmon. When salmon is
-   run with selective alignment, it adopts a considerably more sensitive scheme
-   that we have developed for finding the potential mapping loci of a read, and
-   score potential mapping loci using the chaining algorithm introdcued in
-   minimap2 [#minimap2]_. It scores and validates these mappings using
-   the score-only, SIMD, dynamic programming algorithm of ksw2 [#ksw2]_.
-   Finally, we recommend using selective alignment with a *decoy-aware* transcriptome,
-   to mitigate potential spurious mapping of reads that actually arise from some
-   unannotated genomic locus that is sequence-similar to an annotated transcriptome.
-   The selective-alignment algorithm, the use of a decoy-aware transcriptome, and
-   the influence of running salmon with different mapping and alignment strategies
-   is covered in detail in the paper `Alignment and mapping methodology influence transcript abundance estimation `_.
+.. note:: Selective alignment
+
+   Selective alignment, first introduced by the ``--validateMappings`` flag
+   in salmon, and now the default mapping strategy (in version 1.0.0
+   forward), is a major feature enhancement introduced in recent versions of
+   salmon. When salmon is run with selective alignment, it adopts a
+   considerably more sensitive scheme that we have developed for finding the
+   potential mapping loci of a read, and scores potential mapping loci using
+   the chaining algorithm introduced in minimap2 [#minimap2]_. It scores and
+   validates these mappings using the score-only, SIMD, dynamic programming
+   algorithm of ksw2 [#ksw2]_. Finally, we recommend using selective
+   alignment with a *decoy-aware* transcriptome, to mitigate potential
+   spurious mapping of reads that actually arise from some unannotated
+   genomic locus that is sequence-similar to an annotated transcriptome. The
+   selective-alignment algorithm, the use of a decoy-aware transcriptome, and
+   the influence of running salmon with different mapping and alignment
+   strategies are covered in detail in the paper `Alignment and mapping methodology influence transcript abundance estimation `_.

    The use of selective alignment implies the use of range factorization, as
    mapping scores become very meaningful with this option. Selective
    alignment can improve the accuracy, sometimes considerably, over the
    faster, but
-   less-precise default mapping algorithm. As of salmon v0.13.1, we highly
-   recommend all users adopt selective alignment unless they have a specific
-   reason to avoid it. It is likely that this option will be enabled by default
-   in a future release. Also, there are a number of options and flags that allow
-   the user to control details about how the scoring is carried out, including
-   setting match, mismatch, and gap scores, and choosing the minimum score
-   below which an alignment will be considered invalid, and therefore not
-   used for the purposes of quantification.
+   less-precise mapping algorithm that was previously used. Also, there are a number of
+   options and flags that allow the user to control details about how the scoring is
+   carried out, including setting match, mismatch, and gap scores, and choosing the minimum
+   score below which an alignment will be considered invalid, and therefore not used for the
+   purposes of quantification.

 The **alignment**-based mode of Salmon does not require indexing. Rather, you
 can simply provide Salmon with a FASTA file of the transcripts and a SAM/BAM file
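To make the scoring controls mentioned in the note concrete, a hypothetical paired-end invocation might tune them as follows; the flag names reflect salmon's selective-alignment options, but salmon quant --help is the authoritative reference, and the index path and read files below are placeholders:

    salmon quant -i salmon_index -l A \
        -1 reads_1.fq.gz -2 reads_2.fq.gz \
        --ma 2 --mp -4 --go 6 --ge 2 \
        --minScoreFraction 0.65 \
        -p 8 -o quant_out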
@@ -105,21 +104,33 @@ Preparing transcriptome indices (mapping-based mode)
 One of the novel and innovative features of Salmon is its ability to accurately
 quantify transcripts without having previously aligned the reads using its fast,
-built-in mapping algorithms (either *quasi-mapping* or *selective alignment*).
-These approaches are typically **much** faster to compute than traditional (or
-full) alignments. Further details about the selective alignment algorithm can be
-found `here `_ and more
-details about quasi-mapping can be found `in this paper `_.
+built-in selective-alignment mapping algorithm. Further details about the selective alignment algorithm can be
+found `here `_.

 If you want to use Salmon in mapping-based mode, then you first have to build a
 salmon index for your transcriptome. Assume that ``transcripts.fa`` contains the
 set of transcripts you wish to quantify. We generally recommend that you build a
-*decoy-aware* transcriptome file and do quantification using selective alignment, which can be done with the
-`generateDecoyTranscriptome.sh
-`_
-script, whose instructions you can find `in this README
-`_. First, you
-run the salmon indexer:
+*decoy-aware* transcriptome file.
+
+There are two options for generating a decoy-aware transcriptome:
+
+- The first is to compute a set of decoy sequences by mapping the annotated transcripts you wish to index
+  against a hard-masked version of the organism's genome. This can be done with e.g.
+  `MashMap2 `_, and we provide some simple scripts to
+  greatly simplify this whole process. Specifically, you can use the
+  `generateDecoyTranscriptome.sh `_
+  script, whose instructions you can find `in this README `_.
+
+- The second is to use the entire genome of the organism as the decoy sequence. This can be
+  done by concatenating the genome to the end of the transcriptome you want to index and populating
+  the `decoys.txt` file with the chromosome names. Detailed instructions on how to prepare this
+  type of decoy sequence are available `here `_.
+  This scheme provides a more comprehensive set of decoys, but, obviously, requires considerably more memory to build the index.
+
+Finally, pre-built versions of both the *partial* decoy and *full* decoy (i.e. using the whole genome) salmon indices
+for some common organisms are available via refgenie `here `_.
+
+If you are not using a pre-computed index, you run the salmon indexer as follows:

 ::

@@ -136,12 +147,6 @@ improve sensitivity even more when using selective alignment (enabled via the `-
 if you are seeing a smaller mapping rate than you might expect, consider
 building the index with a slightly smaller `k`.

-.. note:: Decoy-augmented transcriptomes and quasi-mapping
-
-   Currently, the use of decoy-augmented transcriptomes is only supported in
-   conjunction with selective-alignment (via the `--validateMappings`, `--mimicBT2`
-   or `--mimicStrictBT2` flags. For the time being, if you wish to quantify using
-   quasi-mapping, you should not build a decoy-augmented index.
-
 Quantifying in mapping-based mode
 ---------------------------------------
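As a concrete sketch of the whole-genome (full) decoy recipe described in the salmon.rst hunk above, with transcripts.fa and genome.fa as placeholder inputs (the linked tutorial is the authoritative walkthrough):

    # Decoy names are the genome (chromosome) sequence names, one per line.
    grep '^>' genome.fa | sed -e 's/^>//' -e 's/ .*//' > decoys.txt
    # The decoy sequences must come after the transcripts in the combined file.
    cat transcripts.fa genome.fa > gentrome.fa
    salmon index -t gentrome.fa -d decoys.txt -i salmon_index -k 31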
diff --git a/doc/steps_to_prepare_release.md b/doc/steps_to_prepare_release.md
index d7de1ac39..e2cd64a3d 100644
--- a/doc/steps_to_prepare_release.md
+++ b/doc/steps_to_prepare_release.md
@@ -12,4 +12,4 @@
  9. Add release notes for the tagged master version.
  10. Upload the pre-compiled linux binary (from the CI server) to GitHub.
  11. Place a new version file on the website and update the old one.
- 12. (not technically part of release) Reset the relevant changes (steps 1,2) on the develop branch so they now point to a non-tagged RapMap.
+ 12. (not technically part of release) Reset the relevant changes (steps 1,2) on the develop branch so they now point to a non-tagged pufferfish.
diff --git a/include/AlevinOpts.hpp b/include/AlevinOpts.hpp
index 05410a422..cf08e041f 100644
--- a/include/AlevinOpts.hpp
+++ b/include/AlevinOpts.hpp
@@ -19,6 +19,7 @@ struct AlevinOpts {
   AlevinOpts(): numParsingThreads(1),
                 numConsumerThreads(2),
                 useVBEM{false},
+                numNoMapCB(0),
                 initUniform{false}{}

   //IUPAC code for the cell-barcodes
@@ -114,6 +115,7 @@ struct AlevinOpts {
   uint32_t intelligentCutoff;
   uint32_t totalLowConfidenceCBs;
   uint32_t numFeatures;
+  uint32_t numNoMapCB;
   uint32_t eqReads;
   uint32_t noisyUmis;
diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp
index a314db087..f33c9a002 100644
--- a/include/AlevinUtils.hpp
+++ b/include/AlevinUtils.hpp
@@ -74,14 +74,19 @@ namespace alevin{
     template <typename ProtocolT>
     bool processAlevinOpts(AlevinOpts<ProtocolT>& aopt,
-                           SalmonOpts& sopt,
-                           boost::program_options::variables_map& vm);
+                           SalmonOpts& sopt, bool noTgMap,
+                           boost::program_options::variables_map& vm);

     template <typename ProtocolT>
     bool extractUMI(std::string& read, ProtocolT& pt, std::string& umi);

+    template <typename ProtocolT>
+    void getReadSequence(ProtocolT& pt,
+                         std::string& seq,
+                         std::string& subseq);
+
     template <typename ProtocolT>
     nonstd::optional extractBarcode(std::string& read, ProtocolT& pt);
@@ -106,7 +111,8 @@
                       const std::string& t2gFile,
                       const std::string& refNamesFile,
                       const std::string& refLengthFile,
                       const std::string& headerFile,
-                      std::shared_ptr& jointLog);
+                      std::shared_ptr& jointLog,
+                      bool noTgMap);

     bool checkSetCoverage(std::vector>& tgroup,
                           std::vector txps);
diff --git a/include/AlignmentLibrary.hpp b/include/AlignmentLibrary.hpp
index 30c6ab1f5..218e7d895 100644
--- a/include/AlignmentLibrary.hpp
+++ b/include/AlignmentLibrary.hpp
@@ -504,6 +504,10 @@ for (auto& txp : transcripts_) {
     return numDecoys_;
   }

+  salmon::utils::DuplicateTargetStatus index_retains_duplicates() const {
+    return salmon::utils::DuplicateTargetStatus::UNKNOWN;
+  }
+
 private:
   void setTranscriptLengthClasses_(std::vector& lengths,
diff --git a/include/BWAMemStaticFuncs.hpp b/include/BWAMemStaticFuncs.hpp
deleted file mode 100644
index 7872b4c69..000000000
--- a/include/BWAMemStaticFuncs.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-#ifndef BWAMEM_STATIC_FUNCS_HPP
-#define BWAMEM_STATIC_FUNCS_HPP
-
-extern unsigned char nst_nt4_table[256];
-char const* bwa_pg = "cha";
-
-/******* STUFF THAT IS STATIC IN BWAMEM THAT WE NEED HERE --- Just re-define it
- * *************/
-#define intv_lt(a, b) ((a).info < (b).info)
-KSORT_INIT(mem_intv,
bwtintv_t, intv_lt) - -typedef struct { - bwtintv_v mem, mem1, *tmpv[2]; -} smem_aux_t; - -static smem_aux_t* smem_aux_init() { - smem_aux_t* a; - a = static_cast(calloc(1, sizeof(smem_aux_t))); - a->tmpv[0] = static_cast(calloc(1, sizeof(bwtintv_v))); - a->tmpv[1] = static_cast(calloc(1, sizeof(bwtintv_v))); - return a; -} - -static void smem_aux_destroy(smem_aux_t* a) { - free(a->tmpv[0]->a); - free(a->tmpv[0]); - free(a->tmpv[1]->a); - free(a->tmpv[1]); - free(a->mem.a); - free(a->mem1.a); - free(a); -} - -static void mem_collect_intv(const SalmonOpts& sopt, const mem_opt_t* opt, - SalmonIndex* sidx, int len, const uint8_t* seq, - smem_aux_t* a) { - const bwt_t* bwt = sidx->bwaIndex()->bwt; - size_t i, k, x = 0, old_n; - int start_width = (opt->flag & MEM_F_SELF_OVLP) ? 2 : 1; - int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); - a->mem.n = 0; - - // first pass: find all SMEMs - if (sidx->hasAuxKmerIndex()) { - KmerIntervalMap& auxIdx = sidx->auxIndex(); - int32_t klen = static_cast(auxIdx.k()); - while (x < static_cast(len)) { - if (seq[x] < 4) { - // Make sure there are at least k bases left - if (len - static_cast(x) < klen) { - x = len; - continue; - } - // search for this key in the auxiliary index - KmerKey kmer(const_cast(&(seq[x])), klen); - auto it = auxIdx.find(kmer); - // if we can't find it, move to the next key - if (it == auxIdx.end()) { - ++x; - continue; - } - // otherwise, start the search using the initial interval @it->second - // from the hash - int xb = x; - x = bwautils::bwt_smem1_with_kmer(bwt, len, seq, x, start_width, - it->second, &a->mem1, a->tmpv); - for (i = 0; i < a->mem1.n; ++i) { - bwtintv_t* p = &a->mem1.a[i]; - int slen = (uint32_t)p->info - (p->info >> 32); // seed length - if (slen >= opt->min_seed_len) - kv_push(bwtintv_t, a->mem, *p); - } - } else - ++x; - } - } else { - while (x < static_cast(len)) { - if (seq[x] < 4) { - x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); - for (i = 0; i < a->mem1.n; ++i) { - bwtintv_t* p = &a->mem1.a[i]; - int slen = (uint32_t)p->info - (p->info >> 32); // seed length - if (slen >= opt->min_seed_len) - kv_push(bwtintv_t, a->mem, *p); - } - } else - ++x; - } - } - - // For sensitive / extra-sensitive mode only - if (sopt.sensitive or sopt.extraSeedPass) { - // second pass: find MEMs inside a long SMEM - old_n = a->mem.n; - for (k = 0; k < old_n; ++k) { - bwtintv_t* p = &a->mem.a[k]; - int start = p->info >> 32, end = (int32_t)p->info; - if (end - start < split_len || p->x[2] > static_cast(opt->split_width)) - continue; - - // int idx = (start + end) >> 1; - bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, - a->tmpv); - for (i = 0; i < a->mem1.n; ++i) - if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info >> 32) >= - static_cast(opt->min_seed_len)) - kv_push(bwtintv_t, a->mem, a->mem1.a[i]); - } - } - - // For extra-sensitive mode only - // third pass: LAST-like - if (sopt.extraSeedPass and opt->max_mem_intv > 0) { - x = 0; - while (x < static_cast(len)) { - if (seq[x] < 4) { - if (1) { - bwtintv_t m; - x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, - opt->max_mem_intv, &m); - if (m.x[2] > 0) - kv_push(bwtintv_t, a->mem, m); - } else { // for now, we never come to this block which is slower - x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, - &a->mem1, a->tmpv); - for (i = 0; i < a->mem1.n; ++i) - kv_push(bwtintv_t, a->mem, a->mem1.a[i]); - } - } else - ++x; - } - } - // sort - // ks_introsort(mem_intv, a->mem.n, a->mem.a); -} - 
-/******* END OF STUFF THAT IS STATIC IN BWAMEM THAT WE NEED HERE --- Just
- * re-define it *************/
-
-#endif // BWAMEM_STATIC_FUNCS_HPP
diff --git a/include/BWAUtils.hpp b/include/BWAUtils.hpp
deleted file mode 100644
index e608464e9..000000000
--- a/include/BWAUtils.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef __BWA_UTILS_HPP__
-#define __BWA_UTILS_HPP__
-
-extern "C" {
-#include "bwa.h"
-#include "bwamem.h"
-#include "kvec.h"
-#include "utils.h"
-}
-
-namespace bwautils {
-
-// Function modified from bwt_smem1a:
-// https://github.com/lh3/bwa/blob/eb428d7d31ced059ad39af2701a22ebe6d175657/bwt.c#L289
-/**
- * Search for the k-mer of length @len starting at @q.
- * Return true if an interval is found for the k-mer and false
- * otherwise. The appropriate bwt interval will be placed
- * in @resInterval upon success.
- *
- */
-bool getIntervalForKmer(const bwt_t* bwt, // the bwt index
-                        int len,          // k-mer length
-                        const uint8_t* q, // query
-                        bwtintv_t& resInterval);
-
-// NOTE: $max_intv is not currently used in BWA-MEM
-// NOTE: Modified from the original functions to take an initial interval for
-// the search query
-int bwt_smem1a_with_kmer(const bwt_t* bwt, int len, const uint8_t* q, int x,
-                         int min_intv, uint64_t max_intv,
-                         bwtintv_t initial_interval, bwtintv_v* mem,
-                         bwtintv_v* tmpvec[2]);
-
-int bwt_smem1_with_kmer(const bwt_t* bwt, int len, const uint8_t* q, int x,
-                        int min_intv, bwtintv_t initial_interval,
-                        bwtintv_v* mem, bwtintv_v* tmpvec[2]);
-} // namespace bwautils
-
-#endif // __BWA_UTILS_HPP__
diff --git a/include/BarcodeGroup.hpp b/include/BarcodeGroup.hpp
index c05bb7478..5f3f1e20f 100644
--- a/include/BarcodeGroup.hpp
+++ b/include/BarcodeGroup.hpp
@@ -21,7 +21,7 @@ struct BarcodeGroupStringHasherMetro {
   }
 };

-//using CFreqMapT = cuckoohash_map;
+//using CFreqMapT = libcuckoo::cuckoohash_map;
 using CFreqMapT = tsl::array_map;

 using MapT = std::vector>;
diff --git a/include/CollapsedCellOptimizer.hpp b/include/CollapsedCellOptimizer.hpp
index 5f85a8096..4ae0c08a4 100644
--- a/include/CollapsedCellOptimizer.hpp
+++ b/include/CollapsedCellOptimizer.hpp
@@ -31,11 +31,11 @@ namespace bfs = boost::filesystem;
 using JqueueT = moodycamel::ConcurrentQueue;
-using eqMapT = cuckoohash_map;
+using eqMapT = libcuckoo::cuckoohash_map;
 using tgrouplabelt = std::vector;
 using tgroupweightvec = std::vector;
 using SCExpT = ReadExperiment>;
-using EqMapT = cuckoohash_map;
+using EqMapT = libcuckoo::cuckoohash_map;

 constexpr double digammaMin = 1e-10;
diff --git a/include/EMUtils.hpp b/include/EMUtils.hpp
index 541d7a149..3d2d543f4 100644
--- a/include/EMUtils.hpp
+++ b/include/EMUtils.hpp
@@ -12,6 +12,9 @@
 void EMUpdate_(std::vector>& txpGroupLabels,
                const VecT& alphaIn, VecT& alphaOut);

+/**
+ * set entries with values <= cutoff to 0.
+ **/
 template <typename VecT>
 double truncateCountVector(VecT& alphas, double cutoff);
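A self-contained sketch of the behavior that the new comment documents; the return value (taken here as the number of zeroed entries) is an assumption for illustration, not necessarily salmon's actual contract:

    #include <iostream>
    #include <vector>

    // Zero out entries with values <= cutoff, as the documented behavior
    // describes; returning the count of zeroed entries is an assumption.
    template <typename VecT>
    double truncateCountVector(VecT& alphas, double cutoff) {
      double numTruncated = 0.0;
      for (auto& a : alphas) {
        if (a <= cutoff) { a = 0.0; numTruncated += 1.0; }
      }
      return numTruncated;
    }

    int main() {
      std::vector<double> alphas{0.5, 1e-9, 2.0, 1e-12};
      std::cout << truncateCountVector(alphas, 1e-8) << " entries truncated\n";
      return 0;
    }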
diff --git a/include/EquivalenceClassBuilder.hpp b/include/EquivalenceClassBuilder.hpp
index e6564d9bd..d357d2bb5 100644
--- a/include/EquivalenceClassBuilder.hpp
+++ b/include/EquivalenceClassBuilder.hpp
@@ -135,13 +135,13 @@ class EquivalenceClassBuilder {
 public:
   EquivalenceClassBuilder(std::shared_ptr loggerIn, uint32_t maxResizeThreads)
       : logger_(loggerIn) {
-    countMap_.set_max_resize_threads(maxResizeThreads);
+    countMap_.max_num_worker_threads(maxResizeThreads);
     countMap_.reserve(1000000);
   }

   //~EquivalenceClassBuilder() {}
-  void setMaxResizeThreads(uint32_t t) { countMap_.set_max_resize_threads(t); }
-  uint32_t getMaxResizeThreads() const { return countMap_.get_max_resize_threads(); }
+  void setMaxResizeThreads(uint32_t t) { countMap_.max_num_worker_threads(t); }
+  uint32_t getMaxResizeThreads() const { return countMap_.max_num_worker_threads(); }

   void start() { active_ = true; }
@@ -212,7 +212,7 @@
                    std::vector& eqclass_counts,
                    std::vector& transcripts);

-  cuckoohash_map& eqMap(){
+  libcuckoo::cuckoohash_map& eqMap(){
     return countMap_;
   }
@@ -227,7 +227,7 @@
 private:
   std::atomic active_;
-  cuckoohash_map countMap_;
+  libcuckoo::cuckoohash_map countMap_;
   std::vector> countVec_;
   std::shared_ptr logger_;
 };
diff --git a/include/GZipWriter.hpp b/include/GZipWriter.hpp
index 0e628c2d5..94629b71a 100644
--- a/include/GZipWriter.hpp
+++ b/include/GZipWriter.hpp
@@ -96,6 +96,10 @@ class GZipWriter {

   bool setSamplingPath(const SalmonOpts& sopt);

+  void writeMtx(std::shared_ptr& jointLog,
+                boost::filesystem::path& outputDirectory,
+                size_t numGenes, size_t numCells, size_t totalExpGeneCounts);
+
 private:
   boost::filesystem::path path_;
   boost::filesystem::path bsPath_;
diff --git a/include/KmerIntervalMap.hpp b/include/KmerIntervalMap.hpp
deleted file mode 100644
index 385accf09..000000000
--- a/include/KmerIntervalMap.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef __KMER_INTERVAL_MAP_HPP__
-#define __KMER_INTERVAL_MAP_HPP__
-
-extern "C" {
-#include "bwt.h"
-}
-
-#include
-#include
-
-#include
-
-#include "cereal/archives/binary.hpp"
-#include "cereal/types/unordered_map.hpp"
-
-#include "xxhash.h"
-
-using JFMer = jellyfish::mer_dna_ns::mer_base_static;
-
-// What will be the keys in our k-mer has map
-struct KmerKey {
-  KmerKey() { mer_.polyT(); }
-
-  KmerKey(uint8_t* seq, uint32_t len) : mer_(len) {
-    mer_.polyT();
-    for (size_t i = 0; i < len; ++i) {
-      mer_.shift_left(seq[i]);
-    }
-  }
-
-  bool operator==(const KmerKey& ok) const { return mer_ == ok.mer_; }
-
-  // Is there a smarter way to do save / load here?
-  template <typename Archive> void save(Archive& archive) const {
-    auto key = mer_.get_bits(0, 2 * mer_.k());
-    archive(key);
-  }
-
-  template <typename Archive> void load(Archive& archive) {
-    mer_.polyT();
-    uint64_t bits;
-    archive(bits);
-    mer_.set_bits(0, 2 * mer_.k(), bits);
-  }
-
-  JFMer mer_;
-};
-
-template <typename Archive> void load(Archive& archive, bwtintv_t& interval) {
-  archive(interval.x[0], interval.x[1], interval.x[2], interval.info);
-}
-
-template <typename Archive>
-void save(Archive& archive, const bwtintv_t& interval) {
-  archive(interval.x[0], interval.x[1], interval.x[2], interval.info);
-}
-
-/**
- * This class provides an efficent hash-map from
- * k-mers to BWT intervals.
- */ -class KmerIntervalMap { -public: - // How we hash the keys - struct KmerHasher { - std::size_t operator()(const KmerKey& k) const { - void* data = static_cast(const_cast(k).mer_.data__()); - return XXH64(data, sizeof(uint64_t), 0); - } - }; - -private: - std::unordered_map map_; - -public: - void setK(unsigned int k) { JFMer::k(k); } - uint32_t k() { return JFMer::k(); } - - bool hasKmer(KmerKey& k) { return map_.find(k) != map_.end(); } - - decltype(map_)::iterator find(const KmerKey& k) { return map_.find(k); } - decltype(map_)::iterator find(KmerKey&& k) { return map_.find(k); } - - decltype(map_)::iterator end() { return map_.end(); } - - bwtintv_t& operator[](const KmerKey& k) { return map_[k]; } - bwtintv_t& operator[](KmerKey&& k) { return map_[k]; } - - decltype(map_)::size_type size() { return map_.size(); } - - void save(boost::filesystem::path indexPath) { - std::ofstream ofs(indexPath.string(), std::ios::binary); - { - cereal::BinaryOutputArchive oa(ofs); - oa(map_); - } - ofs.close(); - } - - void load(boost::filesystem::path indexPath) { - std::ifstream ifs(indexPath.string(), std::ios::binary); - { - cereal::BinaryInputArchive ia(ifs); - ia(map_); - } - ifs.close(); - } -}; - -#endif // __KMER_INTERVAL_MAP_HPP__ diff --git a/include/LightweightAlignmentDefs.hpp b/include/LightweightAlignmentDefs.hpp deleted file mode 100644 index d4375c25a..000000000 --- a/include/LightweightAlignmentDefs.hpp +++ /dev/null @@ -1,1575 +0,0 @@ -#ifndef LIGHTWEIGHT_ALIGNMENT_DEFS_HPP -#define LIGHTWEIGHT_ALIGNMENT_DEFS_HPP - -#include "BWAMemStaticFuncs.hpp" -#include "EffectiveLengthStats.hpp" -#include "RapMapUtils.hpp" - - -using BulkReadExpT = ReadExperiment>; - -class SMEMAlignment { -public: - SMEMAlignment() - : pos(0), fwd(false), mateIsFwd(false), - transcriptID_(std::numeric_limits::max()), - format_(LibraryFormat::formatFromID(0)), score_(0.0), fragLength_(0), - logProb(salmon::math::LOG_0), logBias(salmon::math::LOG_0) {} - - SMEMAlignment(TranscriptID transcriptIDIn, LibraryFormat format, - double scoreIn = 0.0, int32_t hitPosIn = 0, - uint32_t fragLengthIn = 0, - double logProbIn = salmon::math::LOG_0) - : pos(hitPosIn), fwd(false), mateIsFwd(false), - transcriptID_(transcriptIDIn), format_(format), score_(scoreIn), - fragLength_(fragLengthIn), fragLen(fragLengthIn), logProb(logProbIn) {} - - SMEMAlignment(const SMEMAlignment& o) = default; - SMEMAlignment(SMEMAlignment&& o) = default; - SMEMAlignment& operator=(SMEMAlignment& o) = default; - SMEMAlignment& operator=(SMEMAlignment&& o) = default; - - inline TranscriptID transcriptID() const { return transcriptID_; } - inline uint32_t fragLength() const { return fragLength_; } - inline uint32_t fragLengthPedantic(uint32_t txpLen) const { - return fragLength_; - } - inline LibraryFormat libFormat() const { return format_; } - inline double score() const { return score_; } - inline int32_t hitPos() const { return pos; } - // inline double coverage() { return static_cast(kmerCount) / - // fragLength_; }; - uint32_t kmerCount; - double logProb; - double logBias; - template void save(Archive& archive) const { - archive(transcriptID_, format_.formatID(), score_, pos, fragLength_); - } - - template void load(Archive& archive) { - uint8_t formatID; - archive(transcriptID_, formatID, score_, pos, fragLength_); - format_ = LibraryFormat::formatFromID(formatID); - } - - rapmap::utils::MateStatus mateStatus; - int32_t pos; - int32_t matePos; // JUST FOR COMPATIBILITY WITH QUASI! 
- bool fwd; - bool mateIsFwd; - uint32_t readLen; - uint32_t mateLen; - uint32_t fragLen; - -private: - TranscriptID transcriptID_; - LibraryFormat format_; - double score_; - uint32_t fragLength_; -}; - -uint32_t basesCovered(std::vector& kmerHits) { - std::sort(kmerHits.begin(), kmerHits.end()); - uint32_t covered{0}; - uint32_t lastHit{0}; - uint32_t kl{20}; - for (auto h : kmerHits) { - covered += std::min(h - lastHit, kl); - lastHit = h; - } - return covered; -} - -uint32_t basesCovered(std::vector& posLeft, - std::vector& posRight) { - return basesCovered(posLeft) + basesCovered(posRight); -} - -class KmerVote { -public: - KmerVote(int32_t vp, uint32_t rp, uint32_t vl) - : votePos(vp), readPos(rp), voteLen(vl) {} - int32_t votePos{0}; - uint32_t readPos{0}; - uint32_t voteLen{0}; - /* - std::string str(){ - return "<" + votePos + ", " + readPos + ", " + voteLen + ">"; - } - */ -}; -class MatchFragment { -public: - MatchFragment(uint32_t refStart_, uint32_t queryStart_, uint32_t length_) - : refStart(refStart_), queryStart(queryStart_), length(length_) {} - - uint32_t refStart, queryStart, length; - uint32_t weight; - double score; -}; - -bool precedes(const MatchFragment& a, const MatchFragment& b) { - return (a.refStart + a.length) < b.refStart and - (a.queryStart + a.length) < b.queryStart; -} - -class TranscriptHitList { -public: - int32_t bestHitPos{0}; - uint32_t bestHitCount{0}; - double bestHitScore{0.0}; - - std::vector votes; - std::vector rcVotes; - - uint32_t targetID; - uint32_t fwdCov{0}; - uint32_t revCov{0}; - - bool isForward_{true}; - - void addFragMatch(uint32_t tpos, uint32_t readPos, uint32_t voteLen) { - int32_t votePos = - static_cast(tpos) - static_cast(readPos); - votes.emplace_back(votePos, readPos, voteLen); - fwdCov += voteLen; - } - - void addFragMatchRC(uint32_t tpos, uint32_t readPos, uint32_t voteLen, - uint32_t readLen) { - // int32_t votePos = static_cast(tpos) - (readPos) + voteLen; - int32_t votePos = static_cast(tpos) - (readLen - readPos); - rcVotes.emplace_back(votePos, readPos, voteLen); - revCov += voteLen; - } - - uint32_t totalNumHits() { return std::max(votes.size(), rcVotes.size()); } - - bool computeBestLocFast_(std::vector& sVotes, - Transcript& transcript, std::string& read, bool isRC, - int32_t& maxClusterPos, uint32_t& maxClusterCount, - double& maxClusterScore) { - bool updatedMaxScore{true}; - if (sVotes.size() == 0) { - return updatedMaxScore; - } - uint32_t readLen = read.length(); - uint32_t votePos = sVotes.front().votePos; - - uint32_t cov = isRC ? revCov : fwdCov; - if (cov > maxClusterCount) { - maxClusterCount = cov; - maxClusterPos = votePos; - maxClusterScore = maxClusterCount / static_cast(readLen); - updatedMaxScore = true; - } - return updatedMaxScore; - } - - bool computeBestLoc_(std::vector& sVotes, Transcript& transcript, - std::string& read, bool isRC, int32_t& maxClusterPos, - uint32_t& maxClusterCount, double& maxClusterScore) { - // Did we update the highest-scoring cluster? This will be set to - // true iff we have a cluster of a higher score than the score - // currently given in maxClusterCount. 
- bool updatedMaxScore{false}; - - if (sVotes.size() == 0) { - return updatedMaxScore; - } - - struct VoteInfo { - uint32_t coverage = 0; - int32_t rightmostBase = 0; - }; - - uint32_t readLen = read.length(); - - boost::container::flat_map hitMap; - int32_t currClust{static_cast(sVotes.front().votePos)}; - - for (size_t j = 0; j < sVotes.size(); ++j) { - - int32_t votePos = sVotes[j].votePos; - uint32_t readPos = sVotes[j].readPos; - uint32_t voteLen = sVotes[j].voteLen; - - if (votePos >= currClust) { - if (votePos - currClust > 10) { - currClust = votePos; - } - auto& hmEntry = hitMap[currClust]; - - hmEntry.coverage += std::min(voteLen, (votePos + readPos + voteLen) - - hmEntry.rightmostBase); - hmEntry.rightmostBase = votePos + readPos + voteLen; - } else if (votePos < currClust) { - std::cerr << "Should not have votePos = " << votePos - << " < currClust = " << currClust << "\n"; - std::exit(1); - } - - if (hitMap[currClust].coverage > maxClusterCount) { - maxClusterCount = hitMap[currClust].coverage; - maxClusterPos = currClust; - maxClusterScore = maxClusterCount / static_cast(readLen); - updatedMaxScore = true; - } - } - return updatedMaxScore; - } - - bool computeBestLoc2_(std::vector& sVotes, uint32_t tlen, - int32_t& maxClusterPos, uint32_t& maxClusterCount, - double& maxClusterScore) { - - bool updatedMaxScore{false}; - - if (sVotes.size() == 0) { - return updatedMaxScore; - } - - double weights[] = {1.0, - 0.983471453822, - 0.935506985032, - 0.860707976425, - 0.765928338365, - 0.6592406302, - 0.548811636094, - 0.441902209585, - 0.344153786865, - 0.259240260646, - 0.188875602838}; - - int32_t maxGap = 4; - uint32_t leftmost = (sVotes.front().votePos > maxGap) - ? (sVotes.front().votePos - maxGap) - : 0; - uint32_t rightmost = static_cast( - std::min(sVotes.back().votePos + maxGap, - static_cast(tlen))); - - uint32_t span = (rightmost - leftmost); - std::vector probAln(span, 0.0); - double kwidth = 1.0 / (2.0 * maxGap); - - size_t nvotes = sVotes.size(); - for (size_t j = 0; j < nvotes; ++j) { - uint32_t votePos = sVotes[j].votePos; - uint32_t voteLen = sVotes[j].voteLen; - - auto x = j + 1; - while (x < nvotes and static_cast(sVotes[x].votePos) == votePos) { - voteLen += sVotes[x].voteLen; - j += 1; - x += 1; - } - - uint32_t dist{0}; - size_t start = (votePos >= static_cast(maxGap)) ? (votePos - maxGap - leftmost) - : (votePos - leftmost); - size_t mid = votePos - leftmost; - size_t end = std::min(votePos + maxGap - leftmost, rightmost - leftmost); - for (size_t k = start; k < end; k += 1) { - dist = (mid > k) ? mid - k : k - mid; - probAln[k] += weights[dist] * voteLen; - if (probAln[k] > maxClusterScore) { - maxClusterScore = probAln[k]; - maxClusterPos = k + leftmost; - updatedMaxScore = true; - } - } - } - - return updatedMaxScore; - } - - inline uint32_t numSampledHits_(Transcript& transcript, std::string& readIn, - int32_t votePos, int32_t posInRead, - int32_t voteLen, bool isRC, - uint32_t numTries) { - - // The read starts at this position in the transcript (may be negative!) 
- int32_t readStart = votePos; - // The (uncorrected) length of the read - int32_t readLen = readIn.length(); - // Pointer to the sequence of the read - const char* read = readIn.c_str(); - // Don't mess around with unsigned arithmetic here - int32_t tlen = transcript.RefLength; - - // If the read starts before the first base of the transcript, - // trim off the initial overhang and correct the other variables - if (readStart < 0) { - if (isRC) { - uint32_t correction = -readStart; - // std::cerr << "readLen = " << readLen << ", posInRead = " << posInRead - // << ", voteLen = " << voteLen << ", correction = " << correction << - // "\n"; std::cerr << "tlen = " << tlen << ", votePos = " << votePos << - // "\n"; - read += correction; - readLen -= correction; - posInRead -= correction; - readStart = 0; - } else { - uint32_t correction = -readStart; - read += correction; - readLen -= correction; - posInRead -= correction; - readStart = 0; - } - } - // If the read hangs off the end of the transcript, - // shorten its effective length. - if (readStart + readLen >= tlen) { - if (isRC) { - uint32_t correction = (readStart + readLen) - transcript.RefLength + 1; - // std::cerr << "Trimming RC hit: correction = " << correction << "\n"; - // std::cerr << "untrimmed read : " << read << "\n"; - read += correction; - readLen -= correction; - if (voteLen > readLen) { - voteLen = readLen; - } - posInRead = 0; - } else { - readLen = tlen - (readStart + 1); - voteLen = std::max(voteLen, readLen - (posInRead + voteLen)); - } - } - // Finally, clip any reverse complement reads starting at 0 - if (isRC) { - - if (voteLen > readStart) { - readLen -= (readLen - (posInRead + voteLen)); - } - } - - // If the read is too short, it's not useful - if (readLen <= 15) { - return 0; - } - // The step between sample centers (given the number of samples we're going - // to take) - double step = (readLen - 1) / static_cast(numTries - 1); - // The strand of the transcript from which we'll extract sequence - auto dir = (isRC) ? salmon::stringtools::strand::reverse - : salmon::stringtools::strand::forward; - - bool superVerbose{false}; - - if (superVerbose) { - std::stringstream ss; - ss << "Supposed hit " << (isRC ? 
"RC" : "") << "\n"; - ss << "info: votePos = " << votePos << ", posInRead = " << posInRead - << ", voteLen = " << voteLen << ", readLen = " << readLen - << ", tran len = " << tlen << ", step = " << step << "\n"; - if (readStart + readLen > tlen) { - ss << "ERROR!!!\n"; - std::cerr << "[[" << ss.str() << "]]"; - std::exit(1); - } - ss << "Transcript name = " << transcript.RefName << "\n"; - ss << "T : "; - try { - for (int32_t j = 0; j < readLen; ++j) { - if (isRC) { - if (j == posInRead) { - char red[] = "\x1b[30m"; - red[3] = '0' + static_cast(fmt::RED); - ss << red; - } - - if (j == posInRead + voteLen) { - const char RESET_COLOR[] = "\x1b[0m"; - ss << RESET_COLOR; - } - ss << transcript.charBaseAt(readStart + readLen - j, dir); - } else { - if (j == posInRead) { - char red[] = "\x1b[30m"; - red[3] = '0' + static_cast(fmt::RED); - ss << red; - } - - if (j == posInRead + voteLen) { - const char RESET_COLOR[] = "\x1b[0m"; - ss << RESET_COLOR; - } - - ss << transcript.charBaseAt(readStart + j); - } - } - ss << "\n"; - char red[] = "\x1b[30m"; - red[3] = '0' + static_cast(fmt::RED); - const char RESET_COLOR[] = "\x1b[0m"; - - ss << "R : " << std::string(read, posInRead) << red - << std::string(read + posInRead, voteLen) << RESET_COLOR; - if (readLen > posInRead + voteLen) { - ss << std::string(read + posInRead + voteLen); - } - ss << "\n\n"; - } catch (std::exception& e) { - std::cerr << "EXCEPTION !!!!!! " << e.what() << "\n"; - } - std::cerr << ss.str() << "\n"; - ss.clear(); - } - - // The index of the current sample within the read - int32_t readIndex = 0; - - // The number of loci in the subvotes and their - // offset patternns - size_t lpos = 3; - int leftPattern[] = {-4, -2, 0}; - int rightPattern[] = {0, 2, 4}; - int centerPattern[] = {-4, 0, 4}; - - // The number of subvote hits we've had - uint32_t numHits = 0; - // Take the samples - for (size_t i = 0; i < numTries; ++i) { - // The sample will be centered around this point - readIndex = - static_cast(std::round(readStart + i * step)) - readStart; - - // The number of successful sub-ovtes we have - uint32_t subHit = 0; - // Select the center sub-vote pattern, unless we're near the end of a read - int* pattern = ¢erPattern[0]; - if (readIndex + pattern[0] < 0) { - pattern = &rightPattern[0]; - } else if (readIndex + pattern[lpos - 1] >= readLen) { - pattern = &leftPattern[0]; - } - - // collect the subvotes - for (size_t j = 0; j < lpos; ++j) { - // the pattern offset - int offset = pattern[j]; - // and sample position it implies within the read - int readPos = readIndex + offset; - - if (readStart + readPos >= tlen) { - std::cerr << "offset = " << offset << ", readPos = " << readPos - << ", readStart = " << readStart - << ", readStart + readPos = " << readStart + readPos - << ", tlen = " << transcript.RefLength << "\n"; - } - - subHit += - (isRC) - ? 
(transcript.charBaseAt(readStart + readLen - readPos, dir) == - salmon::stringtools::charCanon[static_cast(read[readPos])]) - : (transcript.charBaseAt(readStart + readPos) == - salmon::stringtools::charCanon[static_cast(read[readPos])]); - } - // if the entire subvote was successful, this is a hit - numHits += (subHit == lpos); - } - // return the number of hits we had - return numHits; - } - - bool computeBestLoc3_(std::vector& sVotes, Transcript& transcript, - std::string& read, bool isRC, int32_t& maxClusterPos, - uint32_t& maxClusterCount, double& maxClusterScore) { - - bool updatedMaxScore{false}; - - if (sVotes.size() == 0) { - return updatedMaxScore; - } - - struct LocHitCount { - int32_t loc; - uint32_t nhits; - }; - - uint32_t numSamp = 15; - std::vector hitCounts; - size_t nvotes = sVotes.size(); - int32_t prevPos = -std::numeric_limits::max(); - for (size_t j = 0; j < nvotes; ++j) { - int32_t votePos = sVotes[j].votePos; - int32_t posInRead = sVotes[j].readPos; - int32_t voteLen = sVotes[j].voteLen; - if (prevPos == votePos) { - continue; - } - auto numHits = numSampledHits_(transcript, read, votePos, posInRead, - voteLen, isRC, numSamp); - hitCounts.push_back({votePos, numHits}); - prevPos = votePos; - } - - uint32_t maxGap = 8; - uint32_t hitIdx = 0; - uint32_t accumHits = 0; - int32_t hitLoc = hitCounts[hitIdx].loc; - while (hitIdx < hitCounts.size()) { - uint32_t idx2 = hitIdx; - while (idx2 < hitCounts.size() and - std::abs(hitCounts[idx2].loc - hitLoc) <= maxGap) { - accumHits += hitCounts[idx2].nhits; - ++idx2; - } - - double score = static_cast(accumHits) / numSamp; - if (score > maxClusterScore) { - maxClusterCount = accumHits; - maxClusterScore = score; - maxClusterPos = hitCounts[hitIdx].loc; - updatedMaxScore = true; - } - accumHits = 0; - ++hitIdx; - hitLoc = hitCounts[hitIdx].loc; - } - - return updatedMaxScore; - } - - bool computeBestChain(Transcript& transcript, std::string& read) { - std::sort(votes.begin(), votes.end(), - [](const KmerVote& v1, const KmerVote& v2) -> bool { - if (v1.votePos == v2.votePos) { - return v1.readPos < v2.readPos; - } - return v1.votePos < v2.votePos; - }); - - std::sort(rcVotes.begin(), rcVotes.end(), - [](const KmerVote& v1, const KmerVote& v2) -> bool { - if (v1.votePos == v2.votePos) { - return v1.readPos < v2.readPos; - } - return v1.votePos < v2.votePos; - }); - - int32_t maxClusterPos{0}; - uint32_t maxClusterCount{0}; - double maxClusterScore{0.0}; - - // we don't need the return value from the first call - static_cast(computeBestLoc_(votes, transcript, read, false, - maxClusterPos, maxClusterCount, - maxClusterScore)); - bool revIsBest = - computeBestLoc_(rcVotes, transcript, read, true, maxClusterPos, - maxClusterCount, maxClusterScore); - isForward_ = not revIsBest; - - bestHitPos = maxClusterPos; - bestHitCount = maxClusterCount; - bestHitScore = maxClusterScore; - return true; - } - - bool isForward() { return isForward_; } -}; - -template -void processMiniBatch(BulkReadExpT& readExp, ForgettingMassCalculator& fmCalc, - uint64_t firstTimestepOfRound, ReadLibrary& readLib, - const SalmonOpts& salmonOpts, - AlnGroupVecRange batchHits, - std::vector& transcripts, - ClusterForest& clusterForest, - FragmentLengthDistribution& fragLengthDist, - BiasParams& observedGCParams, - /** - * NOTE : test new el model in future - * EffectiveLengthStats& effLengthStats, - **/ - std::atomic& numAssignedFragments, - std::default_random_engine& randEng, bool initialRound, - std::atomic& burnedIn, double& maxZeroFrac); - -template -inline 
void collectHitsForRead(SalmonIndex* sidx, const bwtintv_v* a, - smem_aux_t* auxHits, mem_opt_t* memOptions, - const SalmonOpts& salmonOpts, - const uint8_t* read, uint32_t readLen, - std::vector& hits) { - // std::unordered_map& hits) { - - bwaidx_t* idx = sidx->bwaIndex(); - mem_collect_intv(salmonOpts, memOptions, sidx, readLen, read, auxHits); - - // For each MEM - int firstSeedLen{-1}; - for (decltype(auxHits->mem.n) i = 0; i < auxHits->mem.n; ++i) { - // A pointer to the interval of the MEMs occurences - bwtintv_t* p = &auxHits->mem.a[i]; - // The start and end positions in the query string (i.e. read) of the MEM - int qstart = p->info >> 32; - uint32_t qend = static_cast(p->info); - int step, count, slen = (qend - qstart); // seed length - - /* - if (firstSeedLen > -1) { - if (slen < firstSeedLen) { return; } - } else { - firstSeedLen = slen; - } - */ - - int64_t k; - step = (p->x[2] > static_cast(memOptions->max_occ)) ? p->x[2] / memOptions->max_occ : 1; - // For every occurrence of the MEM - for (k = count = 0; - k < static_cast(p->x[2]) && count < static_cast(memOptions->max_occ); - k += step, ++count) { - //bwtint_t pos; - bwtint_t startPos, endPos; - //int len, isRev, isRevStart, isRevEnd, refID, refIDStart, refIDEnd; - int isRev, isRevStart, isRevEnd, refID, refIDStart, refIDEnd; - int queryStart = qstart; - //len = slen; - uint32_t rlen = readLen; - - // Get the position in the reference index of this MEM occurrence - int64_t refStart = bwt_sa(idx->bwt, p->x[0] + k); - - //pos = startPos = bns_depos(idx->bns, refStart, &isRevStart); - startPos = bns_depos(idx->bns, refStart, &isRevStart); - endPos = bns_depos(idx->bns, refStart + slen - 1, &isRevEnd); - // If we span the forward/reverse boundary, discard the hit - if (isRevStart != isRevEnd) { - continue; - } - // Otherwise, isRevStart = isRevEnd so just assign isRev = isRevStart - isRev = isRevStart; - - // If the hit is reversed --- swap the start and end - if (isRev) { - if (endPos > startPos) { - salmonOpts.jointLog->warn("Hit is supposedly reversed, " - "but startPos = {} < endPos = {}", - startPos, endPos); - } - auto temp = startPos; - startPos = endPos; - endPos = temp; - } - // Get the ID of the reference sequence in which it occurs - refID = refIDStart = bns_pos2rid(idx->bns, startPos); - refIDEnd = bns_pos2rid(idx->bns, endPos); - - if (refID < 0) { - continue; - } // bridging multiple reference sequences or the forward-reverse - // boundary; - - auto tlen = idx->bns->anns[refID].len; - - // The refence sequence-relative (e.g. transcript-relative) position of - // the MEM - long hitLoc = static_cast(isRev ? endPos : startPos) - - idx->bns->anns[refID].offset; - - if ((refIDStart != refIDEnd)) { - // If a seed spans two transcripts - - // If we're not considering splitting such seeds, then - // just discard this seed and continue. - if (not salmonOpts.splitSpanningSeeds) { - continue; - } - - // std::cerr << "Seed spans two transcripts! 
--- attempting to split: - // \n"; - if (!isRev) { - // If it's going forward, we have a situation like this - // packed transcripts: t1 ===========|t2|==========> - // hit: |==========> - - // length of hit in t1 - auto len1 = tlen - hitLoc; - // length of hit in t2 - auto len2 = slen - len1; - if (std::max(len1, len2) < memOptions->min_seed_len) { - continue; - } - - /** Keeping this here for now in case I need to debug splitting seeds - again std::cerr << "\t hit is in the forward direction: "; std::cerr - << "t1 part has length " << len1 << ", t2 part has length " << len2 << - "\n"; - */ - - // If the part in t1 is larger then just cut off the rest - if (len1 >= len2) { - slen = len1; - int32_t votePos = static_cast(hitLoc) - queryStart; - // std::cerr << "\t\t t1 (of length " << tlen << ") has larger hit - // --- new hit length = " << len1 << "; starts at pos " << - // queryStart - // << " in the read (votePos will be " << votePos << ")\n"; - } else { - // Otherwise, make the hit be in t2. - // Because the hit spans the boundary where t2 begins, - // the new seed begins matching at position 0 of - // transcript t2 - hitLoc = 0; - slen = len2; - // The seed originally started at position q, now it starts len1 - // characters to the right of that - queryStart += len1; - refID = refIDEnd; - int32_t votePos = static_cast(hitLoc) - queryStart; - tlen = idx->bns->anns[refID].len; - // std::cerr << "\t\t t2 (of length " << tlen << ") has larger hit - // --- new hit length = " << len2 << "; starts at pos " << - // queryStart - // << " in the read (votePos will be " << votePos << ")\n"; - } - } else { - - // If it's going in the reverse direction, we have a situation like - // this packed transcripts: t1 <===========|t2|<========== hit: - // X======Y>======Z> Which means we have packed transcripts: t1 - // <===========|t2|<========== hit: - // bns->anns[refIDEnd].offset; - auto len1 = slen - len2; - if (std::max(len1, len2) < static_cast(memOptions->min_seed_len)) { - continue; - } - - /** Keeping this here for now in case I need to debug splitting seeds - again std::cerr << "\t hit is in the reverse direction: "; std::cerr - << "\n\n"; std::cerr << "startPos = " << startPos << ", endPos = " << - endPos << ", offset[refIDStart] = " - << idx->bns->anns[refIDStart].offset << ", offset[refIDEnd] - = " << idx->bns->anns[refIDEnd].offset << "\n"; std::cerr << "\n\n"; - std::cerr << "t1 part has length " << len1 << ", t2 part has length " - << len2 << "\n\n"; - */ - - if (len1 >= len2) { - slen = len1; - hitLoc = tlen - len2; - queryStart += len2; - rlen -= len2; - int32_t votePos = - static_cast(hitLoc) - (rlen - queryStart); - // std::cerr << "\t\t t1 (hitLoc: " << hitLoc << ") (of length " << - // tlen << ") has larger hit --- new hit length = " << len1 << "; - // starts at pos " << queryStart << " in the read (votePos will be " - // << votePos << ")\n"; - } else { - slen = len2; - refID = bns_pos2rid(idx->bns, endPos); - tlen = idx->bns->anns[refID].len; - hitLoc = len2; - rlen = hitLoc + queryStart; - int32_t votePos = - static_cast(hitLoc) - (rlen - queryStart); - // std::cerr << "\t\t t2 (of length " << tlen << ") (hitLoc: " << - // hitLoc << ") has larger hit --- new hit length = " << len2 << "; - // starts at pos " << queryStart << " in the read (votePos will be " - // << votePos << ")\n"; - } - } - } - - auto hitIt = std::find_if(hits.begin(), hits.end(), - [refID](CoverageCalculator& c) -> bool { - return (c.targetID) == static_cast(refID); - }); - if (isRev) { - if (hitIt == hits.end()) 
{ - CoverageCalculator hit; - hit.targetID = refID; - hit.addFragMatchRC(hitLoc, queryStart, slen, rlen); - hits.emplace_back(hit); - } else { - hitIt->addFragMatchRC(hitLoc, queryStart, slen, rlen); - // hits[refID].addFragMatchRC(hitLoc, queryStart , slen, rlen); - } - } else { - if (hitIt == hits.end()) { - CoverageCalculator hit; - hit.targetID = refID; - hit.addFragMatch(hitLoc, queryStart, slen); - hits.emplace_back(hit); - } else { - hitIt->addFragMatch(hitLoc, queryStart, slen); - // hits[refID].addFragMatch(hitLoc, queryStart, slen); - } - } - } // for k - } -} - -/* -constexpr inline bool consistentNames(header_sequence_qual& r) { return true; } - -constexpr inline bool consistentNames( - std::pair& rp) { - auto l1 = rp.first.header.length(); - auto l2 = rp.second.header.length(); - char* sptr = static_cast(memchr(&rp.first.header[0], ' ', l1)); - - bool compat = false; - // If we didn't find a space in the name of read1 - if (sptr == NULL) { - if (l1 > 1) { - compat = (l1 == l2); - compat = compat and - (memcmp(&rp.first.header[0], &rp.second.header[0], l1 - 1) == 0); - compat = - compat and ((rp.first.header[l1 - 1] == '1' and - rp.second.header[l2 - 1] == '2') or - (rp.first.header[l1 - 1] == rp.second.header[l2 - 1])); - } else { - compat = (l1 == l2); - compat = compat and (rp.first.header[0] == rp.second.header[0]); - } - } else { - size_t offset = sptr - (&rp.first.header[0]); - - // If read2 matches read1 up to and including the space - if (offset + 1 < l2) { - compat = memcmp(&rp.first.header[0], &rp.second.header[0], offset) == 0; - // and after the space, read1 and read2 have an identical character or - // read1 has a '1' and read2 has a '2', then this is a consistent pair. - compat = compat and - ((rp.first.header[offset + 1] == rp.second.header[offset + 1]) or - (rp.first.header[offset + 1] == '1' and - rp.second.header[offset + 1] == '2')); - } else { - compat = false; - } - } - return compat; -} -*/ - -/** - * Returns true if the @hit is within @cutoff bases of the end of - * transcript @txp and false otherwise. - */ -template -inline bool -nearEndOfTranscript(CoverageCalculator& hit, Transcript& txp, - int32_t cutoff = std::numeric_limits::max()) { - // check if hit appears close to the end of the given transcript - bool isForward = hit.isForward(); - int32_t hitPos = static_cast(hit.bestHitPos); - return (hitPos <= cutoff or - std::abs(static_cast(txp.RefLength) - hitPos) <= cutoff); -} - -template -inline void getHitsForFragment( - fastx_parser::ReadPair& frag, - // std::pair& frag, - SalmonIndex* sidx, smem_i* itr, const bwtintv_v* a, smem_aux_t* auxHits, - mem_opt_t* memOptions, BulkReadExpT& readExp, - const SalmonOpts& salmonOpts, double coverageThresh, - uint64_t& upperBoundHits, AlignmentGroup& hitList, - uint64_t& hitListCount, std::vector& transcripts) { - - // std::unordered_map leftHits; - // std::unordered_map rightHits; - - std::vector leftHits; - std::vector rightHits; - - uint32_t leftReadLength{0}; - uint32_t rightReadLength{0}; - - auto& eqBuilder = readExp.equivalenceClassBuilder(); - bool allowOrphans{salmonOpts.allowOrphans}; - - /** - * As soon as we can decide on an acceptable way to validate read names, - * we'll inform the user and quit if we see something inconsistent. However, - * we first need a reasonable way to verify potential naming formats from - * many different sources. 
- */ - /* - if (!consistentNames(frag)) { - fmt::MemoryWriter errstream; - - errstream << "Inconsistent paired-end reads!\n"; - errstream << "mate1 : " << frag.first.header << "\n"; - errstream << "mate2 : " << frag.second.header << "\n"; - errstream << "Paired-end reads should appear consistently in their - respective files.\n"; errstream << "Please fix the paire-end input before - quantifying with salmon; exiting.\n"; - - std::cerr << errstream.str(); - std::exit(-1); - } - */ - - //---------- End 1 ----------------------// - { - std::string readStr = frag.first.seq; - uint32_t readLen = readStr.size(); - - leftReadLength = readLen; - - for (decltype(readLen) p = 0; p < readLen; ++p) { - readStr[p] = nst_nt4_table[static_cast(readStr[p])]; - } - - collectHitsForRead(sidx, a, auxHits, memOptions, salmonOpts, - reinterpret_cast(readStr.c_str()), - readLen, leftHits); - } - - //---------- End 2 ----------------------// - { - std::string readStr = frag.second.seq; - uint32_t readLen = readStr.size(); - - rightReadLength = readLen; - - for (decltype(readLen) p = 0; p < readLen; ++p) { - readStr[p] = nst_nt4_table[static_cast(readStr[p])]; - } - - collectHitsForRead(sidx, a, auxHits, memOptions, salmonOpts, - reinterpret_cast(readStr.c_str()), - readLen, rightHits); - } // end right - - size_t numTrivialHits = (leftHits.size() + rightHits.size() > 0) ? 1 : 0; - upperBoundHits += (leftHits.size() + rightHits.size() > 0) ? 1 : 0; - size_t readHits{0}; - auto& alnList = hitList.alignments(); - hitList.isUniquelyMapped() = true; - alnList.clear(); - // nothing more to do - if (numTrivialHits == 0) { - return; - } - - double cutoffLeft{coverageThresh}; //* leftReadLength}; - double cutoffRight{coverageThresh}; //* rightReadLength}; - - uint64_t leftHitCount{0}; - - // Fraction of the optimal coverage that a lightweight alignment - // must obtain in order to be retained. - float fOpt{0.95}; - - // First, see if there are transcripts where both ends of the - // fragments map - auto& minHitList = - (leftHits.size() < rightHits.size()) ? leftHits : rightHits; - auto& maxHitList = - (leftHits.size() < rightHits.size()) ? rightHits : leftHits; - - struct JointHitPtr { - uint32_t transcriptID; - size_t leftIndex; - size_t rightIndex; - }; - - std::vector jointHits; // haha (variable name)! 
-  jointHits.reserve(minHitList.size());
-
-  // vector-based code
-  // Sort the left and right hits
-  std::sort(
-      leftHits.begin(), leftHits.end(),
-      [](const CoverageCalculator& c1, const CoverageCalculator& c2) -> bool {
-        return c1.targetID < c2.targetID;
-      });
-  std::sort(
-      rightHits.begin(), rightHits.end(),
-      [](const CoverageCalculator& c1, const CoverageCalculator& c2) -> bool {
-        return c1.targetID < c2.targetID;
-      });
-  // Take the intersection of these two hit lists
-  // Adapted from : http://en.cppreference.com/w/cpp/algorithm/set_intersection
-  {
-    auto leftIt = leftHits.begin();
-    auto leftEnd = leftHits.end();
-    auto rightIt = rightHits.begin();
-    auto rightEnd = rightHits.end();
-    while (leftIt != leftEnd && rightIt != rightEnd) {
-      if (leftIt->targetID < rightIt->targetID) {
-        ++leftIt;
-      } else {
-        if (!(rightIt->targetID < leftIt->targetID)) {
-          jointHits.push_back(
-              {leftIt->targetID,
-               static_cast<uint32_t>(std::distance(leftHits.begin(), leftIt)),
-               static_cast<uint32_t>(std::distance(rightHits.begin(), rightIt))});
-          ++leftIt;
-        }
-        ++rightIt;
-      }
-    }
-  }
-  // End vector-based code
-
-  /* map based code
-  {
-    auto notFound = maxHitList.end();
-    for (auto& kv : minHitList) {
-      uint64_t refID = kv.first;
-      if (maxHitList.find(refID) != notFound) {
-        jointHits.emplace_back(refID);
-      }
-    }
-  }
-  */
-
-  // Check if the fragment generated orphaned
-  // lightweight alignments.
-  bool isOrphan = (jointHits.size() == 0);
-
-  uint32_t firstTranscriptID = std::numeric_limits<uint32_t>::max();
-  double bestScore = -std::numeric_limits<double>::max();
-  bool sortedByTranscript = true;
-  int32_t lastTranscriptId = std::numeric_limits<int32_t>::min();
-
-  if (BOOST_UNLIKELY(isOrphan and allowOrphans)) {
-    // std::vector<CoverageCalculator> allHits;
-    // allHits.reserve(totalHits);
-
-    // search for a hit on the left
-    for (auto& tHitList : leftHits) {
-      auto transcriptID = tHitList.targetID;
-      auto& covChain = tHitList;
-      Transcript& t = transcripts[transcriptID];
-      if (!t.hasAnchorFragment()) {
-        continue;
-      }
-
-      covChain.computeBestChain(t, frag.first.seq);
-      double score = covChain.bestHitScore;
-
-      // make sure orphaned fragment is near the end of the transcript
-      // if (!nearEndOfTranscript(covChain, t, 1000)) { continue; }
-
-      if (score >= fOpt * bestScore and score >= cutoffLeft) {
-
-        if (score > bestScore) {
-          bestScore = score;
-        }
-        bool isForward = covChain.isForward();
-        int32_t hitPos = covChain.bestHitPos;
-        auto fmt = salmon::utils::hitType(hitPos, isForward);
-
-        if (leftHitCount == 0) {
-          firstTranscriptID = transcriptID;
-        } else if (hitList.isUniquelyMapped() and
-                   transcriptID != firstTranscriptID) {
-          hitList.isUniquelyMapped() = false;
-        }
-
-        if (static_cast<int32_t>(transcriptID) < lastTranscriptId) {
-          sortedByTranscript = false;
-        }
-
-        alnList.emplace_back(transcriptID, fmt, score, hitPos);
-        alnList.back().fwd = isForward;
-        alnList.back().mateStatus = rapmap::utils::MateStatus::PAIRED_END_LEFT;
-        readHits += score;
-        ++hitListCount;
-        ++leftHitCount;
-      }
-    }
-
-    // search for a hit on the right
-    for (auto& tHitList : rightHits) {
-      // Prior
-      // auto transcriptID = tHitList.first;
-      auto transcriptID = tHitList.targetID;
-      auto& covChain = tHitList;
-      Transcript& t = transcripts[transcriptID];
-      if (!t.hasAnchorFragment()) {
-        continue;
-      }
-
-      covChain.computeBestChain(t, frag.second.seq);
-      double score = covChain.bestHitScore;
-
-      // make sure orphaned fragment is near the end of the transcript
-      // if (!nearEndOfTranscript(covChain, t, 1000)) { continue; }
-
-      if (score >= fOpt * bestScore and score >= cutoffRight) {
-
-        if (score > bestScore) {
-          bestScore = score;
-        }
-        bool isForward = covChain.isForward();
-        int32_t hitPos = covChain.bestHitPos;
-        auto fmt = salmon::utils::hitType(hitPos, isForward);
-        if (leftHitCount == 0) {
-          firstTranscriptID = transcriptID;
-        } else if (hitList.isUniquelyMapped() and
-                   transcriptID != firstTranscriptID) {
-          hitList.isUniquelyMapped() = false;
-        }
-
-        alnList.emplace_back(transcriptID, fmt, score, hitPos);
-        alnList.back().fwd = isForward;
-        alnList.back().mateStatus = rapmap::utils::MateStatus::PAIRED_END_RIGHT;
-        readHits += score;
-        ++hitListCount;
-        ++leftHitCount;
-      }
-    }
-
-    if (alnList.size() > 0) {
-      auto newEnd =
-          std::stable_partition(alnList.begin(), alnList.end(),
-                                [bestScore, fOpt](SMEMAlignment& aln) -> bool {
-                                  return aln.score() >= fOpt * bestScore;
-                                });
-      alnList.resize(std::distance(alnList.begin(), newEnd));
-      if (!sortedByTranscript) {
-        std::sort(alnList.begin(), alnList.end(),
-                  [](const SMEMAlignment& x, const SMEMAlignment& y) -> bool {
-                    return x.transcriptID() < y.transcriptID();
-                  });
-      }
-    } else {
-      return;
-      /*
-      // If we didn't have any *significant* hits --- add any *trivial* orphan
-      // hits
-      size_t totalHits = leftHits.size() + rightHits.size();
-      std::vector<TranscriptID> txpIDs;
-      txpIDs.reserve(totalHits);
-      std::vector<double> auxProbs;
-      auxProbs.reserve(totalHits);
-
-      size_t txpIDsHash{0};
-      std::vector<CoverageCalculator> allHits;
-      allHits.reserve(totalHits);
-      std::merge(leftHits.begin(), leftHits.end(),
-                 rightHits.begin(), rightHits.end(),
-                 std::back_inserter(allHits),
-                 [](CoverageCalculator& c1, CoverageCalculator& c2) -> bool {
-                   return c1.targetID < c2.targetID;
-                 });
-      double totProb{0.0};
-      for (auto& h : allHits) {
-        boost::hash_combine(txpIDsHash, h.targetID);
-        txpIDs.push_back(h.targetID);
-        double refLen = std::max(1.0,
-            static_cast<double>(transcripts[h.targetID].RefLength));
-        double startProb = 1.0 / refLen;
-        auxProbs.push_back(startProb);
-        totProb += startProb;
-      }
-      if (totProb > 0.0) {
-        double norm = 1.0 / totProb;
-        for (auto& p : auxProbs) { p *= norm; }
-
-        TranscriptGroup tg(txpIDs, txpIDsHash);
-        eqBuilder.addGroup(std::move(tg), auxProbs);
-      } else {
-        salmonOpts.jointLog->warn("Unexpected empty hit group [orphaned]");
-      }
-      */
-    }
-  } else { // Not an orphan
-    for (auto jhp : jointHits) {
-      auto& jointHitPtr = jhp;
-      auto transcriptID = jhp.transcriptID;
-      Transcript& t = transcripts[transcriptID];
-      auto& leftHitList = leftHits[jhp.leftIndex];
-      leftHitList.computeBestChain(t, frag.first.seq);
-      if (leftHitList.bestHitScore >= cutoffLeft) {
-        auto& rightHitList = rightHits[jhp.rightIndex];
-
-        rightHitList.computeBestChain(t, frag.second.seq);
-        if (rightHitList.bestHitScore < cutoffRight) {
-          continue;
-        }
-
-        auto end1Start = leftHitList.bestHitPos;
-        auto end2Start = rightHitList.bestHitPos;
-
-        double score =
-            (leftHitList.bestHitScore + rightHitList.bestHitScore) * 0.5;
-        if (score < fOpt * bestScore) {
-          continue;
-        }
-
-        if (score > bestScore) {
-          bestScore = score;
-        }
-
-        uint32_t fragLength = std::abs(static_cast<int32_t>(end1Start) -
-                                       static_cast<int32_t>(end2Start)) +
-                              rightReadLength;
-
-        bool end1IsForward = leftHitList.isForward();
-        bool end2IsForward = rightHitList.isForward();
-
-        uint32_t end1Pos = (end1IsForward)
-                               ? leftHitList.bestHitPos
-                               : leftHitList.bestHitPos + leftReadLength;
-        uint32_t end2Pos = (end2IsForward)
-                               ? rightHitList.bestHitPos
-                               : rightHitList.bestHitPos + rightReadLength;
-        bool canDovetail = false;
-        auto fmt = salmon::utils::hitType(
-            end1Pos, end1IsForward, leftReadLength, end2Pos, end2IsForward,
-            rightReadLength, canDovetail);
-
-        if (readHits == 0) {
-          firstTranscriptID = transcriptID;
-        } else if (hitList.isUniquelyMapped() and
-                   transcriptID != firstTranscriptID) {
-          hitList.isUniquelyMapped() = false;
-        }
-
-        int32_t minHitPos = std::min(end1Pos, end2Pos);
-        if (static_cast<int32_t>(transcriptID) < lastTranscriptId) {
-          sortedByTranscript = false;
-        }
-        // ANCHOR TEST
-        t.setAnchorFragment();
-        alnList.emplace_back(transcriptID, fmt, score, minHitPos, fragLength);
-        alnList.back().fwd = end1IsForward;
-        alnList.back().mateIsFwd = end2IsForward;
-        alnList.back().mateStatus =
-            rapmap::utils::MateStatus::PAIRED_END_PAIRED;
-        ++readHits;
-        ++hitListCount;
-      }
-    } // end for jointHits
-    if (alnList.size() > 0) {
-      auto newEnd =
-          std::stable_partition(alnList.begin(), alnList.end(),
-                                [bestScore, fOpt](SMEMAlignment& aln) -> bool {
-                                  return aln.score() >= fOpt * bestScore;
-                                });
-      alnList.resize(std::distance(alnList.begin(), newEnd));
-      if (!sortedByTranscript) {
-        std::sort(alnList.begin(), alnList.end(),
-                  [](const SMEMAlignment& x, const SMEMAlignment& y) -> bool {
-                    return x.transcriptID() < y.transcriptID();
-                  });
-      }
-    } else {
-      // If we didn't have any *significant* hits --- add any *trivial* joint
-      // hits
-      return;
-      /*
-      std::vector<TranscriptID> txpIDs;
-      txpIDs.reserve(jointHits.size());
-      std::vector<double> auxProbs;
-      auxProbs.reserve(jointHits.size());
-
-      size_t txpIDsHash{0};
-      double totProb{0.0};
-      for (auto& h : jointHits) {
-        boost::hash_combine(txpIDsHash, h.transcriptID);
-        txpIDs.push_back(h.transcriptID);
-        double refLen = std::max(1.0,
-            static_cast<double>(transcripts[h.transcriptID].RefLength));
-        double startProb = 1.0 / refLen;
-        auxProbs.push_back(startProb);
-        totProb += startProb;
-      }
-      if (totProb > 0.0) {
-        double norm = 1.0 / totProb;
-        for (auto& p : auxProbs) { p *= norm; }
-
-        TranscriptGroup tg(txpIDs, txpIDsHash);
-        eqBuilder.addGroup(std::move(tg), auxProbs);
-      } else {
-        salmonOpts.jointLog->warn("Unexpected empty hit group [paired]");
-      }
-      */
-    }
-
-  } // end else
-}
-
-/**
- * Get hits for single-end fragment
- *
- *
- */
-template <typename CoverageCalculator>
-inline void getHitsForFragment(fastx_parser::ReadSeq& frag,
-                               // jellyfish::header_sequence_qual& frag,
-                               SalmonIndex* sidx, smem_i* itr,
-                               const bwtintv_v* a, smem_aux_t* auxHits,
-                               mem_opt_t* memOptions, BulkReadExpT& readExp,
-                               const SalmonOpts& salmonOpts,
-                               double coverageThresh, uint64_t& upperBoundHits,
-                               AlignmentGroup<SMEMAlignment>& hitList,
-                               uint64_t& hitListCount,
-                               std::vector<Transcript>& transcripts) {
-
-  uint64_t leftHitCount{0};
-
-  // std::unordered_map<uint64_t, CoverageCalculator> hits;
-  std::vector<CoverageCalculator> hits;
-
-  auto& eqBuilder = readExp.equivalenceClassBuilder();
-
-  //uint32_t readLength{0};
-
-  //---------- get hits ----------------------//
-  {
-    std::string readStr = frag.seq;
-    uint32_t readLen = frag.seq.size();
-
-    //readLength = readLen;
-
-    for (int p = 0; p < static_cast<int>(readLen); ++p) {
-      readStr[p] = nst_nt4_table[static_cast<int>(readStr[p])];
-    }
-
-    char* readPtr = const_cast<char*>(readStr.c_str());
-
-    collectHitsForRead(sidx, a, auxHits, memOptions, salmonOpts,
-                       reinterpret_cast<const uint8_t*>(readStr.c_str()),
-                       readLen, hits);
-  }
-
-  upperBoundHits += (hits.size() > 0) ? 1 : 0;
-
-  int32_t lastTranscriptId = std::numeric_limits<int32_t>::min();
-  bool sortedByTranscript{true};
-  double fOpt{0.95};
-  double bestScore = -std::numeric_limits<double>::max();
-
-  size_t readHits{0};
-  auto& alnList = hitList.alignments();
-  hitList.isUniquelyMapped() = true;
-  alnList.clear();
-
-  uint32_t firstTranscriptID = std::numeric_limits<uint32_t>::max();
-  double cutoff{coverageThresh}; //* readLength};
-  for (auto& tHitList : hits) {
-    // Prior
-    // auto hitID = tHitList.first;
-    // auto& covVec = tHitList.second;
-    auto hitID = tHitList.targetID;
-    auto& covVec = tHitList;
-
-    // Coverage score
-    Transcript& t = transcripts[hitID];
-    covVec.computeBestChain(t, frag.seq);
-    double score = covVec.bestHitScore;
-    if (score >= fOpt * bestScore and covVec.bestHitScore >= cutoff) {
-
-      bool isForward = covVec.isForward();
-      if (score < fOpt * bestScore) {
-        continue;
-      }
-
-      if (score > bestScore) {
-        bestScore = score;
-      }
-
-      auto hitPos = covVec.bestHitPos;
-      auto fmt = salmon::utils::hitType(hitPos, isForward);
-
-      if (leftHitCount == 0) {
-        firstTranscriptID = hitID;
-      } else if (hitList.isUniquelyMapped() and hitID != firstTranscriptID) {
-        hitList.isUniquelyMapped() = false;
-      }
-
-      auto transcriptID = hitID;
-
-      if (static_cast<int32_t>(transcriptID) < lastTranscriptId) {
-        sortedByTranscript = false;
-      }
-
-      alnList.emplace_back(transcriptID, fmt, score, hitPos);
-      alnList.back().fwd = isForward;
-      alnList.back().mateStatus = rapmap::utils::MateStatus::SINGLE_END;
-      readHits += score;
-      ++hitListCount;
-      ++leftHitCount;
-    }
-  }
-  if (alnList.size() > 0) {
-    auto newEnd =
-        std::stable_partition(alnList.begin(), alnList.end(),
-                              [bestScore, fOpt](SMEMAlignment& aln) -> bool {
-                                return aln.score() >= fOpt * bestScore;
-                              });
-    alnList.resize(std::distance(alnList.begin(), newEnd));
-    if (!sortedByTranscript) {
-      std::sort(alnList.begin(), alnList.end(),
-                [](const SMEMAlignment& x, const SMEMAlignment& y) -> bool {
-                  return x.transcriptID() < y.transcriptID();
-                });
-    }
-  } else {
-    // If we didn't have any *significant* hits --- add any *trivial* joint hits
-    return;
-    /*
-    std::vector<TranscriptID> txpIDs;
-    txpIDs.reserve(hits.size());
-    double uniProb = 1.0 / hits.size();
-    std::vector<double> auxProbs(hits.size(), uniProb);
-
-    size_t txpIDsHash{0};
-    for (auto& h : hits) {
-      boost::hash_combine(txpIDsHash, h.targetID);
-      txpIDs.push_back(h.targetID);
-    }
-
-    TranscriptGroup tg(txpIDs, txpIDsHash);
-    eqBuilder.addGroup(std::move(tg), auxProbs);
-    */
-  }
-}
-
-// To use the parser in the following, we get "jobs" until none is
-// available. A job behaves like a pointer to the type
-// jellyfish::sequence_list (see whole_sequence_parser.hpp).
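[Editor's note] The joint-hit computation in the removed paired-end code above is a classic two-pointer intersection over hit lists sorted by target, in the spirit of std::set_intersection but recording the index of the match on both sides. A minimal self-contained sketch of that pattern; Hit and JointHit are hypothetical stand-ins for the CoverageCalculator and joint-hit types in the deleted code:

#include <cstdint>
#include <iterator>
#include <vector>

struct Hit { uint64_t targetID; };
struct JointHit { uint64_t targetID; uint32_t leftIndex; uint32_t rightIndex; };

// Both inputs must be sorted by targetID; runs in O(|left| + |right|).
std::vector<JointHit> intersectHits(const std::vector<Hit>& left,
                                    const std::vector<Hit>& right) {
  std::vector<JointHit> joint;
  auto l = left.begin();
  auto r = right.begin();
  while (l != left.end() && r != right.end()) {
    if (l->targetID < r->targetID) {
      ++l;
    } else {
      if (!(r->targetID < l->targetID)) { // equal targets: a joint hit
        joint.push_back({l->targetID,
                         static_cast<uint32_t>(std::distance(left.begin(), l)),
                         static_cast<uint32_t>(std::distance(right.begin(), r))});
        ++l;
      }
      ++r;
    }
  }
  return joint;
}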
-template -void processReadsMEM( - ParserT* parser, BulkReadExpT& readExp, ReadLibrary& rl, - AlnGroupVec& structureVec, - std::atomic& numObservedFragments, - std::atomic& numAssignedFragments, - std::atomic& validHits, std::atomic& upperBoundHits, - SalmonIndex* sidx, std::vector& transcripts, - ForgettingMassCalculator& fmCalc, ClusterForest& clusterForest, - FragmentLengthDistribution& fragLengthDist, BiasParams& observedGCParams, - /** - * NOTE : test new el model in future - * EffectiveLengthStats& obsEffLength, - **/ - mem_opt_t* memOptions, const SalmonOpts& salmonOpts, double coverageThresh, - std::mutex& iomutex, bool initialRound, std::atomic& burnedIn, - volatile bool& writeToCache) { - // ERROR - salmonOpts.jointLog->error("Quasimapping cannot be used with the FMD index " - "--- please report this bug on GitHub"); - std::exit(1); -} - -template -void processReadsMEM( - ParserT* parser, BulkReadExpT& readExp, ReadLibrary& rl, - AlnGroupVec& structureVec, - std::atomic& numObservedFragments, - std::atomic& numAssignedFragments, - std::atomic& validHits, std::atomic& upperBoundHits, - SalmonIndex* sidx, std::vector& transcripts, - ForgettingMassCalculator& fmCalc, ClusterForest& clusterForest, - FragmentLengthDistribution& fragLengthDist, BiasParams& observedGCParams, - /** - * NOTE : test new el model in future - * EffectiveLengthStats& obsEffLengths, - **/ - mem_opt_t* memOptions, const SalmonOpts& salmonOpts, double coverageThresh, - std::mutex& iomutex, bool initialRound, std::atomic& burnedIn, - volatile bool& writeToCache) { - uint64_t count_fwd = 0, count_bwd = 0; - // Seed with a real random value, if available - std::random_device rd; - - // Create a random uniform distribution - std::default_random_engine eng(rd()); - - uint64_t prevObservedFrags{1}; - uint64_t leftHitCount{0}; - uint64_t hitListCount{0}; - - // Super-MEM iterator - smem_i* itr = smem_itr_init(sidx->bwaIndex()->bwt); - const bwtintv_v* a = nullptr; - smem_aux_t* auxHits = smem_aux_init(); - - //auto expectedLibType = rl.format(); - - uint64_t firstTimestepOfRound = fmCalc.getCurrentTimestep(); - - size_t locRead{0}; - uint64_t localUpperBoundHits{0}; - size_t rangeSize{0}; - double maxZeroFrac{0.0}; - auto rg = parser->getReadGroup(); - while (parser->refill(rg)) { - rangeSize = rg.size(); - if (rangeSize > structureVec.size()) { - salmonOpts.jointLog->error("rangeSize = {}, but structureVec.size() = {} " - "--- this shouldn't happen.\n" - "Please report this bug on GitHub", - rangeSize, structureVec.size()); - std::exit(1); - } - - for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch - localUpperBoundHits = 0; - - auto& hitList = structureVec[i]; - getHitsForFragment( - rg[i], - // j->data[i], - sidx, itr, a, auxHits, memOptions, readExp, salmonOpts, - coverageThresh, localUpperBoundHits, hitList, hitListCount, - transcripts); - if (initialRound) { - upperBoundHits += localUpperBoundHits; - } - - // If the read mapped to > maxReadOccs places, discard it - if (hitList.size() > salmonOpts.maxReadOccs) { - hitList.alignments().clear(); - } - validHits += hitList.size(); - locRead++; - ++numObservedFragments; - if (numObservedFragments % 50000 == 0) { - iomutex.lock(); - const char RESET_COLOR[] = "\x1b[0m"; - char green[] = "\x1b[30m"; - green[3] = '0' + static_cast(fmt::GREEN); - char red[] = "\x1b[30m"; - red[3] = '0' + static_cast(fmt::RED); - if (initialRound) { - fmt::print(stderr, "\033[A\r\r{}processed{} {} {}fragments{}\n", - green, red, numObservedFragments, green, 
RESET_COLOR); - fmt::print(stderr, "hits: {}; hits per frag: {}", validHits, - validHits / static_cast(prevObservedFrags)); - } else { - fmt::print(stderr, "\r\r{}processed{} {} {}fragments{}", green, red, - numObservedFragments, green, RESET_COLOR); - } - iomutex.unlock(); - } - - } // end for i < j->nb_filled - - prevObservedFrags = numObservedFragments; - AlnGroupVecRange hitLists = {structureVec.begin(), structureVec.begin()+rangeSize}; - /*boost::make_iterator_range( - structureVec.begin(), structureVec.begin() + rangeSize);*/ - processMiniBatch( - readExp, fmCalc, firstTimestepOfRound, rl, salmonOpts, hitLists, - transcripts, clusterForest, fragLengthDist, observedGCParams, - /** - * NOTE : test new el model in future - * obsEffLengths, - **/ - numAssignedFragments, eng, initialRound, burnedIn, maxZeroFrac); - } - - if (maxZeroFrac > 0.0) { - salmonOpts.jointLog->info("Thread saw mini-batch with a maximum of " - "{0:.2f}\% zero probability fragments", - maxZeroFrac); - } - - smem_aux_destroy(auxHits); - smem_itr_destroy(itr); -} - -#endif // LIGHTWEIGHT_ALIGNMENT_DEFS_HPP diff --git a/include/ReadExperiment.hpp b/include/ReadExperiment.hpp index 06f234210..c84f923ec 100644 --- a/include/ReadExperiment.hpp +++ b/include/ReadExperiment.hpp @@ -743,6 +743,10 @@ class ReadExperiment { return numDecoys_; } + salmon::utils::DuplicateTargetStatus index_retains_duplicates() const { + return salmonIndex_->index_retains_duplicates(); + } + private: void setTranscriptLengthClasses_(std::vector& lengths, size_t nbins) { diff --git a/include/SalmonDefaults.hpp b/include/SalmonDefaults.hpp index 3fbb9d491..9c67d8840 100644 --- a/include/SalmonDefaults.hpp +++ b/include/SalmonDefaults.hpp @@ -1,6 +1,8 @@ #ifndef SALMON_DEFAULTS_HPP #define SALMON_DEFAULTS_HPP +#include + namespace salmon { namespace defaults { // general @@ -28,10 +30,12 @@ namespace defaults { constexpr const int16_t gapOpenPenalty{4}; constexpr const int16_t gapExtendPenalty{2}; constexpr const int32_t dpBandwidth{15}; + constexpr const bool disableChainingHeuristic{false}; constexpr const bool eqClassMode{false}; constexpr const bool hardFilter{false}; constexpr const bool mimicStrictBT2{false}; constexpr const bool mimicBT2{false}; + constexpr const bool softclip{false}; constexpr const bool softclipOverhangs{false}; constexpr const bool fullLengthAlignment{false}; constexpr const bool allowDovetail{false}; @@ -95,13 +99,14 @@ namespace defaults { constexpr const size_t numFragGCBins{25}; constexpr const size_t numConditionalGCBins{3}; constexpr const size_t numRequiredFrags{50000000}; // deprecated - constexpr const uint32_t maxHashResizeThreads{std::numeric_limits::max()}; + const uint32_t maxHashResizeThreads{std::thread::hardware_concurrency()}; // experimental / testing constexpr const bool noRichEqClasses{false}; constexpr const bool noFragLengthFactor{false}; constexpr const bool rankEqClasses{false}; constexpr const bool dontExtrapolateCounts{false}; + constexpr const bool disableAlignmentCache{false}; // alignment-based mode //constexpr const bool useErrorModel{true}; @@ -126,6 +131,7 @@ namespace defaults { constexpr const bool isChromiumV3{false}; constexpr const bool isInDrop{false}; constexpr const bool isGemcode{false}; + constexpr const bool isCITESeq{false}; constexpr const bool isCELSeq{false}; constexpr const bool isCELSeq2{false}; constexpr const bool isQuartzSeq2{false}; diff --git a/include/SalmonIndex.hpp b/include/SalmonIndex.hpp index 49b5a3d8e..8bf38368c 100644 --- a/include/SalmonIndex.hpp +++ 
b/include/SalmonIndex.hpp @@ -9,11 +9,13 @@ #include "cereal/archives/json.hpp" #include "cereal/types/vector.hpp" #include "spdlog/spdlog.h" +#include "json.hpp" #include "pufferfish/ProgOpts.hpp" #include "pufferfish/PufferfishIndex.hpp" #include "pufferfish/PufferfishSparseIndex.hpp" +#include "SalmonUtils.hpp" #include "SalmonConfig.hpp" #include "SalmonIndexVersionInfo.hpp" @@ -24,7 +26,7 @@ class SalmonIndex { public: SalmonIndex(std::shared_ptr& logger, SalmonIndexType indexType) - : loaded_(false), versionInfo_(0, false, 0, indexType), logger_(logger), + : loaded_(false), versionInfo_(0, false, 0, indexType, ""), logger_(logger), seqHash256_(""), nameHash256_(""), seqHash512_(""), nameHash512_(""), decoySeqHash256_(""), decoyNameHash256_("") {} @@ -98,6 +100,10 @@ class SalmonIndex { std::string decoySeqHash256() const { return decoySeqHash256_; } std::string decoyNameHash256() const { return decoyNameHash256_; } + salmon::utils::DuplicateTargetStatus index_retains_duplicates() const { + return keep_duplicates_; + } + private: bool buildPuffIndex_(boost::filesystem::path indexDir, pufferfish::IndexOptions& idxOpt) { namespace bfs = boost::filesystem; @@ -108,6 +114,7 @@ class SalmonIndex { versionInfo_.hasAuxKmerIndex(false); versionInfo_.auxKmerLength(idxOpt.k); versionInfo_.indexType(SalmonIndexType::PUFF); + versionInfo_.salmonVersion(salmon::version); versionInfo_.save(versionFile); return ret; } @@ -126,19 +133,46 @@ class SalmonIndex { std::string sampling_type_; int version_; std::ifstream indexStream(indexStr + "info.json"); - { - cereal::JSONInputArchive ar(indexStream); - ar(cereal::make_nvp("sampling_type", sampling_type_), - cereal::make_nvp("SeqHash", seqHash256_), - cereal::make_nvp("NameHash", nameHash256_), - cereal::make_nvp("SeqHash512", seqHash512_), - cereal::make_nvp("NameHash512", nameHash512_), - cereal::make_nvp("DecoySeqHash", decoySeqHash256_), - cereal::make_nvp("DecoyNameHash", decoyNameHash256_), - cereal::make_nvp("index_version", version_) - ); + if (indexStream.is_open()) { + nlohmann::json info_arch; + indexStream >> info_arch; + + sampling_type_ = info_arch["sampling_type"]; + seqHash256_ = info_arch["SeqHash"]; + nameHash256_ = info_arch["NameHash"]; + seqHash512_ = info_arch["SeqHash512"]; + nameHash512_ = info_arch["NameHash512"]; + decoySeqHash256_ = info_arch["DecoySeqHash"]; + decoyNameHash256_ = info_arch["DecoyNameHash"]; + version_ = info_arch["index_version"]; + (void) version_; + if (info_arch.find("keep_duplicates") != info_arch.end()) { + bool kd = info_arch["keep_duplicates"]; + keep_duplicates_ = kd ? salmon::utils::DuplicateTargetStatus::RETAINED_DUPLICATES : + salmon::utils::DuplicateTargetStatus::REMOVED_DUPLICATES; + } else { + logger_->warn("The index did not record if the `--keepDuplicates` flag was used. 
" + "Please consider re-indexing with a newer version of salmon that will propagate this information."); + } + } else { + logger_->critical( + "Could not properly open the info.json file from the index : {}.", + indexStr + "info.json"); } indexStream.close(); + /* + cereal::JSONInputArchive ar(indexStream); + ar(cereal::make_nvp("sampling_type", sampling_type_), + cereal::make_nvp("SeqHash", seqHash256_), + cereal::make_nvp("NameHash", nameHash256_), + cereal::make_nvp("SeqHash512", seqHash512_), + cereal::make_nvp("NameHash512", nameHash512_), + cereal::make_nvp("DecoySeqHash", decoySeqHash256_), + cereal::make_nvp("DecoyNameHash", decoyNameHash256_), + cereal::make_nvp("index_version", version_)); + } + indexStream.close(); + */ /* if (h.version() != salmon::requiredQuasiIndexVersion) { @@ -198,6 +232,7 @@ class SalmonIndex { bool largeQuasi_{false}; bool perfectHashQuasi_{false}; + salmon::utils::DuplicateTargetStatus keep_duplicates_{salmon::utils::DuplicateTargetStatus::UNKNOWN}; bool sparse_{false}; std::unique_ptr pfi_{nullptr}; std::unique_ptr pfi_sparse_{nullptr}; diff --git a/include/SalmonIndexVersionInfo.hpp b/include/SalmonIndexVersionInfo.hpp index 35cc7b0c9..fdfa9bdae 100644 --- a/include/SalmonIndexVersionInfo.hpp +++ b/include/SalmonIndexVersionInfo.hpp @@ -1,11 +1,13 @@ #ifndef __SALMON_INDEX_VERSION_INFO_HPP__ #define __SALMON_INDEX_VERSION_INFO_HPP__ +#include "SalmonConfig.hpp" #include "boost/filesystem.hpp" #include "cereal/archives/json.hpp" #include "spdlog/fmt/fmt.h" +#include "json.hpp" -enum class SalmonIndexType : uint8_t { FMD, QUASI, PUFF }; +enum class SalmonIndexType : uint8_t { FMD=0, QUASI=1, PUFF=2 }; class SalmonIndexVersionInfo { public: @@ -17,9 +19,9 @@ class SalmonIndexVersionInfo { indexType_(SalmonIndexType::PUFF) {} SalmonIndexVersionInfo(uint32_t indexVersionIn, bool hasAuxKmerIndexIn, - uint32_t auxKmerLengthIn, SalmonIndexType indexTypeIn) + uint32_t auxKmerLengthIn, SalmonIndexType indexTypeIn, std::string sverIn) : indexVersion_(indexVersionIn), hasAuxKmerIndex_(hasAuxKmerIndexIn), - auxKmerLength_(auxKmerLengthIn), indexType_(indexTypeIn) {} + auxKmerLength_(auxKmerLengthIn), indexType_(indexTypeIn), salmonVersion_(sverIn) {} /** * Read the index version info from file @@ -35,11 +37,53 @@ class SalmonIndexVersionInfo { } std::ifstream ifs(versionFile.string()); { + // NOTE: Let's keep this around for one version (until 1.3.0) + // just for reference. + // previous cereal-based implementation to load version information metadata + /* cereal::JSONInputArchive iarchive(ifs); // Create an input archive iarchive(cereal::make_nvp("indexVersion", indexVersion_), cereal::make_nvp("hasAuxIndex", hasAuxKmerIndex_), cereal::make_nvp("auxKmerLength", auxKmerLength_), - cereal::make_nvp("indexType", indexType_)); + cereal::make_nvp("indexType", indexType_), + cereal::make_nvp("salmonVersion", salmonVersion_)); + */ + + nlohmann::json j; + ifs >> j; + + // Right now, we will make the reading of this optional + // as we don't want to break the backward compatibility + // of salmon 1.2.0 to _read_ indices made with earlier + // versions. + if (j.find("salmonVersion") != j.end()) { + salmonVersion_ = j["salmonVersion"]; + } else { + salmonVersion_ = "0.0.0"; + } + + indexVersion_ = j["indexVersion"].get(); + hasAuxKmerIndex_ = j["hasAuxIndex"].get(); + auxKmerLength_ = j["auxKmerLength"].get(); + // This is one (small) place where cereal handles things + // more nicely than nlohmann::json. 
+ switch(j["indexType"].get()) { + case 0: + indexType_ = SalmonIndexType::FMD; + break; + case 1: + indexType_ = SalmonIndexType::QUASI; + break; + case 2: + indexType_ = SalmonIndexType::PUFF; + break; + default: { + fmt::MemoryWriter infostr; + infostr << "Unknown index type tag : " << j["indexType"].get() << "."; + throw std::invalid_argument(infostr.str()); + } + } + } ifs.close(); return true; @@ -52,7 +96,8 @@ class SalmonIndexVersionInfo { oarchive(cereal::make_nvp("indexVersion", indexVersion_), cereal::make_nvp("hasAuxIndex", hasAuxKmerIndex_), cereal::make_nvp("auxKmerLength", auxKmerLength_), - cereal::make_nvp("indexType", indexType_)); + cereal::make_nvp("indexType", indexType_), + cereal::make_nvp("salmonVersion", salmonVersion_)); } ofs.close(); return true; @@ -70,11 +115,15 @@ class SalmonIndexVersionInfo { SalmonIndexType indexType() { return indexType_; } void indexType(SalmonIndexType indexTypeIn) { indexType_ = indexTypeIn; }; + std::string salmonVersion() const { return salmonVersion_; } + void salmonVersion(const std::string& sv) { salmonVersion_ = sv; } + private: uint32_t indexVersion_; bool hasAuxKmerIndex_; uint32_t auxKmerLength_; SalmonIndexType indexType_; + std::string salmonVersion_; }; #endif // __SALMON_INDEX_VERSION_INFO_HPP__ diff --git a/include/SalmonMappingUtils.hpp b/include/SalmonMappingUtils.hpp index c605fd4dd..34e0f54a7 100644 --- a/include/SalmonMappingUtils.hpp +++ b/include/SalmonMappingUtils.hpp @@ -107,6 +107,15 @@ inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector& mem aconf.mimicBT2 = salmonOpts.mimicBT2; aconf.mimicBT2Strict = salmonOpts.mimicStrictBT2; aconf.allowOverhangSoftclip = salmonOpts.softclipOverhangs; + aconf.allowSoftclip = salmonOpts.softclip; + aconf.useAlignmentCache = !salmonOpts.disableAlignmentCache; + aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::SCORE_ONLY; + + // we actually care about the softclips in the cigar string + // if we are writing output and softclipping (or softclipping of overhangs) is enabled + if ( (!salmonOpts.qmFileName.empty()) and (salmonOpts.softclip or salmonOpts.softclipOverhangs) ) { + aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR; + } mpol.noOrphans = !salmonOpts.allowOrphans; // TODO : PF_INTEGRATION @@ -117,11 +126,13 @@ inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector& mem // a dovetail read is considered concordant or discordant mpol.noDiscordant = true; mpol.noDovetail = !salmonOpts.allowDovetail; + aconf.noDovetail = mpol.noDovetail; + return true; } -inline void updateRefMappings(uint32_t tid, int32_t hitScore, size_t idx, +inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, size_t idx, const std::vector& transcripts, int32_t invalidScore, salmon::mapping_utils::MappingScoreInfo& msi, @@ -159,7 +170,7 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, size_t idx, // this is the current best bestScorePerTranscript[tid].first = hitScore; bestScorePerTranscript[tid].second = idx; - } else if (hitScore > it->second.first) { + } else if ((hitScore > it->second.first) or (hitScore == it->second.first and isCompat)) { // otherwise, if we had an alignment for this transcript and it's // better than the current best, then set the best score to this // alignment's score, and invalidate the previous alignment diff --git a/include/SalmonOpts.hpp b/include/SalmonOpts.hpp index 6c56e60b2..6a75338bd 100644 --- a/include/SalmonOpts.hpp +++ b/include/SalmonOpts.hpp @@ -260,6 +260,8 @@ 
struct SalmonOpts { bool validateMappings; bool disableSA; float consensusSlack; + bool disableChainingHeuristic; + bool disableAlignmentCache; double minScoreFraction; double scoreExp; int16_t matchScore; @@ -269,6 +271,7 @@ struct SalmonOpts { int32_t dpBandwidth; bool mimicStrictBT2; bool mimicBT2; + bool softclip; bool softclipOverhangs; bool fullLengthAlignment; bool allowDovetail; diff --git a/include/SalmonUtils.hpp b/include/SalmonUtils.hpp index 23a4bee46..6ed0c54cd 100644 --- a/include/SalmonUtils.hpp +++ b/include/SalmonUtils.hpp @@ -28,7 +28,6 @@ extern "C" { #include "GenomicFeature.hpp" #include "LibraryFormat.hpp" -//#include "RapMapUtils.hpp" #include "pufferfish/Util.hpp" #include "ReadLibrary.hpp" #include "SalmonConfig.hpp" @@ -55,7 +54,14 @@ enum class MappingType : uint8_t { RIGHT_ORPHAN = 2, BOTH_ORPHAN = 3, PAIRED_MAPPED = 4, - SINGLE_MAPPED = 5 + SINGLE_MAPPED = 5, + DECOY = 6 +}; + +enum class DuplicateTargetStatus : uint8_t { + UNKNOWN = 0, + RETAINED_DUPLICATES = 1, + REMOVED_DUPLICATES = 2 }; std::string str(const MappingType& mt); @@ -312,6 +318,8 @@ LibraryFormat hitType(int32_t end1Start, bool end1Fwd, uint32_t len1, */ LibraryFormat hitType(int32_t readStart, bool isForward); +double compute_1_edit_threshold(int32_t l, const SalmonOpts& sopt); + /** * Cache the mappings provided in an efficient binary format * to the provided file handle. diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index ebdbe4687..53cba31ce 100644 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -45,6 +45,17 @@ namespace alevin{ } }; + struct CITESeq : Rule{ + CITESeq(): Rule(16, 12, BarcodeEnd::FIVE, 4294967295){ + featureLength = 15; + featureStart = 10; + } + + size_t featureLength, featureStart; + void setFeatureLength(size_t length) { featureLength = length; } + void setFeatureStart(size_t startIdx) { featureStart = startIdx; } + }; + struct ChromiumV3 : Rule{ ChromiumV3(): Rule(16, 12, BarcodeEnd::FIVE, 4294967295){} }; diff --git a/include/TranscriptAlignments.hpp b/include/TranscriptAlignments.hpp deleted file mode 100644 index a9f0e32fd..000000000 --- a/include/TranscriptAlignments.hpp +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __TRANSCRIPT_ALIGNMENTS_HPP__ -#define __TRANSCRIPT_ALIGNMENTS_HPP__ - -#include "SalmonMath.hpp" -#include - -class SMEMAlignment; - -class TranscriptAlignments { -public: - TranscriptAlignments() - : alignments(std::vector()), - totalProb(salmon::math::LOG_0) {} - - std::vector alignments; - double totalProb; - double logMassPrior; - double logMassPosterior; -}; - -#endif //__TRANSCRIPT_ALIGNMENTS_HPP__ diff --git a/include/WhiteList.hpp b/include/WhiteList.hpp index 84eec2445..67bf2b6a6 100644 --- a/include/WhiteList.hpp +++ b/include/WhiteList.hpp @@ -37,9 +37,8 @@ namespace alevin { template bool performWhitelisting(AlevinOpts& aopt, - std::vector& umiCount, std::vector& trueBarcodes, - CFreqMapT& freqCounter, bool useRibo, bool useMito, + bool useRibo, bool useMito, size_t numLowConfidentBarcode); } } diff --git a/include/libcuckoo_bucket_container.hh b/include/bucket_container.hh similarity index 80% rename from include/libcuckoo_bucket_container.hh rename to include/bucket_container.hh index 2b11f5851..b6726adc3 100644 --- a/include/libcuckoo_bucket_container.hh +++ b/include/bucket_container.hh @@ -1,5 +1,5 @@ -#ifndef LIBCUCKOO_BUCKET_CONTAINER_H -#define LIBCUCKOO_BUCKET_CONTAINER_H +#ifndef BUCKET_CONTAINER_H +#define BUCKET_CONTAINER_H #include #include @@ -12,8 +12,10 @@ #include 
"cuckoohash_util.hh" +namespace libcuckoo { + /** - * libcuckoo_bucket_container manages storage of key-value pairs for the table. + * bucket_container manages storage of key-value pairs for the table. * It stores the items inline in uninitialized memory, and keeps track of which * slots have live data and which do not. It also stores a partial hash for * each live key. It is sized by powers of two. @@ -26,7 +28,7 @@ */ template -class libcuckoo_bucket_container { +class bucket_container { public: using key_type = Key; using mapped_type = T; @@ -39,11 +41,11 @@ private: public: using allocator_type = typename traits_::allocator_type; using partial_t = Partial; - using size_type = std::size_t; + using size_type = typename traits_::size_type; using reference = value_type &; using const_reference = const value_type &; - using pointer = value_type *; - using const_pointer = const value_type *; + using pointer = typename traits_::pointer; + using const_pointer = typename traits_::const_pointer; /* * The bucket type holds SLOT_PER_BUCKET key-value pairs, along with their @@ -55,7 +57,7 @@ public: */ class bucket { public: - bucket() noexcept : occupied_{} {} + bucket() noexcept : occupied_() {} const value_type &kvpair(size_type ind) const { return *static_cast( @@ -84,7 +86,7 @@ public: bool &occupied(size_type ind) { return occupied_[ind]; } private: - friend class libcuckoo_bucket_container; + friend class bucket_container; using storage_value_type = std::pair; @@ -105,48 +107,48 @@ public: std::array occupied_; }; - libcuckoo_bucket_container(size_type hp, const allocator_type &allocator) + bucket_container(size_type hp, const allocator_type &allocator) : allocator_(allocator), bucket_allocator_(allocator), hashpower_(hp), buckets_(bucket_allocator_.allocate(size())) { // The bucket default constructor is nothrow, so we don't have to // worry about dealing with exceptions when constructing all the // elements. 
static_assert(std::is_nothrow_constructible::value, - "libcuckoo_bucket_container requires bucket to be nothrow " + "bucket_container requires bucket to be nothrow " "constructible"); for (size_type i = 0; i < size(); ++i) { traits_::construct(allocator_, &buckets_[i]); } } - ~libcuckoo_bucket_container() noexcept { destroy_buckets(); } + ~bucket_container() noexcept { destroy_buckets(); } - libcuckoo_bucket_container(const libcuckoo_bucket_container &bc) + bucket_container(const bucket_container &bc) : allocator_( traits_::select_on_container_copy_construction(bc.allocator_)), bucket_allocator_(allocator_), hashpower_(bc.hashpower()), buckets_(transfer(bc.hashpower(), bc, std::false_type())) {} - libcuckoo_bucket_container(const libcuckoo_bucket_container &bc, + bucket_container(const bucket_container &bc, const allocator_type &a) : allocator_(a), bucket_allocator_(allocator_), hashpower_(bc.hashpower()), buckets_(transfer(bc.hashpower(), bc, std::false_type())) {} - libcuckoo_bucket_container(libcuckoo_bucket_container &&bc) + bucket_container(bucket_container &&bc) : allocator_(std::move(bc.allocator_)), bucket_allocator_(allocator_), hashpower_(bc.hashpower()), buckets_(std::move(bc.buckets_)) { // De-activate the other buckets container bc.buckets_ = nullptr; } - libcuckoo_bucket_container(libcuckoo_bucket_container &&bc, + bucket_container(bucket_container &&bc, const allocator_type &a) : allocator_(a), bucket_allocator_(allocator_) { move_assign(bc, std::false_type()); } - libcuckoo_bucket_container &operator=(const libcuckoo_bucket_container &bc) { + bucket_container &operator=(const bucket_container &bc) { destroy_buckets(); copy_allocator(allocator_, bc.allocator_, typename traits_::propagate_on_container_copy_assignment()); @@ -156,16 +158,17 @@ public: return *this; } - libcuckoo_bucket_container &operator=(libcuckoo_bucket_container &&bc) { + bucket_container &operator=(bucket_container &&bc) { destroy_buckets(); move_assign(bc, typename traits_::propagate_on_container_move_assignment()); return *this; } - void swap(libcuckoo_bucket_container &bc) noexcept { + void swap(bucket_container &bc) noexcept { swap_allocator(allocator_, bc.allocator_, typename traits_::propagate_on_container_swap()); - bucket_allocator_ = allocator_; + swap_allocator(bucket_allocator_, bc.bucket_allocator_, + typename traits_::propagate_on_container_swap()); // Regardless of whether we actually swapped the allocators or not, it will // always be okay to do the remainder of the swap. This is because if the // allocators were swapped, then the subsequent operations are okay. If the @@ -216,12 +219,13 @@ public: traits_::destroy(allocator_, std::addressof(b.storage_kvpair(slot))); } - // Destroys all the live data in the buckets + // Destroys all the live data in the buckets. Does not deallocate the bucket + // memory. void clear() noexcept { static_assert( std::is_nothrow_destructible::value && std::is_nothrow_destructible::value, - "libcuckoo_bucket_container requires key and value to be nothrow " + "bucket_container requires key and value to be nothrow " "destructible"); for (size_type i = 0; i < size(); ++i) { bucket &b = buckets_[i]; @@ -233,7 +237,17 @@ public: } } + // Destroys and deallocates all data in the buckets. After this operation, + // the bucket container will have no allocated data. It is still valid to + // swap, move or copy assign to this container. 
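[Editor's note] The clear_and_deallocate entry point added below draws a two-level distinction: clear() only runs destructors and keeps the allocation, while clear_and_deallocate() also returns the storage, leaving a container that can still be swapped, moved, or assigned into. A simplified, hypothetical buffer showing the two levels:

#include <cstddef>
#include <memory>

template <class T>
class Buffer {
  std::allocator<T> alloc_;
  T* data_{nullptr};
  std::size_t capacity_{0}, live_{0};
public:
  void clear() noexcept {                 // destroy elements, keep storage
    for (std::size_t i = 0; i < live_; ++i) data_[i].~T();
    live_ = 0;
  }
  void clear_and_deallocate() noexcept {  // destroy elements and free storage
    clear();
    if (data_) alloc_.deallocate(data_, capacity_);
    data_ = nullptr;
    capacity_ = 0;
  }
};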
+ void clear_and_deallocate() noexcept { + destroy_buckets(); + } + private: + using bucket_traits_ = typename traits_::template rebind_traits; + using bucket_pointer = typename bucket_traits_::pointer; + // true here means the allocators from `src` are propagated on libcuckoo_copy template void copy_allocator(A &dst, const A &src, std::true_type) { @@ -251,7 +265,7 @@ private: template void swap_allocator(A &, A &, std::false_type) {} // true here means the bucket allocator should be propagated - void move_assign(libcuckoo_bucket_container &src, std::true_type) { + void move_assign(bucket_container &src, std::true_type) { allocator_ = std::move(src.allocator_); bucket_allocator_ = allocator_; hashpower(src.hashpower()); @@ -259,7 +273,7 @@ private: src.buckets_ = nullptr; } - void move_assign(libcuckoo_bucket_container &src, std::false_type) { + void move_assign(bucket_container &src, std::false_type) { hashpower(src.hashpower()); if (allocator_ == src.allocator_) { buckets_ = src.buckets_; @@ -277,7 +291,7 @@ private: // worry about dealing with exceptions when constructing all the // elements. static_assert(std::is_nothrow_destructible::value, - "libcuckoo_bucket_container requires bucket to be nothrow " + "bucket_container requires bucket to be nothrow " "destructible"); clear(); for (size_type i = 0; i < size(); ++i) { @@ -301,13 +315,13 @@ private: } template - bucket *transfer( + bucket_pointer transfer( size_type dst_hp, - typename std::conditional::type src, + typename std::conditional::type src, std::integral_constant move) { assert(dst_hp >= src.hashpower()); - libcuckoo_bucket_container dst(dst_hp, get_allocator()); + bucket_container dst(dst_hp, get_allocator()); // Move/copy all occupied slots of the source buckets for (size_t i = 0; i < src.size(); ++i) { for (size_t j = 0; j < SLOT_PER_BUCKET; ++j) { @@ -317,7 +331,7 @@ private: } } // Take away the pointer from `dst` and return it - bucket *dst_pointer = dst.buckets_; + bucket_pointer dst_pointer = dst.buckets_; dst.buckets_ = nullptr; return dst_pointer; } @@ -334,7 +348,7 @@ private: std::atomic hashpower_; // These buckets are protected by striped locks (external to the // BucketContainer), which must be obtained before accessing a bucket. - bucket *buckets_; + bucket_pointer buckets_; // If the key and value are Trivial, the bucket be serilizable. Since we // already disallow user-specialized instances of std::pair, we know that the @@ -342,11 +356,13 @@ private: // this should be okay. We could in theory just check if the type is // TriviallyCopyable but this check is not available on some compilers we // want to support. 
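[Editor's note] The friend operator<< adapted in the next hunk relies on a raw-bytes trick restricted by SFINAE: only a trivial key/value payload may be written with os.write directly. The sketch below shows the same guard on a plain vector; it uses std::is_trivially_copyable, a close modern relative of the is_pod/is_trivial checks in the original:

#include <cstddef>
#include <iostream>
#include <type_traits>
#include <vector>

template <class T>
typename std::enable_if<std::is_trivially_copyable<T>::value,
                        std::ostream&>::type
writeRaw(std::ostream& os, const std::vector<T>& v) {
  std::size_t n = v.size();
  // Safe only because T has no pointers, vtables, or nontrivial state
  os.write(reinterpret_cast<const char*>(&n), sizeof(n));
  os.write(reinterpret_cast<const char*>(v.data()), n * sizeof(T));
  return os;
}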
- template - friend typename std::enable_if::value && - std::is_trivial::value, + template + friend typename std::enable_if::value && + std::is_trivial::value, std::ostream &>::type - operator<<(std::ostream &os, const libcuckoo_bucket_container &bc) { + operator<<(std::ostream &os, + const bucket_container &bc) { size_type hp = bc.hashpower(); os.write(reinterpret_cast(&hp), sizeof(size_type)); os.write(reinterpret_cast(bc.buckets_), @@ -354,14 +370,16 @@ private: return os; } - template - friend typename std::enable_if::value && - std::is_trivial::value, + template + friend typename std::enable_if::value && + std::is_trivial::value, std::istream &>::type - operator>>(std::istream &is, libcuckoo_bucket_container &bc) { + operator>>(std::istream &is, + bucket_container &bc) { size_type hp; is.read(reinterpret_cast(&hp), sizeof(size_type)); - libcuckoo_bucket_container new_bc(hp, bc.get_allocator()); + bucket_container new_bc(hp, bc.get_allocator()); is.read(reinterpret_cast(new_bc.buckets_), new_bc.size() * sizeof(bucket)); bc.swap(new_bc); @@ -369,4 +387,6 @@ private: } }; -#endif // LIBCUCKOO_BUCKET_CONTAINER_H +} // namespace libcuckoo + +#endif // BUCKET_CONTAINER_H diff --git a/include/concurrentqueue.h b/include/concurrentqueue.h index a056b8092..7b864febe 100644 --- a/include/concurrentqueue.h +++ b/include/concurrentqueue.h @@ -150,6 +150,17 @@ namespace moodycamel { namespace details { } } #endif +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED #if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) @@ -162,8 +173,8 @@ namespace moodycamel { namespace details { #define MOODYCAMEL_RETHROW throw #define MOODYCAMEL_THROW(expr) throw (expr) #else -#define MOODYCAMEL_TRY if (true) -#define MOODYCAMEL_CATCH(...) else if (false) +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif @@ -798,7 +809,7 @@ class ConcurrentQueue } // Destroy implicit producer hash tables - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { auto hash = implicitProducerHash.load(std::memory_order_relaxed); while (hash != nullptr) { auto prev = hash->prev; @@ -923,8 +934,8 @@ class ConcurrentQueue // Thread-safe. inline bool enqueue(T const& item) { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue(item); + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); } // Enqueues a single item (by moving it, if possible). @@ -934,8 +945,8 @@ class ConcurrentQueue // Thread-safe. inline bool enqueue(T&& item) { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue(std::move(item)); + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); } // Enqueues a single item (by copying it) using an explicit producer token. 
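[Editor's note] The MOODYCAMEL_CONSTEXPR_IF conversions running through these hunks let the same source compile as `if constexpr` under C++17 and as a plain `if` on a compile-time constant otherwise; pre-C++17 both branches must still compile, but the optimizer removes the dead one. A minimal reproduction of the pattern with hypothetical names:

#include <cstddef>

#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L
#define MY_CONSTEXPR_IF if constexpr
#else
#define MY_CONSTEXPR_IF if
#endif

template <std::size_t HASH_SIZE>
bool enqueue_guarded(bool (*inner_enqueue)()) {
  MY_CONSTEXPR_IF (HASH_SIZE == 0) {
    return false;             // feature disabled at compile time
  } else {
    return inner_enqueue();   // branch discarded entirely under C++17
  }
}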
@@ -965,8 +976,8 @@ class ConcurrentQueue template bool enqueue_bulk(It itemFirst, size_t count) { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue_bulk(itemFirst, count); + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); } // Enqueues several items using an explicit producer token. @@ -988,8 +999,8 @@ class ConcurrentQueue // Thread-safe. inline bool try_enqueue(T const& item) { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue(item); + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); } // Enqueues a single item (by moving it, if possible). @@ -999,8 +1010,8 @@ class ConcurrentQueue // Thread-safe. inline bool try_enqueue(T&& item) { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue(std::move(item)); + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); } // Enqueues a single item (by copying it) using an explicit producer token. @@ -1029,8 +1040,8 @@ class ConcurrentQueue template bool try_enqueue_bulk(It itemFirst, size_t count) { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue_bulk(itemFirst, count); + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); } // Enqueues several items using an explicit producer token. @@ -1498,7 +1509,7 @@ class ConcurrentQueue template inline bool is_empty() const { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Check flags for (size_t i = 0; i < BLOCK_SIZE; ++i) { if (!emptyFlags[i].load(std::memory_order_relaxed)) { @@ -1523,9 +1534,9 @@ class ConcurrentQueue // Returns true if the block is now empty (does not apply in explicit context) template - inline bool set_empty(index_t i) + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set flag assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); @@ -1542,9 +1553,9 @@ class ConcurrentQueue // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). // Returns true if the block is now empty (does not apply in explicit context). 
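[Editor's note] The set_empty/set_many_empty methods touched below track block emptiness in one of two ways chosen at compile time: small explicit-context blocks keep a flag per slot, while the general path counts fully dequeued elements with a single atomic. A simplified sketch of the counter variant; the type and names are illustrative, not moodycamel's exact layout:

#include <atomic>
#include <cstddef>

template <std::size_t BLOCK_SIZE>
struct DequeueCounter {
  std::atomic<std::size_t> elementsCompletelyDequeued{0};

  // Mark `count` contiguous slots as consumed; returns true once the
  // whole block has been emptied.
  bool set_many_empty(std::size_t count) {
    std::size_t prev =
        elementsCompletelyDequeued.fetch_add(count, std::memory_order_release);
    return prev + count == BLOCK_SIZE;
  }
};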
template - inline bool set_many_empty(index_t i, size_t count) + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set flags std::atomic_thread_fence(std::memory_order_release); i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; @@ -1565,7 +1576,7 @@ class ConcurrentQueue template inline void set_all_empty() { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Set all flags for (size_t i = 0; i != BLOCK_SIZE; ++i) { emptyFlags[i].store(true, std::memory_order_relaxed); @@ -1580,7 +1591,7 @@ class ConcurrentQueue template inline void reset_empty() { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { // Reset flags for (size_t i = 0; i != BLOCK_SIZE; ++i) { emptyFlags[i].store(false, std::memory_order_relaxed); @@ -1819,7 +1830,10 @@ class ConcurrentQueue // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if // the initial allocation failed in the constructor. - if (allocMode == CannotAlloc || !new_block_index(pr_blockIndexSlotsUsed)) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { return false; } } @@ -2023,7 +2037,14 @@ class ConcurrentQueue assert(!details::circular_less_than(currentTailIndex, head)); bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - if (allocMode == CannotAlloc || full || !new_block_index(originalBlockIndexSlotsUsed)) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { // Failed to allocate, undo changes (but keep injected blocks) pr_blockIndexFront = originalBlockIndexFront; pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; @@ -2831,7 +2852,10 @@ class ConcurrentQueue } // No room in the old block index, try to allocate another one! 
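[Editor's note] The CannotAlloc hunks that follow make the same point in a different spot: the allocation policy is a template parameter, so the failure path can be selected at compile time rather than by a runtime branch. A small sketch of that dispatch, with hypothetical names:

enum AllocMode { CannotAlloc, CanAlloc };

template <AllocMode allocMode>
bool grow_block_index(bool (*try_allocate)()) {
#if __cplusplus > 201402L
  if constexpr (allocMode == CannotAlloc) {
    return false;           // this instantiation may never allocate
  } else {
    return try_allocate();  // attempt to allocate a larger index
  }
#else
  return (allocMode == CannotAlloc) ? false : try_allocate();
#endif
}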
- if (allocMode == CannotAlloc || !new_block_index()) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index()) { return false; } localBlockIndex = blockIndex.load(std::memory_order_relaxed); @@ -3011,11 +3035,12 @@ class ConcurrentQueue return block; } - if (canAlloc == CanAlloc) { + MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { return create(); } - - return nullptr; + else { + return nullptr; + } } @@ -3244,50 +3269,56 @@ class ConcurrentQueue inline void populate_initial_implicit_producer_hash() { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; - - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); } void swap_implicit_producer_hashes(ConcurrentQueue& other) { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; - - // Swap (assumes our implicit producer hash is initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; } else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { - continue; + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == 
&initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { - continue; + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; } - hash->prev = &other.initialImplicitProducerHash; } } @@ -3654,5 +3685,3 @@ inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, ty #if defined(__GNUC__) #pragma GCC diagnostic pop #endif - -// lgtm [cpp/assignment-does-not-return-this] \ No newline at end of file diff --git a/include/cuckoohash_config.hh b/include/cuckoohash_config.hh index fff166cdb..910094ceb 100644 --- a/include/cuckoohash_config.hh +++ b/include/cuckoohash_config.hh @@ -6,26 +6,30 @@ #include #include +namespace libcuckoo { + //! The default maximum number of keys per bucket -constexpr size_t LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET = 4; +constexpr size_t DEFAULT_SLOT_PER_BUCKET = 4; //! The default number of elements in an empty hash table -constexpr size_t LIBCUCKOO_DEFAULT_SIZE = - (1U << 16) * LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET; +constexpr size_t DEFAULT_SIZE = + (1U << 16) * DEFAULT_SLOT_PER_BUCKET; //! The default minimum load factor that the table allows for automatic //! expansion. It must be a number between 0.0 and 1.0. The table will throw -//! libcuckoo_load_factor_too_low if the load factor falls below this value +//! load_factor_too_low if the load factor falls below this value //! during an automatic expansion. -constexpr double LIBCUCKOO_DEFAULT_MINIMUM_LOAD_FACTOR = 0.05; +constexpr double DEFAULT_MINIMUM_LOAD_FACTOR = 0.05; //! An alias for the value that sets no limit on the maximum hashpower. If this //! value is set as the maximum hashpower limit, there will be no limit. This //! is also the default initial value for the maximum hashpower in a table. -constexpr size_t LIBCUCKOO_NO_MAXIMUM_HASHPOWER = +constexpr size_t NO_MAXIMUM_HASHPOWER = std::numeric_limits::max(); //! set LIBCUCKOO_DEBUG to 1 to enable debug output #define LIBCUCKOO_DEBUG 0 +} // namespace libcuckoo + #endif // _CUCKOOHASH_CONFIG_HH diff --git a/include/cuckoohash_map.hh b/include/cuckoohash_map.hh index 0b0b18cf7..88f1f4334 100644 --- a/include/cuckoohash_map.hh +++ b/include/cuckoohash_map.hh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -25,7 +26,9 @@ #include "cuckoohash_config.hh" #include "cuckoohash_util.hh" -#include "libcuckoo_bucket_container.hh" +#include "bucket_container.hh" + +namespace libcuckoo { /** * A concurrent hash table @@ -34,13 +37,15 @@ * @tparam T type of values in the table * @tparam Hash type of hash functor * @tparam KeyEqual type of equality comparison functor - * @tparam Allocator type of allocator + * @tparam Allocator type of allocator. 
We suggest using an aligned allocator, + * because the table relies on types that are over-aligned to optimize + * concurrent cache usage. * @tparam SLOT_PER_BUCKET number of slots for each bucket in the table */ template , class KeyEqual = std::equal_to, class Allocator = std::allocator>, - std::size_t SLOT_PER_BUCKET = LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET> + std::size_t SLOT_PER_BUCKET = DEFAULT_SLOT_PER_BUCKET> class cuckoohash_map { private: // Type of the partial key @@ -48,7 +53,7 @@ private: // The type of the buckets container using buckets_t = - libcuckoo_bucket_container; + bucket_container; public: /** @name Type Declarations */ @@ -96,13 +101,17 @@ public: * @param equal equality function instance to use * @param alloc allocator instance to use */ - cuckoohash_map(size_type n = LIBCUCKOO_DEFAULT_SIZE, const Hash &hf = Hash(), + cuckoohash_map(size_type n = DEFAULT_SIZE, const Hash &hf = Hash(), const KeyEqual &equal = KeyEqual(), const Allocator &alloc = Allocator()) - : hash_fn_(hf), eq_fn_(equal), buckets_(reserve_calc(n), alloc), + : hash_fn_(hf), eq_fn_(equal), + buckets_(reserve_calc(n), alloc), + old_buckets_(0, alloc), all_locks_(get_allocator()), - minimum_load_factor_(LIBCUCKOO_DEFAULT_MINIMUM_LOAD_FACTOR), - maximum_hashpower_(LIBCUCKOO_NO_MAXIMUM_HASHPOWER) { + num_remaining_lazy_rehash_locks_(0), + minimum_load_factor_(DEFAULT_MINIMUM_LOAD_FACTOR), + maximum_hashpower_(NO_MAXIMUM_HASHPOWER), + max_num_worker_threads_(0) { all_locks_.emplace_back(std::min(bucket_count(), size_type(kMaxNumLocks)), spinlock(), get_allocator()); } @@ -121,7 +130,7 @@ public: */ template cuckoohash_map(InputIt first, InputIt last, - size_type n = LIBCUCKOO_DEFAULT_SIZE, const Hash &hf = Hash(), + size_type n = DEFAULT_SIZE, const Hash &hf = Hash(), const KeyEqual &equal = KeyEqual(), const Allocator &alloc = Allocator()) : cuckoohash_map(n, hf, equal, alloc) { @@ -136,10 +145,7 @@ public: * * @param other the map being copied */ - cuckoohash_map(const cuckoohash_map &other) - : cuckoohash_map(other, std::allocator_traits:: - select_on_container_copy_construction( - other.get_allocator())) { } + cuckoohash_map(const cuckoohash_map &other) = default; /** * Copy constructor with separate allocator. If @p other is being modified @@ -150,10 +156,14 @@ public: */ cuckoohash_map(const cuckoohash_map &other, const Allocator &alloc) : hash_fn_(other.hash_fn_), eq_fn_(other.eq_fn_), - buckets_(other.buckets_, alloc), all_locks_(alloc), - minimum_load_factor_(other.minimum_load_factor()), - maximum_hashpower_(other.maximum_hashpower()), - max_resize_threads_(other.get_max_resize_threads()) { + buckets_(other.buckets_, alloc), + old_buckets_(other.old_buckets_, alloc), + all_locks_(alloc), + num_remaining_lazy_rehash_locks_( + other.num_remaining_lazy_rehash_locks_), + minimum_load_factor_(other.minimum_load_factor_), + maximum_hashpower_(other.maximum_hashpower_), + max_num_worker_threads_(other.max_num_worker_threads_) { if (other.get_allocator() == alloc) { all_locks_ = other.all_locks_; } else { @@ -167,8 +177,7 @@ public: * * @param other the map being moved */ - cuckoohash_map(cuckoohash_map &&other) - : cuckoohash_map(std::move(other), other.get_allocator()) {} + cuckoohash_map(cuckoohash_map &&other) = default; /** * Move constructor with separate allocator. 
If the map being moved is being @@ -179,10 +188,14 @@ public: */ cuckoohash_map(cuckoohash_map &&other, const Allocator &alloc) : hash_fn_(std::move(other.hash_fn_)), eq_fn_(std::move(other.eq_fn_)), - buckets_(std::move(other.buckets_), alloc), all_locks_(alloc), - minimum_load_factor_(other.minimum_load_factor()), - maximum_hashpower_(other.maximum_hashpower()), - max_resize_threads_(other.get_max_resize_threads()) { + buckets_(std::move(other.buckets_), alloc), + old_buckets_(std::move(other.old_buckets_), alloc), + all_locks_(alloc), + num_remaining_lazy_rehash_locks_( + other.num_remaining_lazy_rehash_locks_), + minimum_load_factor_(other.minimum_load_factor_), + maximum_hashpower_(other.maximum_hashpower_), + max_num_worker_threads_(other.max_num_worker_threads_) { if (other.get_allocator() == alloc) { all_locks_ = std::move(other.all_locks_); } else { @@ -200,7 +213,7 @@ public: * @param alloc allocator instance to use */ cuckoohash_map(std::initializer_list init, - size_type n = LIBCUCKOO_DEFAULT_SIZE, const Hash &hf = Hash(), + size_type n = DEFAULT_SIZE, const Hash &hf = Hash(), const KeyEqual &equal = KeyEqual(), const Allocator &alloc = Allocator()) : cuckoohash_map(init.begin(), init.end(), n, hf, equal, alloc) {} @@ -223,7 +236,6 @@ public: maximum_hashpower_.exchange(other.maximum_hashpower(), std::memory_order_release), std::memory_order_release); - std::swap(max_resize_threads_, other.max_resize_threads_); } /** @@ -233,16 +245,7 @@ public: * @param other the map to assign from * @return @c *this */ - cuckoohash_map &operator=(const cuckoohash_map &other) { - hash_fn_ = other.hash_fn_; - eq_fn_ = other.eq_fn_; - buckets_ = other.buckets_; - all_locks_ = other.all_locks_; - minimum_load_factor_ = other.minimum_load_factor(); - maximum_hashpower_ = other.maximum_hashpower(); - max_resize_threads_ = other.max_resize_threads_; - return *this; - } + cuckoohash_map &operator=(const cuckoohash_map &other) = default; /** * Move assignment operator. If @p other is being modified concurrently, @@ -251,16 +254,7 @@ public: * @param other the map to assign from * @return @c *this */ - cuckoohash_map &operator=(cuckoohash_map &&other) { - hash_fn_ = std::move(other.hash_fn_); - eq_fn_ = std::move(other.eq_fn_); - buckets_ = std::move(other.buckets_); - all_locks_ = std::move(other.all_locks_); - minimum_load_factor_ = std::move(other.minimum_load_factor()); - maximum_hashpower_ = std::move(other.maximum_hashpower()); - max_resize_threads_ = std::move(other.max_resize_threads_); - return *this; - } + cuckoohash_map &operator=(cuckoohash_map &&other) = default; /** * Initializer list assignment operator @@ -369,7 +363,7 @@ public: /** * Sets the minimum load factor allowed for automatic expansions. If an * expansion is needed when the load factor of the table is lower than this - * threshold, @ref libcuckoo_load_factor_too_low is thrown. It will not be + * threshold, @ref load_factor_too_low is thrown. It will not be * thrown for an explicitly-triggered expansion. * * @param mlf the load factor to set the minimum to @@ -400,7 +394,7 @@ public: /** * Sets the maximum hashpower the table can be. If set to @ref - * LIBCUCKOO_NO_MAXIMUM_HASHPOWER, there will be no limit on the hashpower. + * NO_MAXIMUM_HASHPOWER, there will be no limit on the hashpower. * Otherwise, the table will not be able to expand beyond the given * hashpower, either by an explicit or an automatic expansion. 
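To make the renames in the surrounding hunks concrete, here is a minimal caller-side sketch. It assumes only that the setter overload of maximum_hashpower keeps its pre-existing single-argument signature and that cuckoohash_map.hh is on the include path; everything else (the libcuckoo namespace, the un-prefixed exception name, the hashpower() accessor on the exception) is taken directly from this patch.

    #include <iostream>
    #include <string>
    #include "cuckoohash_map.hh"

    int main() {
      // The table and its helpers now live in the libcuckoo namespace.
      libcuckoo::cuckoohash_map<std::string, int> table;
      table.insert("a", 1);

      // Cap growth at hashpower 10 (2^10 buckets). Exceeding the cap now
      // throws libcuckoo::maximum_hashpower_exceeded rather than the old
      // libcuckoo_maximum_hashpower_exceeded.
      table.maximum_hashpower(10);
      try {
        table.reserve(1ULL << 40);  // deliberately beyond the cap
      } catch (const libcuckoo::maximum_hashpower_exceeded &e) {
        std::cerr << "refused expansion to hashpower " << e.hashpower() << "\n";
      }
      return 0;
    }

The same rename applies to load_factor_too_low, which automatic expansions throw when the table's load factor is below the configured minimum.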
* @@ -424,6 +418,29 @@ public: return maximum_hashpower_.load(std::memory_order_acquire); } + + /** + * Set the maximum number of extra worker threads the table can spawn when + * doing large batch operations. Currently batch operations occur in the + * following scenarios: + * - Any resizing operation which invokes cuckoo_expand_simple. This + * includes any explicit rehash/resize operation, or any general resize if + * the data is not nothrow-move-constructible. + * - Creating a locked_table or resizing within a locked_table. + * + * @param extra_threads the number of extra threads + */ + void max_num_worker_threads(size_type extra_threads) { + max_num_worker_threads_.store(extra_threads, std::memory_order_release); + } + + /** + * Returns the maximum number of extra worker threads. + */ + size_type max_num_worker_threads() const { + return max_num_worker_threads_.load(std::memory_order_acquire); + } + /**@}*/ /** @name Table Operations @@ -655,15 +672,11 @@ */ bool reserve(size_type n) { return cuckoo_reserve(n); } - void set_max_resize_threads(uint32_t t) { max_resize_threads_ = t; } - - uint32_t get_max_resize_threads() const { return max_resize_threads_; } - /** * Removes all elements in the table, calling their destructors. */ void clear() { - auto all_locks_manager = snapshot_and_lock_all(normal_mode()); + auto all_locks_manager = lock_all(normal_mode()); cuckoo_clear(); } @@ -691,8 +704,15 @@ private: // true if the key is small and simple, which means using partial keys for // lookup would probably slow us down - static constexpr bool is_simple = - std::is_pod::value && sizeof(key_type) <= 8; + static constexpr bool is_simple() { + return std::is_pod::value && sizeof(key_type) <= 8; + } + + // Whether or not the data is nothrow-move-constructible. + static constexpr bool is_data_nothrow_move_constructible() { + return std::is_nothrow_move_constructible::value && + std::is_nothrow_move_constructible::value; + } // Contains a hash and partial for a given key. The partial key is used for // partial-key cuckoohashing, and for finding the alternate bucket of that a @@ -763,17 +783,34 @@ private: using counter_type = int64_t; // A fast, lightweight spinlock + // + // Per-spinlock, we also maintain some metadata about the contents of the + // table. Storing data per-spinlock avoids false sharing issues when multiple + // threads need to update this metadata. We store the following information: + // + // - elem_counter: A counter indicating how many elements in the table are + // under this lock. One can compute the size of the table by summing the + // elem_counter over all locks. + // + // - is_migrated: When resizing with cuckoo_fast_double, we do not + // immediately rehash elements from the old buckets array to the new one. + // Instead, we'll mark all of the locks as not migrated. So anybody trying to + // acquire the lock must also migrate the corresponding buckets if + // !is_migrated.
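Distilled into a standalone record (names and exact field types here are illustrative; the real class, annotated with LIBCUCKOO_ALIGNAS(64) and LIBCUCKOO_SQUELCH_PADDING_WARNING, follows in the diff):

    #include <atomic>

    // Sketch of the metadata-carrying lock described above: the flag, the
    // element count, and the migration marker share one record, and
    // alignas(64) pads each record to its own cache line so updates to
    // neighboring locks' metadata never contend on the same line.
    struct alignas(64) StripedLock {
      std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
      long elem_counter_ = 0;    // elements guarded by this lock; table size
                                 // is the sum over all locks
      bool is_migrated_ = true;  // false => buckets under this lock still
                                 // hold un-rehashed data

      void lock() noexcept {
        while (lock_.test_and_set(std::memory_order_acq_rel)) {
        }
      }
      void unlock() noexcept { lock_.clear(std::memory_order_release); }
    };

    static_assert(sizeof(StripedLock) == 64, "one cache line per lock");

Keeping elem_counter_ and is_migrated_ as plain non-atomic fields is deliberate: they are only read or written while the flag is held.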
LIBCUCKOO_SQUELCH_PADDING_WARNING class LIBCUCKOO_ALIGNAS(64) spinlock { public: - spinlock() : elem_counter_(0) { lock_.clear(); } + spinlock() : elem_counter_(0), is_migrated_(true) { lock_.clear(); } - spinlock(const spinlock &other) : elem_counter_(other.elem_counter()) { + spinlock(const spinlock &other) noexcept + : elem_counter_(other.elem_counter()), + is_migrated_(other.is_migrated()) { lock_.clear(); } - spinlock &operator=(const spinlock &other) { + spinlock &operator=(const spinlock &other) noexcept { elem_counter() = other.elem_counter(); + is_migrated() = other.is_migrated(); return *this; } @@ -789,12 +826,15 @@ private: } counter_type &elem_counter() noexcept { return elem_counter_; } - counter_type elem_counter() const noexcept { return elem_counter_; } + bool &is_migrated() noexcept { return is_migrated_; } + bool is_migrated() const noexcept { return is_migrated_; } + private: std::atomic_flag lock_; counter_type elem_counter_; + bool is_migrated_; }; template @@ -814,6 +854,11 @@ private: using LockManager = std::unique_ptr; + // Each of the locking methods can operate in two modes: locked_table_mode + // and normal_mode. When we're in locked_table_mode, we assume the caller has + // already taken all locks on the buckets. We also require that all data is + // rehashed immediately, so that the caller never has to look through any + // locks. In normal_mode, we actually do take locks, and can rehash lazily. using locked_table_mode = std::integral_constant; using normal_mode = std::integral_constant; @@ -869,6 +914,45 @@ private: } } + // If necessary, rehashes the buckets corresponding to the given lock index, + // and sets the is_migrated flag to true. We should only ever do migrations + // if the data is nothrow move constructible, so this function is noexcept. + // + // This only works if our current locks array is at the maximum size, because + // otherwise, rehashing could require taking other locks. Assumes the lock at + // the given index is taken. + // + // If IS_LAZY is true, we assume the lock is being rehashed in a lazy + // (on-demand) fashion, so we additionally decrement the number of locks we + // need to lazy_rehash. This may trigger false sharing with other + // lazy-rehashing threads, but the hope is that the fraction of such + // operations is low-enough to not significantly impact overall performance. + static constexpr bool kIsLazy = true; + static constexpr bool kIsNotLazy = false; + + template + void rehash_lock(size_t l) const noexcept { + locks_t &locks = get_current_locks(); + spinlock &lock = locks[l]; + if (lock.is_migrated()) return; + + assert(is_data_nothrow_move_constructible()); + assert(locks.size() == kMaxNumLocks); + assert(old_buckets_.hashpower() + 1 == buckets_.hashpower()); + assert(old_buckets_.size() >= kMaxNumLocks); + // Iterate through all buckets in old_buckets that are controlled by this + // lock, and move them into the current buckets array. + for (size_type bucket_ind = l; bucket_ind < old_buckets_.size(); + bucket_ind += kMaxNumLocks) { + move_bucket(old_buckets_, buckets_, bucket_ind); + } + lock.is_migrated() = true; + + if (IS_LAZY) { + decrement_num_remaining_lazy_rehash_locks(); + } + } + // locks the given bucket index. // // throws hashpower_changed if it changed after taking the lock. 
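The migration loop in rehash_lock walks old_buckets_ with a stride of kMaxNumLocks because, once the lock array is at its maximum size, bucket i is guarded by lock i % kMaxNumLocks. A standalone sketch of that relationship (the 2^16 value is an assumption for illustration; only the stride itself is taken from the code above):

    #include <cstddef>
    #include <vector>

    // Everything a single lock protects forms a strided slice of the old
    // bucket array, which is exactly the slice one rehash_lock(l) call moves.
    constexpr std::size_t kMaxNumLocks = std::size_t{1} << 16;  // assumed value

    std::vector<std::size_t> buckets_guarded_by(std::size_t lock_index,
                                                std::size_t num_old_buckets) {
      std::vector<std::size_t> out;
      for (std::size_t b = lock_index; b < num_old_buckets; b += kMaxNumLocks) {
        out.push_back(b);
      }
      return out;
    }

This stride is also why lazy migration requires the lock array to be at its maximum size: once the array stops growing, the bucket-to-lock mapping is stable across a doubling, so the data a lock must migrate is exactly the strided slice it already guards.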
@@ -878,9 +962,11 @@ private: LockManager lock_one(size_type hp, size_type i, normal_mode) const { locks_t &locks = get_current_locks(); - spinlock &lock = locks[lock_ind(i)]; + const size_type l = lock_ind(i); + spinlock &lock = locks[l]; lock.lock(); check_hashpower(hp, lock); + rehash_lock(l); return LockManager(&lock); } @@ -906,6 +992,8 @@ private: if (l2 != l1) { locks[l2].lock(); } + rehash_lock(l1); + rehash_lock(l2); return TwoBuckets(locks, i1, i2, normal_mode()); } @@ -941,6 +1029,9 @@ private: if (l[2] != l[1]) { locks[l[2]].lock(); } + rehash_lock(l[0]); + rehash_lock(l[1]); + rehash_lock(l[2]); return std::make_pair(TwoBuckets(locks, i1, i2, normal_mode()), LockManager((lock_ind(i3) == lock_ind(i1) || lock_ind(i3) == lock_ind(i2)) @@ -970,23 +1061,21 @@ private: } } - // snapshot_and_lock_all takes all the locks, and returns a deleter object - // that releases the locks upon destruction. Note that after taking all the - // locks, it is okay to resize the buckets_ container, since no other threads - // should be accessing the buckets. This should only be called if we are not - // in locked_table mode, and after this function is over, we will be in - // locked_table mode. When the deleter object goes out of scope, we will be - // out of locked_table mode. - AllLocksManager snapshot_and_lock_all(locked_table_mode) { + // lock_all takes all the locks, and returns a deleter object that releases + // the locks upon destruction. It does NOT perform any hashpower checks, or + // rehash any un-migrated buckets. + // + // Note that after taking all the locks, it is okay to resize the buckets_ + // container, since no other threads should be accessing the buckets. + AllLocksManager lock_all(locked_table_mode) { return AllLocksManager(); } - AllLocksManager snapshot_and_lock_all(normal_mode) { + AllLocksManager lock_all(normal_mode) { // all_locks_ should never decrease in size, so if it is non-empty now, it // will remain non-empty assert(!all_locks_.empty()); - auto first_locked = all_locks_.end(); - --first_locked; + const auto first_locked = std::prev(all_locks_.end()); auto current_locks = first_locked; while (current_locks != all_locks_.end()) { locks_t &locks = *current_locks; @@ -1056,7 +1145,7 @@ private: // Silence a warning from MSVC about partial being unused if is_simple. (void)partial; for (int i = 0; i < static_cast(slot_per_bucket()); ++i) { - if (!b.occupied(i) || (!is_simple && partial != b.partial(i))) { + if (!b.occupied(i) || (!is_simple() && partial != b.partial(i))) { continue; } else if (key_eq()(b.key(i), key)) { return i; @@ -1077,7 +1166,7 @@ private: * @return table_position of the location to insert the new element, or the * site of the duplicate element with a status code if there was a duplicate. * In either case, the locks will still be held after the function ends. - * @throw libcuckoo_load_factor_too_low if expansion is necessary, but the + * @throw load_factor_too_low if expansion is necessary, but the * load factor of the table is below the threshold */ template @@ -1203,7 +1292,7 @@ private: slot = -1; for (int i = 0; i < static_cast(slot_per_bucket()); ++i) { if (b.occupied(i)) { - if (!is_simple && partial != b.partial(i)) { + if (!is_simple() && partial != b.partial(i)) { continue; } if (key_eq()(b.key(i), key)) { @@ -1375,10 +1464,10 @@ private: // cuckoopath_search found empty isn't empty anymore, we unlock them // and return false. Otherwise, the bucket is empty and insertable, // so we hold the locks and return true. 
- const size_type bucket = cuckoo_path[0].bucket; - assert(bucket == b.i1 || bucket == b.i2); + const size_type bucket_i = cuckoo_path[0].bucket; + assert(bucket_i == b.i1 || bucket_i == b.i2); b = lock_two(hp, b.i1, b.i2, TABLE_MODE()); - if (!buckets_[bucket].occupied(cuckoo_path[0].slot)) { + if (!buckets_[bucket_i].occupied(cuckoo_path[0].slot)) { return true; } else { b.unlock(); @@ -1565,97 +1654,128 @@ private: // provides a strong exception guarantee. template cuckoo_status cuckoo_fast_double(size_type current_hp) { - if (!std::is_nothrow_move_constructible::value || - !std::is_nothrow_move_constructible::value) { + if (!is_data_nothrow_move_constructible()) { LIBCUCKOO_DBG("%s", "cannot run cuckoo_fast_double because key-value" " pair is not nothrow move constructible"); return cuckoo_expand_simple(current_hp + 1); } const size_type new_hp = current_hp + 1; - auto all_locks_manager = snapshot_and_lock_all(TABLE_MODE()); + auto all_locks_manager = lock_all(TABLE_MODE()); cuckoo_status st = check_resize_validity(current_hp, new_hp); if (st != ok) { return st; } - // We must re-hash the table, moving items in each bucket to a different - // one. The hash functions are carefully designed so that when doubling the - // number of buckets, each element either stays in its existing bucket or - // goes to exactly one new bucket. This means we can re-hash each bucket in - // parallel. We create a new empty buckets container and move all the - // elements from the old container to the new one. - buckets_t new_buckets(new_hp, get_allocator()); - // For certain types, MSVC may decide that move_buckets() cannot throw and - // so the catch block below is dead code. Since that won't always be true, - // we just disable the warning here. - LIBCUCKOO_SQUELCH_DEADCODE_WARNING_BEGIN; - parallel_exec( - 0, hashsize(current_hp), - [this, &new_buckets, current_hp, new_hp](size_type start, size_type end, - std::exception_ptr &eptr) { - try { - move_buckets(new_buckets, current_hp, new_hp, start, end); - } catch (...) { - eptr = std::current_exception(); - } - }); - LIBCUCKOO_SQUELCH_DEADCODE_WARNING_END; + // Finish rehashing any un-rehashed buckets, so that we can move out any + // remaining data in old_buckets_. We should be running cuckoo_fast_double + // only after trying to cuckoo for a while, which should mean we've tried + // going through most of the table and thus done a lot of rehashing + // already. So this shouldn't be too expensive. + // + // We restrict ourselves to the current thread because we want to avoid + // possibly spawning extra threads in this function, unless the + // circumstances are predictable (i.e. data is nothrow move constructible, + // we're in locked_table mode and must keep the buckets_ container + // up-to-date, etc). + // + // If we have fewer than kMaxNumLocks buckets, there shouldn't be any buckets + // left to rehash, so this should be a no-op. + { + locks_t &current_locks = get_current_locks(); + for (size_t i = 0; i < current_locks.size(); ++i) { + rehash_lock(i); + } + num_remaining_lazy_rehash_locks(0); + } // Resize the locks array if necessary. This is done before we update the // hashpower so that other threads don't grab the new hashpower and the old - // locks + // locks. maybe_resize_locks(size_type(1) << new_hp); - // Swap the old and new buckets.
The old bucket data will be destroyed when - the function exits - buckets_.swap(new_buckets); + locks_t &current_locks = get_current_locks(); + + // Move the current buckets into old_buckets_, and create a new empty + // buckets container, which will become the new current one. The + // old_buckets_ data will be destroyed when move-assigning to buckets_. + old_buckets_.swap(buckets_); + buckets_ = buckets_t(new_hp, get_allocator()); + + // If we have fewer than kMaxNumLocks buckets, we do a full rehash in the + // current thread. On-demand rehashing wouldn't be very easy with fewer than + // kMaxNumLocks buckets, because it would require taking extra lower-index + // locks to do the rehashing. Because kMaxNumLocks is relatively small, + // this should not be very expensive. We have already set all locks to + // migrated at the start of the function, so we shouldn't have to touch + // them again. + // + // Otherwise, if we're in locked_table_mode, the expectation is that we can + // access the latest data in buckets_ without taking any locks. So we must + // rehash the data immediately. This would not be much different from + // lazy-rehashing in locked_table_mode anyways, because it would still be + // going on in one thread. + if (old_buckets_.size() < kMaxNumLocks) { + for (size_type i = 0; i < old_buckets_.size(); ++i) { + move_bucket(old_buckets_, buckets_, i); + } + // This will also delete the old_buckets_ data. + num_remaining_lazy_rehash_locks(0); + } else { + // Mark all current locks as un-migrated, so that we rehash the data + // on-demand when the locks are taken. + for (spinlock &lock : current_locks) { + lock.is_migrated() = false; + } + num_remaining_lazy_rehash_locks(current_locks.size()); + if (std::is_same::value) { + rehash_with_workers(); + } + } return ok; } - void move_buckets(buckets_t &new_buckets, size_type current_hp, - size_type new_hp, size_type start_ind, size_type end_ind) { - for (size_type old_bucket_ind = start_ind; old_bucket_ind < end_ind; - ++old_bucket_ind) { - // By doubling the table size, the index_hash and alt_index of - // each key got one bit added to the top, at position - // current_hp, which means anything we have to move will either - // be at the same bucket position, or exactly - // hashsize(current_hp) later than the current bucket - bucket &old_bucket = buckets_[old_bucket_ind]; - const size_type new_bucket_ind = old_bucket_ind + hashsize(current_hp); - size_type new_bucket_slot = 0; - - // For each occupied slot, either move it into its same position in the - // new buckets container, or to the first available spot in the new - // bucket in the new buckets container.
- for (size_type old_bucket_slot = 0; old_bucket_slot < slot_per_bucket(); - ++old_bucket_slot) { - if (!old_bucket.occupied(old_bucket_slot)) { - continue; - } - const hash_value hv = hashed_key(old_bucket.key(old_bucket_slot)); - const size_type old_ihash = index_hash(current_hp, hv.hash); - const size_type old_ahash = - alt_index(current_hp, hv.partial, old_ihash); - const size_type new_ihash = index_hash(new_hp, hv.hash); - const size_type new_ahash = alt_index(new_hp, hv.partial, new_ihash); - size_type dst_bucket_ind, dst_bucket_slot; - if ((old_bucket_ind == old_ihash && new_ihash == new_bucket_ind) || - (old_bucket_ind == old_ahash && new_ahash == new_bucket_ind)) { - // We're moving the key to the new bucket - dst_bucket_ind = new_bucket_ind; - dst_bucket_slot = new_bucket_slot++; - } else { - // We're moving the key to the old bucket - assert((old_bucket_ind == old_ihash && new_ihash == old_ihash) || - (old_bucket_ind == old_ahash && new_ahash == old_ahash)); - dst_bucket_ind = old_bucket_ind; - dst_bucket_slot = old_bucket_slot; - } - new_buckets.setKV(dst_bucket_ind, dst_bucket_slot++, - old_bucket.partial(old_bucket_slot), - old_bucket.movable_key(old_bucket_slot), - std::move(old_bucket.mapped(old_bucket_slot))); + void move_bucket(buckets_t &old_buckets, buckets_t &new_buckets, + size_type old_bucket_ind) const noexcept { + const size_t old_hp = old_buckets.hashpower(); + const size_t new_hp = new_buckets.hashpower(); + + // By doubling the table size, the index_hash and alt_index of each key got + // one bit added to the top, at position old_hp, which means anything we + // have to move will either be at the same bucket position, or exactly + // hashsize(old_hp) later than the current bucket. + bucket &old_bucket = old_buckets_[old_bucket_ind]; + const size_type new_bucket_ind = old_bucket_ind + hashsize(old_hp); + size_type new_bucket_slot = 0; + + // For each occupied slot, either move it into its same position in the + // new buckets container, or to the first available spot in the new + // bucket in the new buckets container. 
+ for (size_type old_bucket_slot = 0; old_bucket_slot < slot_per_bucket(); + ++old_bucket_slot) { + if (!old_bucket.occupied(old_bucket_slot)) { + continue; } + const hash_value hv = hashed_key(old_bucket.key(old_bucket_slot)); + const size_type old_ihash = index_hash(old_hp, hv.hash); + const size_type old_ahash = alt_index(old_hp, hv.partial, old_ihash); + const size_type new_ihash = index_hash(new_hp, hv.hash); + const size_type new_ahash = alt_index(new_hp, hv.partial, new_ihash); + size_type dst_bucket_ind, dst_bucket_slot; + if ((old_bucket_ind == old_ihash && new_ihash == new_bucket_ind) || + (old_bucket_ind == old_ahash && new_ahash == new_bucket_ind)) { + // We're moving the key to the new bucket + dst_bucket_ind = new_bucket_ind; + dst_bucket_slot = new_bucket_slot++; + } else { + // We're moving the key to the old bucket + assert((old_bucket_ind == old_ihash && new_ihash == old_ihash) || + (old_bucket_ind == old_ahash && new_ahash == old_ahash)); + dst_bucket_ind = old_bucket_ind; + dst_bucket_slot = old_bucket_slot; + } + new_buckets.setKV(dst_bucket_ind, dst_bucket_slot++, + old_bucket.partial(old_bucket_slot), + old_bucket.movable_key(old_bucket_slot), + std::move(old_bucket.mapped(old_bucket_slot))); } } @@ -1668,11 +1788,11 @@ private: cuckoo_status check_resize_validity(const size_type orig_hp, const size_type new_hp) { const size_type mhp = maximum_hashpower(); - if (mhp != LIBCUCKOO_NO_MAXIMUM_HASHPOWER && new_hp > mhp) { - throw libcuckoo_maximum_hashpower_exceeded(new_hp); + if (mhp != NO_MAXIMUM_HASHPOWER && new_hp > mhp) { + throw maximum_hashpower_exceeded(new_hp); } if (AUTO_RESIZE::value && load_factor() < minimum_load_factor()) { - throw libcuckoo_load_factor_too_low(minimum_load_factor()); + throw load_factor_too_low(minimum_load_factor()); } if (hashpower() != orig_hp) { // Most likely another expansion ran before this one could grab the @@ -1702,11 +1822,11 @@ private: locks_t new_locks(std::min(size_type(kMaxNumLocks), new_bucket_count), spinlock(), get_allocator()); + assert(new_locks.size() > current_locks.size()); + std::copy(current_locks.begin(), current_locks.end(), new_locks.begin()); for (spinlock &lock : new_locks) { lock.lock(); } - assert(new_locks.size() > current_locks.size()); - std::copy(current_locks.begin(), current_locks.end(), new_locks.begin()); all_locks_.emplace_back(std::move(new_locks)); } @@ -1715,73 +1835,124 @@ private: // contains more elements than can be held by new_hashpower, the resulting // hashpower will be greater than `new_hp`. It needs to take all the bucket // locks, since no other operations can change the table during expansion. - // Throws libcuckoo_maximum_hashpower_exceeded if we're expanding beyond the + // Throws maximum_hashpower_exceeded if we're expanding beyond the // maximum hashpower, and we have an actual limit. template cuckoo_status cuckoo_expand_simple(size_type new_hp) { - auto all_locks_manager = snapshot_and_lock_all(TABLE_MODE()); + auto all_locks_manager = lock_all(TABLE_MODE()); const size_type hp = hashpower(); cuckoo_status st = check_resize_validity(hp, new_hp); if (st != ok) { return st; } - // Creates a new hash table with hashpower new_hp and adds all - // the elements from the old buckets. + + // Finish rehashing any data into buckets_. + rehash_with_workers(); + + // Creates a new hash table with hashpower new_hp and adds all the elements + // from buckets_ and old_buckets_. Allow this map to spawn extra threads if + // it needs to resize during the resize. 
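Before cuckoo_expand_simple continues below, it is worth pinning down the invariant the new move_bucket relies on: doubling the table exposes exactly one more hash bit, so a key in old bucket i can only land in bucket i or bucket i + hashsize(old_hp). A self-contained check, with hashsize and index_hash written out to mirror the definitions this header uses:

    #include <cassert>
    #include <cstddef>

    // hashsize(hp) is the bucket count at hashpower hp; index_hash masks a
    // hash value down to a bucket index.
    constexpr std::size_t hashsize(std::size_t hp) { return std::size_t{1} << hp; }
    constexpr std::size_t index_hash(std::size_t hp, std::size_t hash) {
      return hash & (hashsize(hp) - 1);
    }

    int main() {
      const std::size_t old_hp = 4;
      const std::size_t new_hp = old_hp + 1;
      for (std::size_t hash = 0; hash < 4096; ++hash) {
        const std::size_t old_ind = index_hash(old_hp, hash);
        const std::size_t new_ind = index_hash(new_hp, hash);
        // Each key either stays put or moves up by exactly hashsize(old_hp).
        assert(new_ind == old_ind || new_ind == old_ind + hashsize(old_hp));
      }
      return 0;
    }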
cuckoohash_map new_map(hashsize(new_hp) * slot_per_bucket(), hash_function(), key_eq(), get_allocator()); + new_map.max_num_worker_threads(max_num_worker_threads()); - parallel_exec(0, hashsize(hp), [this, &new_map](size_type i, size_type end, - std::exception_ptr &eptr) { - try { - for (; i < end; ++i) { - for (size_type j = 0; j < slot_per_bucket(); ++j) { - if (buckets_[i].occupied(j)) { - new_map.insert(buckets_[i].movable_key(j), - std::move(buckets_[i].mapped(j))); + parallel_exec( + 0, hashsize(hp), + [this, &new_map] + (size_type i, size_type end, std::exception_ptr &eptr) { + try { + for (; i < end; ++i) { + auto &bucket = buckets_[i]; + for (size_type j = 0; j < slot_per_bucket(); ++j) { + if (bucket.occupied(j)) { + new_map.insert(bucket.movable_key(j), + std::move(bucket.mapped(j))); + } + } } + } catch (...) { + eptr = std::current_exception(); } - } - } catch (...) { - eptr = std::current_exception(); - } - }); + }); + + // Finish rehashing any data in new_map. + new_map.rehash_with_workers(); - // Swap the current buckets containers with new_map's. This is okay, - // because we have all the locks, so nobody else should be reading from the - // buckets array. Then the old buckets array will be deleted when new_map - // is deleted. We also resize the locks array if necessary. + // Swap the buckets_ container with new_map's. This is okay, because we + // have all the locks, so nobody else should be reading from the buckets + // array. Then the old buckets will be deleted when new_map is deleted. maybe_resize_locks(new_map.bucket_count()); buckets_.swap(new_map.buckets_); + return ok; } - // Executes the function over the given range split over num_threads threads + // Executes the function over the given range, splitting the work between the + // current thread and any available worker threads. + // + // In the noexcept version, the functor must implement operator()(size_type + // start, size_type end). + // + // In the non-noexcept version, the functor will receive an additional + // std::exception_ptr& argument. 
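That std::exception_ptr& argument is how worker-thread failures reach the caller: each worker traps its own exception into a per-worker slot, and the spawning thread rethrows after joining. A minimal standalone version of the plumbing (the thread count and error text are placeholders):

    #include <cstddef>
    #include <exception>
    #include <stdexcept>
    #include <thread>
    #include <vector>

    int main() {
      std::vector<std::exception_ptr> eptrs(2, nullptr);
      std::vector<std::thread> threads;
      for (std::size_t i = 0; i < eptrs.size(); ++i) {
        threads.emplace_back([&eptrs, i] {
          try {
            if (i == 1) throw std::runtime_error("worker failed");  // placeholder
          } catch (...) {
            eptrs[i] = std::current_exception();  // trap, never unwind a thread
          }
        });
      }
      for (std::thread &t : threads) t.join();  // join everyone first
      for (std::exception_ptr &e : eptrs) {
        if (e) std::rethrow_exception(e);  // then surface the first failure
      }
      return 0;
    }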
+ + template + void parallel_exec_noexcept(size_type start, size_type end, F func) { + const size_type num_extra_threads = max_num_worker_threads(); + const size_type num_workers = 1 + num_extra_threads; + size_type work_per_thread = (end - start) / num_workers; + std::vector> threads( + get_allocator()); + threads.reserve(num_extra_threads); + for (size_type i = 0; i < num_extra_threads; ++i) { + threads.emplace_back(func, start, start + work_per_thread); + start += work_per_thread; + } + func(start, end); + for (std::thread &t : threads) { + t.join(); + } + } + template void parallel_exec(size_type start, size_type end, F func) { - static const size_type num_threads = - std::max(std::min(max_resize_threads_, std::thread::hardware_concurrency()), 1U); - size_type work_per_thread = (end - start) / num_threads; + const size_type num_extra_threads = max_num_worker_threads(); + const size_type num_workers = 1 + num_extra_threads; + size_type work_per_thread = (end - start) / num_workers; std::vector> threads( get_allocator()); - threads.reserve(num_threads); + threads.reserve(num_extra_threads); + std::vector> eptrs( - num_threads, nullptr, get_allocator()); + num_workers, nullptr, get_allocator()); - for (size_type i = 0; i < num_threads - 1; ++i) { + for (size_type i = 0; i < num_extra_threads; ++i) { threads.emplace_back(func, start, start + work_per_thread, std::ref(eptrs[i])); start += work_per_thread; } - threads.emplace_back(func, start, end, std::ref(eptrs.back())); + func(start, end, std::ref(eptrs.back())); for (std::thread &t : threads) { t.join(); } for (std::exception_ptr &eptr : eptrs) { - if (eptr) { - std::rethrow_exception(eptr); - } + if (eptr) std::rethrow_exception(eptr); } } + // Does a batch resize of the remaining data in old_buckets_. Assumes all the + // locks have already been taken. + void rehash_with_workers() noexcept { + locks_t &current_locks = get_current_locks(); + parallel_exec_noexcept( + 0, current_locks.size(), + [this](size_type start, size_type end) { + for (size_type i = start; i < end; ++i) { + rehash_lock(i); + } + }); + num_remaining_lazy_rehash_locks(0); + } + // Deletion functions // Removes an item from a bucket, decrementing the associated counter as @@ -1793,12 +1964,15 @@ private: // Empties the table, calling the destructors of all the elements it removes // from the table. It assumes the locks are taken as necessary. - cuckoo_status cuckoo_clear() { + void cuckoo_clear() { buckets_.clear(); + // This will also clear out any data in old_buckets and delete it, if we + // haven't already. + num_remaining_lazy_rehash_locks(0); for (spinlock &lock : get_current_locks()) { lock.elem_counter() = 0; + lock.is_migrated() = true; } - return ok; } // Rehashing functions @@ -1840,6 +2014,30 @@ private: locks_t &get_current_locks() const { return all_locks_.back(); } + // Get/set/decrement num remaining lazy rehash locks. If we reach 0 remaining + // lazy locks, we can deallocate the memory in old_buckets_.
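The countdown just described can be sketched on its own (LazyRehashCountdown and free_old_buckets are illustrative stand-ins; the real accessors follow in the diff):

    #include <atomic>
    #include <cassert>
    #include <cstddef>

    // Many threads lazily rehash locks concurrently; each decrements the
    // remaining count, and whichever thread performs the final decrement
    // frees the old bucket storage. fetch_sub with acq_rel ordering makes
    // every earlier migration visible to the thread that does the freeing.
    struct LazyRehashCountdown {
      std::atomic<std::size_t> remaining{0};

      void decrement_and_maybe_free() {
        const std::size_t old_value =
            remaining.fetch_sub(1, std::memory_order_acq_rel);
        assert(old_value >= 1);
        if (old_value == 1) {
          free_old_buckets();  // stand-in for old_buckets_.clear_and_deallocate()
        }
      }

      void free_old_buckets() { /* release the old bucket storage here */ }
    };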
+ size_type num_remaining_lazy_rehash_locks() const { + return num_remaining_lazy_rehash_locks_.load( + std::memory_order_acquire); + } + + void num_remaining_lazy_rehash_locks(size_type n) const { + num_remaining_lazy_rehash_locks_.store( + n, std::memory_order_release); + if (n == 0) { + old_buckets_.clear_and_deallocate(); + } + } + + void decrement_num_remaining_lazy_rehash_locks() const { + size_type old_num_remaining = num_remaining_lazy_rehash_locks_.fetch_sub( + 1, std::memory_order_acq_rel); + assert(old_num_remaining >= 1); + if (old_num_remaining == 1) { + old_buckets_.clear_and_deallocate(); + } + } + // Member variables // The hash function @@ -1851,7 +2049,18 @@ private: // container of buckets. The size or memory location of the buckets cannot be // changed unless all the locks are taken on the table. Thus, it is only safe // to access the buckets_ container when you have at least one lock held. - buckets_t buckets_; + // + // Marked mutable so that const methods can rehash into this container when + // necessary. + mutable buckets_t buckets_; + + // An old container of buckets, containing data that may not have been + // rehashed into the current one. If valid, this will always have a hashpower + // exactly one less than the one in buckets_. + // + // Marked mutable so that const methods can rehash into this container when + // necessary. + mutable buckets_t old_buckets_; // A linked list of all lock containers. We never discard lock containers, // since there is currently no mechanism for detecting when all threads are @@ -1865,18 +2074,45 @@ private: // mutable so that const methods can access and take locks. mutable all_locks_t all_locks_; - // stores the minimum load factor allowed for automatic expansions. Whenever + // A small wrapper around std::atomic to make it copyable for constructors. + template + class CopyableAtomic : public std::atomic { + public: + using std::atomic::atomic; + + CopyableAtomic(const CopyableAtomic& other) noexcept + : CopyableAtomic(other.load(std::memory_order_acquire)) {} + + CopyableAtomic& operator=(const CopyableAtomic& other) noexcept { + this->store(other.load(std::memory_order_acquire), + std::memory_order_release); + return *this; + } + }; + + // We keep track of the number of remaining locks in the latest locks array, + // that remain to be rehashed. Once this reaches 0, we can free the memory of + // the old buckets. It should only be accessed or modified when + // lazy-rehashing a lock, so not in the common case. + // + // Marked mutable so that we can modify this during rehashing. + mutable CopyableAtomic num_remaining_lazy_rehash_locks_; + + // Stores the minimum load factor allowed for automatic expansions. Whenever // an automatic expansion is triggered (during an insertion where cuckoo // hashing fails, for example), we check the load factor against this // double, and throw an exception if it's lower than this value. It can be // used to signal when the hash function is bad or the input adversarial. - std::atomic minimum_load_factor_; + CopyableAtomic minimum_load_factor_; // stores the maximum hashpower allowed for any expansions. If set to // NO_MAXIMUM_HASHPOWER, this limit will be disregarded. - std::atomic maximum_hashpower_; + CopyableAtomic maximum_hashpower_; + + // Maximum number of extra threads to spawn when doing any large batch + // operations. 
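The reason a wrapper is needed at all: std::atomic deletes its copy operations, and that deletion propagates to any class holding one, which would silently delete the = default copy and move constructors this patch introduces for cuckoohash_map. A compile-time illustration (HoldsAtomic, HoldsCopyable, and CopyableAtomicSketch are throwaway names):

    #include <atomic>
    #include <type_traits>

    // A plain atomic member deletes the enclosing class's implicit copy
    // constructor...
    struct HoldsAtomic {
      std::atomic<double> v{0.0};
    };
    static_assert(!std::is_copy_constructible<HoldsAtomic>::value,
                  "copying is deleted through the atomic member");

    // ...whereas a load/store-based wrapper restores it, at the cost of the
    // copy itself not being atomic (acceptable here, since the documented
    // precondition is that the copied-from table is not being modified
    // concurrently).
    template <typename T>
    struct CopyableAtomicSketch : std::atomic<T> {
      using std::atomic<T>::atomic;
      CopyableAtomicSketch(const CopyableAtomicSketch &o) noexcept
          : CopyableAtomicSketch(o.load(std::memory_order_acquire)) {}
      CopyableAtomicSketch &operator=(const CopyableAtomicSketch &o) noexcept {
        this->store(o.load(std::memory_order_acquire),
                    std::memory_order_release);
        return *this;
      }
    };

    struct HoldsCopyable {
      CopyableAtomicSketch<double> v{0.0};
    };
    static_assert(std::is_copy_constructible<HoldsCopyable>::value,
                  "the wrapper restores copyability");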
+ CopyableAtomic max_num_worker_threads_; - uint32_t max_resize_threads_{std::numeric_limits::max()}; public: /** * An ownership wrapper around a @ref cuckoohash_map table instance. When @@ -2175,6 +2411,14 @@ public: return map_.get().maximum_hashpower(); } + void max_num_worker_threads(size_type extra_threads) { + map_.get().max_num_worker_threads(extra_threads); + } + + size_type max_num_worker_threads() const { + return map_.get().max_num_worker_threads(); + } + /**@}*/ /** @name Iterators */ @@ -2415,12 +2659,15 @@ public: /**@}*/ private: - // The constructor locks the entire table. We keep this constructor - // private (but expose it to the cuckoohash_map class), since we don't - // want users calling it. + // The constructor locks the entire table. We keep this constructor private + // (but expose it to the cuckoohash_map class), since we don't want users + // calling it. We also complete any remaining rehashing in the table, so + // that everything is in map.buckets_. locked_table(cuckoohash_map &map) noexcept : map_(map), - all_locks_manager_(map.snapshot_and_lock_all(normal_mode())) {} + all_locks_manager_(map.lock_all(normal_mode())) { + map.rehash_with_workers(); + } // Dispatchers for methods on cuckoohash_map @@ -2477,8 +2724,6 @@ public: }; }; -namespace std { - /** * Specializes the @c std::swap algorithm for @c cuckoohash_map. Calls @c * lhs.swap(rhs). @@ -2495,6 +2740,6 @@ void swap( lhs.swap(rhs); } -} // namespace std +} // namespace libcuckoo #endif // _CUCKOOHASH_MAP_HH diff --git a/include/cuckoohash_util.hh b/include/cuckoohash_util.hh index 31e161798..f8a1f734f 100644 --- a/include/cuckoohash_util.hh +++ b/include/cuckoohash_util.hh @@ -9,6 +9,8 @@ #include #include +namespace libcuckoo { + #if LIBCUCKOO_DEBUG //! When \ref LIBCUCKOO_DEBUG is 0, LIBCUCKOO_DBG will printing out status //! messages in various situations @@ -70,14 +72,14 @@ * function does not properly distribute keys, or for certain adversarial * workloads. */ -class libcuckoo_load_factor_too_low : public std::exception { +class load_factor_too_low : public std::exception { public: /** * Constructor * * @param lf the load factor of the table when the exception was thrown */ - libcuckoo_load_factor_too_low(const double lf) : load_factor_(lf) {} + load_factor_too_low(const double lf) noexcept : load_factor_(lf) {} /** * @return a descriptive error message @@ -90,7 +92,7 @@ public: /** * @return the load factor of the table when the exception was thrown */ - double load_factor() const { return load_factor_; } + double load_factor() const noexcept { return load_factor_; } private: const double load_factor_; @@ -101,14 +103,14 @@ private: * than the maximum, which can be set with the \ref * cuckoohash_map::maximum_hashpower method. 
*/ -class libcuckoo_maximum_hashpower_exceeded : public std::exception { +class maximum_hashpower_exceeded : public std::exception { public: /** * Constructor * * @param hp the hash power we were trying to expand to */ - libcuckoo_maximum_hashpower_exceeded(const size_t hp) : hashpower_(hp) {} + maximum_hashpower_exceeded(const size_t hp) noexcept : hashpower_(hp) {} /** * @return a descriptive error message @@ -120,10 +122,12 @@ public: /** * @return the hashpower we were trying to expand to */ - size_t hashpower() const { return hashpower_; } + size_t hashpower() const noexcept { return hashpower_; } private: const size_t hashpower_; }; +} // namespace libcuckoo + #endif // _CUCKOOHASH_UTIL_HH diff --git a/include/libcuckoo_lazy_array.hh b/include/libcuckoo_lazy_array.hh deleted file mode 100644 index 3bc920f34..000000000 --- a/include/libcuckoo_lazy_array.hh +++ /dev/null @@ -1,194 +0,0 @@ -/** \file */ - -#ifndef _LIBCUCKOO_LAZY_ARRAY_HH -#define _LIBCUCKOO_LAZY_ARRAY_HH - -#include -#include -#include -#include - -#include "cuckoohash_util.hh" - -/** - * A fixed-size array, broken up into segments that are dynamically allocated - * upon request. It is the user's responsibility to make sure they only access - * allocated parts of the array. - * - * @tparam OFFSET_BITS the number of bits of the index used as the offset within - * a segment - * @tparam SEGMENT_BITS the number of bits of the index used as the segment - * index - * @tparam T the type of stored in the container - * @tparam Alloc the allocator used to allocate data - */ -template > -class libcuckoo_lazy_array { -public: - using value_type = T; - using allocator_type = Alloc; - -private: - using traits_ = std::allocator_traits; - -public: - using size_type = std::size_t; - using reference = value_type&; - using const_reference = const value_type&; - - static_assert(SEGMENT_BITS + OFFSET_BITS <= sizeof(size_type) * 8, - "The number of segment and offset bits cannot exceed " - " the number of bits in a size_type"); - - /** - * Default constructor. Creates an empty array with no allocated segments. - */ - libcuckoo_lazy_array(const allocator_type& allocator = Alloc()) noexcept( - noexcept(Alloc())) - : segments_{{nullptr}}, allocated_segments_(0), allocator_(allocator) {} - - /** - * Constructs an array with enough segments allocated to fit @p target - * elements. Each allocated element is default-constructed. - * - * @param target the number of elements to allocate space for - */ - libcuckoo_lazy_array( - size_type target, - const allocator_type& allocator = Alloc()) noexcept(noexcept(Alloc())) - : libcuckoo_lazy_array(allocator) { - segments_.fill(nullptr); - resize(target); - } - - libcuckoo_lazy_array(const libcuckoo_lazy_array&) = delete; - libcuckoo_lazy_array& operator=(const libcuckoo_lazy_array&) = delete; - - /** - * Move constructor - * - * @param arr the array being moved - */ - libcuckoo_lazy_array(libcuckoo_lazy_array&& arr) noexcept - : segments_(arr.segments_), allocated_segments_(arr.allocated_segments_), - allocator_(std::move(arr.allocator_)) { - // Deactivate the array by setting its allocated segment count to 0 - arr.allocated_segments_ = 0; - } - - /** - * Destructor. Destroys all elements allocated in the array. - */ - ~libcuckoo_lazy_array() noexcept(std::is_nothrow_destructible::value) { - clear(); - } - - /** - * Destroys all elements allocated in the array. 
- */ - void clear() { - for (size_type i = 0; i < allocated_segments_; ++i) { - destroy_array(segments_[i]); - segments_[i] = nullptr; - } - } - - /** - * Index operator - * - * @return a reference to the data at the given index - */ - reference operator[](size_type i) { - assert(get_segment(i) < allocated_segments_); - return segments_[get_segment(i)][get_offset(i)]; - } - - /** - * Const index operator - * - * @return a const reference to the data at the given index - */ - const_reference operator[](size_type i) const { - assert(get_segment(i) < allocated_segments_); - return segments_[get_segment(i)][get_offset(i)]; - } - - /** - * Returns the number of elements the array has allocated space for - * - * @return current size of the array - */ - size_type size() const { return allocated_segments_ * SEGMENT_SIZE; } - - /** - * Returns the maximum number of elements the array can hold - * - * @return maximum size of the array - */ - static constexpr size_type max_size() { - return 1UL << (OFFSET_BITS + SEGMENT_BITS); - } - - /** - * Allocate enough space for @p target elements, not exceeding the capacity - * of the array. Under no circumstance will the array be shrunk. - * - * @param target the number of elements to ensure space is allocated for - */ - void resize(size_type target) { - target = std::min(target, max_size()); - if (target == 0) { - return; - } - const size_type last_segment = get_segment(target - 1); - for (size_type i = allocated_segments_; i <= last_segment; ++i) { - segments_[i] = create_array(); - } - allocated_segments_ = last_segment + 1; - } - -private: - static constexpr size_type SEGMENT_SIZE = 1UL << OFFSET_BITS; - static constexpr size_type NUM_SEGMENTS = 1UL << SEGMENT_BITS; - static constexpr size_type OFFSET_MASK = SEGMENT_SIZE - 1; - - std::array segments_; - size_type allocated_segments_; - allocator_type allocator_; - - static size_type get_segment(size_type i) { return i >> OFFSET_BITS; } - - static size_type get_offset(size_type i) { return i & OFFSET_MASK; } - - // Allocates a SEGMENT_SIZE-sized array and default-initializes each element - typename traits_::pointer create_array() { - typename traits_::pointer arr = traits_::allocate(allocator_, SEGMENT_SIZE); - // Initialize all the elements, safely deallocating and destroying - // everything in case of error. - size_type i; - try { - for (i = 0; i < SEGMENT_SIZE; ++i) { - traits_::construct(allocator_, &arr[i]); - } - } catch (...) { - for (size_type j = 0; j < i; ++j) { - traits_::destroy(allocator_, &arr[j]); - } - traits_::deallocate(allocator_, arr, SEGMENT_SIZE); - throw; - } - return arr; - } - - // Destroys every element of a SEGMENT_SIZE-sized array and then deallocates - // the memory. 
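Though the file is being deleted, create_array above is a compact example of the construct-or-roll-back allocator idiom and is worth recording in standalone form (create_segment is an illustrative name; nothing beyond the standard library is assumed):

    #include <cstddef>
    #include <memory>

    // If constructing element i throws, destroy elements [0, i) and
    // deallocate before rethrowing, so the caller never observes a
    // half-initialized segment. This mirrors create_array/destroy_array.
    template <typename Alloc>
    typename std::allocator_traits<Alloc>::pointer
    create_segment(Alloc &alloc, std::size_t n) {
      using traits = std::allocator_traits<Alloc>;
      typename traits::pointer arr = traits::allocate(alloc, n);
      std::size_t i = 0;
      try {
        for (; i < n; ++i) {
          traits::construct(alloc, std::addressof(arr[i]));
        }
      } catch (...) {
        for (std::size_t j = 0; j < i; ++j) {
          traits::destroy(alloc, std::addressof(arr[j]));
        }
        traits::deallocate(alloc, arr, n);
        throw;
      }
      return arr;
    }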
- void destroy_array(typename traits_::pointer arr) { - for (size_type i = 0; i < SEGMENT_SIZE; ++i) { - traits_::destroy(allocator_, &arr[i]); - } - traits_::deallocate(allocator_, arr, SEGMENT_SIZE); - } -}; - -#endif // _LIBCUCKOO_LAZY_ARRAY_HH diff --git a/include/lightweightsemaphore.h b/include/lightweightsemaphore.h index b4b184186..591313604 100644 --- a/include/lightweightsemaphore.h +++ b/include/lightweightsemaphore.h @@ -117,6 +117,7 @@ class Semaphore assert(initialCount >= 0); kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); assert(rc == KERN_SUCCESS); + (void)rc; } ~Semaphore() @@ -176,6 +177,7 @@ class Semaphore assert(initialCount >= 0); int rc = sem_init(&m_sema, 0, initialCount); assert(rc == 0); + (void)rc; } ~Semaphore() @@ -208,8 +210,8 @@ class Semaphore const int usecs_in_1_sec = 1000000; const int nsecs_in_1_sec = 1000000000; clock_gettime(CLOCK_REALTIME, &ts); - ts.tv_sec += usecs / usecs_in_1_sec; - ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000; + ts.tv_sec += (time_t)(usecs / usecs_in_1_sec); + ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000; // sem_timedwait bombs if you have more than 1e9 in tv_nsec // so we have to clean things up before passing it in if (ts.tv_nsec >= nsecs_in_1_sec) { diff --git a/scripts/fetchPufferfish.sh b/scripts/fetchPufferfish.sh index eb0f883ee..a886ca3dc 100755 --- a/scripts/fetchPufferfish.sh +++ b/scripts/fetchPufferfish.sh @@ -22,11 +22,10 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then rm -fr ${INSTALL_DIR}/src/pufferfish fi -#SVER=salmon-v1.1.0 -SVER=develop -#SVER=pe-chaining +SVER=salmon-v1.2.0 +#SVER=develop -EXPECTED_SHA256=aa51d7123f38568b1c9da5ab4f925b1705c5b3569048911b0aae4ea5cf7edac7 +EXPECTED_SHA256=4c5958db43e30894c80171a535808000afac49bae23e356d4d3fbfe44d9597ad mkdir -p ${EXTERNAL_DIR} curl -k -L https://github.com/COMBINE-lab/pufferfish/archive/${SVER}.zip -o ${EXTERNAL_DIR}/pufferfish.zip @@ -84,6 +83,7 @@ cp ${EXTERNAL_DIR}/pufferfish/include/CommonTypes.hpp ${INSTALL_DIR}/include/puf cp ${EXTERNAL_DIR}/pufferfish/include/SAMWriter.hpp ${INSTALL_DIR}/include/pufferfish cp ${EXTERNAL_DIR}/pufferfish/include/PufferfishConfig.hpp ${INSTALL_DIR}/include/pufferfish cp ${EXTERNAL_DIR}/pufferfish/include/BinWriter.hpp ${INSTALL_DIR}/include/pufferfish +cp -r ${EXTERNAL_DIR}/pufferfish/include/libdivide ${INSTALL_DIR}/include/pufferfish cp -r ${EXTERNAL_DIR}/pufferfish/include/ksw2pp ${INSTALL_DIR}/include/pufferfish cp -r ${EXTERNAL_DIR}/pufferfish/include/compact_vector ${INSTALL_DIR}/include/pufferfish cp -r ${EXTERNAL_DIR}/pufferfish/include/metro ${INSTALL_DIR}/include/pufferfish diff --git a/src/Alevin.cpp b/src/Alevin.cpp index 5fd37fd56..7cdd19900 100644 --- a/src/Alevin.cpp +++ b/src/Alevin.cpp @@ -298,6 +298,25 @@ uint32_t getLeftBoundary(std::vector& sortedIdx, return 0; } + +void dumpFeatures(std::string& frequencyFileName, + std::vector& sortedIdx, + const std::vector& freqCounter, + std::unordered_map>& colMap + ){ + std::ofstream freqFile; + freqFile.open(frequencyFileName); + for (auto i:sortedIdx){ + uint32_t count = freqCounter[i]; + if (count == 0){ + break; + } + freqFile << colMap[i] << "\t" + << count << "\n"; + } + freqFile.close(); +} + /* Knee calculation and sample true set of barcodes */ @@ -312,7 +331,6 @@ void sampleTrueBarcodes(const std::vector& freqCounter, double lowConfidenceFraction { 0.5 }; uint32_t topxBarcodes = std::min(maxNumBarcodes, freqCounter.size()); uint64_t history { 0 }; - uint32_t threshold; if (aopt.forceCells > 0) { 
topxBarcodes = aopt.forceCells - 1; @@ -321,6 +339,9 @@ void sampleTrueBarcodes(const std::vector& freqCounter, --topxBarcodes; ++belowThresholdCb; } + + // converting 0 index to 1 index + ++topxBarcodes; aopt.jointLog->info("Throwing {} barcodes with < {} reads", belowThresholdCb, aopt.freqThreshold); aopt.jointLog->flush(); } @@ -339,7 +360,7 @@ void sampleTrueBarcodes(const std::vector& freqCounter, topxBarcodes = maxNumCells; for(size_t i=baselineBcs; i& freqCounter, green, lowRegionNumBarcodes, RESET_COLOR); aopt.jointLog->flush(); - threshold = topxBarcodes; - - if(aopt.dumpfeatures){ - auto frequencyFileName = aopt.outputDirectory / "raw_cb_frequency.txt"; - std::ofstream freqFile; - freqFile.open(frequencyFileName.string()); - for (auto i:sortedIdx){ - uint32_t count = freqCounter[i]; - if (count == 0){ - break; - } - freqFile << colMap[i] << "\t" - << count << "\n"; - topxBarcodes--; - } - freqFile.close(); + if (aopt.dumpfeatures) { + auto frequencyFileName = (aopt.outputDirectory / "raw_cb_frequency.txt").string(); + dumpFeatures(frequencyFileName, sortedIdx, freqCounter, colMap); } - for (size_t i=0; i& barcodeFiles, } aopt.jointLog->info("Total {} white-listed Barcodes", trueBarcodes.size()); + + if (aopt.dumpfeatures) { + aopt.jointLog->info("Sorting and dumping raw barcodes"); + auto frequencyFileName = (aopt.outputDirectory / "raw_cb_frequency.txt").string(); + std::vector collapsedfrequency; + std::unordered_map> collapMap; + size_t ind{0}; + for(auto fqIt = freqCounter.begin(); fqIt != freqCounter.end(); ++fqIt){ + collapsedfrequency.push_back(fqIt.value()); + collapMap[ind] = fqIt.key_sv(); + ind += 1; + } + + std::vector sortedIdx = sort_indexes(collapsedfrequency); + dumpFeatures(frequencyFileName, sortedIdx, collapsedfrequency, collapMap); + } } else { std::vector collapsedfrequency; @@ -794,10 +818,10 @@ void initiatePipeline(AlevinOpts& aopt, SalmonOpts& sopt, OrderedOptionsT& orderedOptions, boost::program_options::variables_map& vm, - std::string commentString, + std::string commentString, bool noTgMap, std::vector barcodeFiles, std::vector readFiles){ - bool isOptionsOk = aut::processAlevinOpts(aopt, sopt, vm); + bool isOptionsOk = aut::processAlevinOpts(aopt, sopt, noTgMap, vm); if (!isOptionsOk){ exit(1); } @@ -834,7 +858,7 @@ void initiatePipeline(AlevinOpts& aopt, txpInfoFile.string(), txpLengthFile.string(), headerFile.string(), - aopt.jointLog); + aopt.jointLog, noTgMap); // If we're supposed to be quiet, set the global logger level to >= warn if (aopt.quiet) { @@ -955,8 +979,10 @@ salmon-based processing of single-cell RNA-seq data. red[3] = '0' + static_cast(fmt::RED); + bool noTgMap {false}; bool dropseq = vm["dropseq"].as(); bool indrop = vm["indrop"].as(); + bool citeseq = vm["citeseq"].as(); bool chromV3 = vm["chromiumV3"].as(); bool chrom = vm["chromium"].as(); bool gemcode = vm["gemcode"].as(); @@ -970,6 +996,7 @@ salmon-based processing of single-cell RNA-seq data. uint8_t validate_num_protocols {0}; if (dropseq) validate_num_protocols += 1; if (indrop) validate_num_protocols += 1; + if (citeseq) { validate_num_protocols += 1; noTgMap = true;} if (chromV3) validate_num_protocols += 1; if (chrom) validate_num_protocols += 1; if (gemcode) validate_num_protocols += 1; @@ -1005,7 +1032,7 @@ salmon-based processing of single-cell RNA-seq data. 
AlevinOpts aopt; //aopt.jointLog->warn("Using DropSeq Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, barcodeFiles, readFiles); } else if(indrop){ @@ -1017,7 +1044,7 @@ salmon-based processing of single-cell RNA-seq data. aopt.protocol.setW1(w1); //aopt.jointLog->warn("Using InDrop Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, barcodeFiles, readFiles); } else{ @@ -1025,53 +1052,69 @@ salmon-based processing of single-cell RNA-seq data. exit(1); } } + else if(citeseq){ + if(vm.count("featureStart") != 0 and vm.count("featureLength") != 0){ + AlevinOpts aopt; + aopt.protocol.setFeatureLength(vm["featureLength"].as()); + aopt.protocol.setFeatureStart(vm["featureStart"].as()); + + //aopt.jointLog->warn("Using InDrop Setting for Alevin"); + initiatePipeline(aopt, sopt, orderedOptions, + vm, commentString, noTgMap, + barcodeFiles, readFiles); + } + else{ + fmt::print(stderr, "ERROR: citeseq needs featureStart and featureLength flag too.\n Exiting Now"); + exit(1); + } + } else if(chromV3){ AlevinOpts aopt; //aopt.jointLog->warn("Using 10x v3 Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, barcodeFiles, readFiles); } else if(chrom){ AlevinOpts aopt; //aopt.jointLog->warn("Using 10x v2 Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, barcodeFiles, readFiles); } else if(gemcode){ AlevinOpts aopt; //aopt.jointLog->warn("Using 10x v1 Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, unmateFiles, readFiles); } else if(celseq){ AlevinOpts aopt; //aopt.jointLog->warn("Using CEL-Seq Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, barcodeFiles, readFiles); } else if(celseq2){ AlevinOpts aopt; //aopt.jointLog->warn("Using CEL-Seq2 Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, barcodeFiles, readFiles); } else if(quartzseq2){ AlevinOpts aopt; //aopt.jointLog->warn("Using Quartz-Seq2 Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, barcodeFiles, readFiles); } else{ AlevinOpts aopt; //aopt.jointLog->warn("Using Custom Setting for Alevin"); initiatePipeline(aopt, sopt, orderedOptions, - vm, commentString, + vm, commentString, noTgMap, barcodeFiles, readFiles); } diff --git a/src/AlevinHash.cpp b/src/AlevinHash.cpp index dcdcb3d36..6d21e54c4 100644 --- a/src/AlevinHash.cpp +++ b/src/AlevinHash.cpp @@ -122,7 +122,7 @@ size_t readBfh(bfs::path& eqFilePath, } // end else case of not hasWhitelist } // end name/index rearrangement - countMap.set_max_resize_threads(1); + countMap.max_num_worker_threads(1); countMap.reserve(1000000); alevin::types::AlevinUMIKmer umiObj; @@ -290,6 +290,10 @@ int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); template +int salmonHashQuantify(AlevinOpts& aopt, + bfs::path& outputDirectory, + CFreqMapT& freqCounter); +template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); @@ -309,3 +313,4 @@ template int salmonHashQuantify(AlevinOpts& aopt, bfs::path& outputDirectory, CFreqMapT& freqCounter); + diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp 
index 77b24f0d4..3dda707b2 100644 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -40,6 +40,68 @@ namespace alevin { std::vector nts{"A", "T", "C", "G"}; + template <> + void getReadSequence(apt::CITESeq& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq.substr(protocol.featureStart, + protocol.featureLength); + } + template <> + void getReadSequence(apt::DropSeq& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> + void getReadSequence(apt::Chromium& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> + void getReadSequence(apt::ChromiumV3& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> + void getReadSequence(apt::CELSeq& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> + void getReadSequence(apt::CELSeq2& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> + void getReadSequence(apt::QuartzSeq2& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> + void getReadSequence(apt::Custom& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> + void getReadSequence(apt::Gemcode& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> + void getReadSequence(apt::InDrop& protocol, + std::string& seq, + std::string& subseq){ + subseq = seq; + } + template <> bool extractUMI(std::string& read, apt::DropSeq& pt, @@ -48,6 +110,13 @@ namespace alevin { return true; } template <> + bool extractUMI(std::string& read, + apt::CITESeq& pt, + std::string& umi){ + umi = read.substr(pt.barcodeLength, pt.umiLength); + return true; + } + template <> bool extractUMI(std::string& read, apt::Chromium& pt, std::string& umi){ @@ -72,6 +141,19 @@ namespace alevin { bool extractUMI(std::string& read, apt::Custom& pt, std::string& umi){ + if ( pt.end == BarcodeEnd::FIVE ) { + umi = read.substr(pt.barcodeLength, pt.umiLength); + } else if (pt.end == BarcodeEnd::THREE ) { + umi = read.substr(0, pt.umiLength); + } else { + return false; + } + return true; + } + template <> + bool extractUMI(std::string& read, + apt::QuartzSeq2& pt, + std::string& umi){ umi = read.substr(pt.barcodeLength, pt.umiLength); return true; } @@ -83,13 +165,6 @@ namespace alevin { return true; } template <> - bool extractUMI(std::string& read, - apt::QuartzSeq2& pt, - std::string& umi){ - umi = read.substr(0, pt.umiLength); - return true; - } - template <> bool extractUMI(std::string& read, apt::CELSeq& pt, std::string& umi){ @@ -109,44 +184,49 @@ namespace alevin { apt::DropSeq& pt){ return (read.length() >= pt.barcodeLength) ? nonstd::optional(read.substr(0, pt.barcodeLength)) : nonstd::nullopt; - // = read.substr(0, pt.barcodeLength); - //return true; + } + template <> + nonstd::optional extractBarcode(std::string& read, + apt::CITESeq& pt){ + return (read.length() >= pt.barcodeLength) ? + nonstd::optional(read.substr(0, pt.barcodeLength)) : nonstd::nullopt; } template <> nonstd::optional extractBarcode(std::string& read, apt::ChromiumV3& pt){ return (read.length() >= pt.barcodeLength) ? nonstd::optional(read.substr(0, pt.barcodeLength)) : nonstd::nullopt; - //return (read.length() >= pt.barcodeLength) ? 
(bc.append(read.data(), pt.barcodeLength), true) : false; - //bc = read.substr(0, pt.barcodeLength); - //return true; } template <> nonstd::optional extractBarcode(std::string& read, apt::Chromium& pt){ return (read.length() >= pt.barcodeLength) ? nonstd::optional(read.substr(0, pt.barcodeLength)) : nonstd::nullopt; - //return (read.length() >= pt.barcodeLength) ? (bc.append(read.data(), pt.barcodeLength), true) : false; - //bc = read.substr(0, pt.barcodeLength); - //return true; } template <> nonstd::optional extractBarcode(std::string& read, apt::Gemcode& pt){ return (read.length() >= pt.barcodeLength) ? nonstd::optional(read.substr(0, pt.barcodeLength)) : nonstd::nullopt; - //return (read.length() >= pt.barcodeLength) ? (bc.append(read.data(), pt.barcodeLength), true) : false; - //bc = read.substr(0, pt.barcodeLength); - //return true; } template <> nonstd::optional extractBarcode(std::string& read, apt::Custom& pt){ + if (pt.end == BarcodeEnd::FIVE) { + return (read.length() >= pt.barcodeLength) ? + nonstd::optional(read.substr(0, pt.barcodeLength)) : nonstd::nullopt; + } else if (pt.end == BarcodeEnd::THREE) { + return (read.length() >= (pt.umiLength + pt.barcodeLength)) ? + nonstd::optional(read.substr(pt.umiLength, pt.barcodeLength)) : nonstd::nullopt; + } else { + return nonstd::nullopt; + } + } + template <> + nonstd::optional extractBarcode(std::string& read, + apt::QuartzSeq2& pt){ return (read.length() >= pt.barcodeLength) ? nonstd::optional(read.substr(0, pt.barcodeLength)) : nonstd::nullopt; - //return (read.length() >= pt.barcodeLength) ? (bc.append(read.data(), pt.barcodeLength), true) : false; - //bc = read.substr(0, pt.barcodeLength); - //return true; } template <> nonstd::optional extractBarcode(std::string& read, @@ -154,13 +234,6 @@ namespace alevin { return (read.length() >= (pt.umiLength + pt.barcodeLength)) ? nonstd::optional(read.substr(pt.umiLength, pt.barcodeLength)) : nonstd::nullopt; - } - template <> - nonstd::optional extractBarcode(std::string& read, - apt::QuartzSeq2& pt){ - return (read.length() >= (pt.umiLength + pt.barcodeLength)) ? 
- nonstd::optional(read.substr(pt.umiLength, pt.barcodeLength)) : nonstd::nullopt; - } template <> nonstd::optional extractBarcode(std::string& read, @@ -270,7 +343,8 @@ namespace alevin { const std::string& refNamesFile, const std::string& refLengthFile, const std::string& headerFile, - std::shared_ptr& jointLog){ + std::shared_ptr& jointLog, + bool noTgMap){ size_t kSize, numDupTxps{0}; uint64_t numberOfDecoys, firstDecoyIndex; spp::sparse_hash_map> txpIdxMap; @@ -331,38 +405,45 @@ namespace alevin { txpNames.size() - numberOfDecoys - numShort - numDupTxps , numberOfDecoys, numShort, numDupTxps); - std::ifstream t2gFile(t2gFileName); - uint32_t gid, geneCount{0}; - std::vector tids; - std::string tStr, gStr; - if(t2gFile.is_open()) { - while( not t2gFile.eof() ) { - t2gFile >> tStr >> gStr; - - if(not txpIdxMap.contains(tStr) ){ - continue; - } + if (noTgMap){ // mapping the protein name to itself + for (size_t i=0; i tids; + std::string tStr, gStr; + if(t2gFile.is_open()) { + while( not t2gFile.eof() ) { + t2gFile >> tStr >> gStr; + + if(not txpIdxMap.contains(tStr) ){ + continue; + } - tids = txpIdxMap[tStr]; - if (geneIdxMap.contains(gStr)){ - gid = geneIdxMap[gStr]; - } - else{ - gid = geneCount; - geneIdxMap[gStr] = gid; - geneCount++; - } + tids = txpIdxMap[tStr]; + if (geneIdxMap.contains(gStr)){ + gid = geneIdxMap[gStr]; + } + else{ + gid = geneCount; + geneIdxMap[gStr] = gid; + geneCount++; + } - for (auto tid: tids) { - if (txpToGeneMap.find(tid) != txpToGeneMap.end() && - txpToGeneMap[tid] != gid ) { - jointLog->warn("Dual txp to gene map for txp {}", txpNames[tid]); + for (auto tid: tids) { + if (txpToGeneMap.find(tid) != txpToGeneMap.end() && + txpToGeneMap[tid] != gid ) { + jointLog->warn("Dual txp to gene map for txp {}", txpNames[tid]); + } + txpToGeneMap[tid] = gid; } - txpToGeneMap[tid] = gid; } + t2gFile.close(); } - t2gFile.close(); - } + } // done parsing txp to gene map file jointLog->info( "Filled with {} txp to gene entries ", txpToGeneMap.size()); for ( auto it: txpIdxMap ) { @@ -387,7 +468,7 @@ namespace alevin { template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm){ namespace bfs = boost::filesystem; namespace po = boost::program_options; @@ -462,18 +543,20 @@ namespace alevin { } } - if (vm.count("tgMap")){ - aopt.geneMapFile = vm["tgMap"].as(); - if (!bfs::exists(aopt.geneMapFile)) { - fmt::print(stderr,"\nTranscript to Gene Map File {} does not exists\n Exiting Now", - aopt.geneMapFile.string()); + if (not noTgMap) { + if (vm.count("tgMap")){ + aopt.geneMapFile = vm["tgMap"].as(); + if (!bfs::exists(aopt.geneMapFile)) { + fmt::print(stderr,"\nTranscript to Gene Map File {} does not exist\n Exiting Now", + aopt.geneMapFile.string()); + return false; + } + } + else{ + fmt::print(stderr,"\nTranscript to Gene Map File not provided\n Exiting Now"); return false; } } - else{ - fmt::print(stderr,"\nTranscript to Gene Map File not provided\n Exiting Now"); - return false; - } //create logger spdlog::set_async_mode(131072); @@ -536,7 +619,7 @@ namespace alevin { "This is not recommended way to run the pipeline," "and it might slow the pipeline", aopt.keepCBFraction); - } + } else if (noTgMap) { aopt.keepCBFraction = 1.0; } if (not vm.count("threads")) { auto tot_cores = std::thread::hardware_concurrency(); @@ -549,9 +632,6 @@ namespace alevin { } // things which needs to be updated for salmonOpts // validate customized options for custom protocol - // NOTE : @k3yavi, I tried 
to clean this up a little bit - // because the previous logic was a little complicated. - // Please look over the below. bool haveCustomEnd = vm.count("end"); bool haveCustomBC= vm.count("barcodeLength"); bool haveCustomUMI = vm.count("umiLength"); @@ -654,20 +734,24 @@ namespace alevin { // enable validate mappings sopt.validateMappings = true; - // @k3yavi --- doing this for now as a result of our testing. - // let me know if you have any other thoughts. - sopt.hitFilterPolicyStr = "BEFORE"; + sopt.hitFilterPolicyStr = "BOTH"; bool optionsOK = salmon::utils::processQuantOptions(sopt, vm, vm["numBiasSamples"].as()); if (!vm.count("minScoreFraction")) { - sopt.minScoreFraction = alevin::defaults::minScoreFraction; + if (noTgMap) { + int32_t l = vm["featureLength"].as(); + sopt.minScoreFraction = salmon::utils::compute_1_edit_threshold(l, sopt); + aopt.jointLog->info("set CITE-seq minScoreFraction parameter to : {}", + sopt.minScoreFraction); + } else { + sopt.minScoreFraction = alevin::defaults::minScoreFraction; + } + sopt.consensusSlack = alevin::defaults::consensusSlack; sopt.jointLog->info( - "Using default value of {} for minScoreFraction in Alevin\n" - "Using default value of {} for consensusSlack in Alevin", - sopt.minScoreFraction, - sopt.consensusSlack - ); + "Using default value of {} for minScoreFraction in Alevin\n" + "Using default value of {} for consensusSlack in Alevin", + sopt.minScoreFraction, sopt.consensusSlack); } if (!optionsOK) { @@ -788,39 +872,43 @@ namespace alevin { template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, + boost::program_options::variables_map& vm); + template + bool processAlevinOpts(AlevinOpts& aopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); template bool processAlevinOpts(AlevinOpts& aopt, - SalmonOpts& sopt, + SalmonOpts& sopt, bool noTgMap, boost::program_options::variables_map& vm); } } diff --git a/src/BWAUtils.cpp b/src/BWAUtils.cpp deleted file mode 100644 index 90da1de4d..000000000 --- a/src/BWAUtils.cpp +++ /dev/null @@ -1,176 +0,0 @@ -#include "BWAUtils.hpp" - -namespace bwautils { -static void bwt_reverse_intvs(bwtintv_v* p) { - if (p->n > 1) { - int j; - for (j = 0; static_cast(j) < p->n >> 1; ++j) { - bwtintv_t tmp = p->a[p->n - 1 - j]; - p->a[p->n - 1 - j] = p->a[j]; - p->a[j] = tmp; - } - } -} - -// Function modified from bwt_smem1a: -// https://github.com/lh3/bwa/blob/eb428d7d31ced059ad39af2701a22ebe6d175657/bwt.c#L289 -/** - * 
Search for the k-mer of length @len starting at @q. - * Return true if an interval is found for the k-mer and false - * otherwise. The appropriate bwt interval will be placed - * in @resInterval upon success. - * - */ -bool getIntervalForKmer(const bwt_t* bwt, // the bwt index - int len, // k-mer length - const uint8_t* q, // query - bwtintv_t& resInterval) { - int i, j, c, ret; - int x = 0; - bwtintv_t ik, ok[4]; - //bwtintv_v a[2], *prev, *curr, *swap; - bwtintv_v a[2], *curr, *swap; - - if (q[x] > 3) - return false; - // if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 - kv_init(a[0]); - kv_init(a[1]); - //prev = &a[0]; // use the temporary vector if provided - curr = &a[1]; - bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base - ik.info = x + 1; - - for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] < 4) { // an A/C/G/T base - c = 3 - q[i]; // complement of q[i] - bwt_extend(bwt, &ik, ok, 0); - ik = ok[c]; - ik.info = i + 1; - } else { // an ambiguous base - break; // always terminate extension at an ambiguous base; in this case, - // in = 0; - if (q[x] > 3) - return x + 1; - if (min_intv < 1) - min_intv = 1; // the interval size should be at least 1 - kv_init(a[0]); - kv_init(a[1]); - prev = tmpvec && tmpvec[0] ? tmpvec[0] - : &a[0]; // use the temporary vector if provided - curr = tmpvec && tmpvec[1] ? tmpvec[1] : &a[1]; - // bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base - // ik.info = x + 1; - - // ROB: set ik to our initial interval - ik.x[0] = initial_interval.x[0]; - ik.x[1] = initial_interval.x[1]; - ik.x[2] = initial_interval.x[2]; - // Is this last one right? - int k = initial_interval.info; - ik.info = x + k; - - for (i = x + k, curr->n = 0; i < len; ++i) { // forward search - if (ik.x[2] < max_intv) { // an interval small enough - kv_push(bwtintv_t, *curr, ik); - break; - } else if (q[i] < 4) { // an A/C/G/T base - c = 3 - q[i]; // complement of q[i] - bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] != ik.x[2]) { // change of the interval size - kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] < static_cast(min_intv)) - break; // the interval size is too small to be extended further - } - ik = ok[c]; - ik.info = i + 1; - } else { // an ambiguous base - kv_push(bwtintv_t, *curr, ik); - break; // always terminate extension at an ambiguous base; in this case, - // ia[0].info; // this will be the returned value - swap = curr; - curr = prev; - prev = swap; - - for (i = x - 1; i >= -1; --i) { // backward search for MEMs - c = i < 0 - ? -1 - : q[i] < 4 ? 
q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base - for (j = 0, curr->n = 0; static_cast(j) < prev->n; ++j) { - bwtintv_t* p = &prev->a[j]; - if (c >= 0 && ik.x[2] >= max_intv) - bwt_extend(bwt, p, ok, 1); - if (c < 0 || ik.x[2] < max_intv || - ok[c].x[2] < static_cast(min_intv)) { // keep the hit if reaching the beginning or - // an ambiguous base or the intv is small - // enough - if (curr->n == - 0) { // test curr->n>0 to make sure there are no longer matches - if (mem->n == 0 || - static_cast(i) + 1 < mem->a[mem->n - 1].info >> 32) { // skip contained matches - ik = *p; - ik.info |= (uint64_t)(i + 1) << 32; - kv_push(bwtintv_t, *mem, ik); - } - } // otherwise the match is contained in another longer match - } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n - 1].x[2]) { - ok[c].info = p->info; - kv_push(bwtintv_t, *curr, ok[c]); - } - } - if (curr->n == 0) - break; - swap = curr; - curr = prev; - prev = swap; - } - bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate - - if (tmpvec == 0 || tmpvec[0] == 0) - free(a[0].a); - if (tmpvec == 0 || tmpvec[1] == 0) - free(a[1].a); - return ret; -} - -int bwt_smem1_with_kmer(const bwt_t* bwt, int len, const uint8_t* q, int x, - int min_intv, bwtintv_t initial_interval, - bwtintv_v* mem, bwtintv_v* tmpvec[2]) { - return bwt_smem1a_with_kmer(bwt, len, q, x, min_intv, 0, initial_interval, - mem, tmpvec); -} -} // namespace bwautils diff --git a/src/BuildSalmonIndex.cpp b/src/BuildSalmonIndex.cpp index c2f49d4b2..8e97e69ad 100644 --- a/src/BuildSalmonIndex.cpp +++ b/src/BuildSalmonIndex.cpp @@ -60,57 +60,62 @@ int salmonIndex(int argc, const char* argv[]) { bool useQuasi{false}; bool perfectHash{false}; bool gencodeRef{false}; + bool featuresRef{false}; bool keepDuplicates{false}; pufferfish::IndexOptions idxOpt; po::options_description generic("Command Line Options"); - generic.add_options()("version,v", "print version string")( - "help,h", "produce help message")("transcripts,t", - po::value()->required(), - "Transcript fasta file.")( - "kmerLen,k", + generic.add_options() + ("version,v", "print version string") + ("help,h", "produce help message") + ("transcripts,t", po::value()->required(), "Transcript fasta file.") + ("kmerLen,k", po::value(&idxOpt.k)->default_value(31)->required(), - "The size of k-mers that should be used for the quasi index.")( - "index,i", po::value()->required(), "salmon index.")( - "gencode", po::bool_switch(&gencodeRef)->default_value(false), + "The size of k-mers that should be used for the quasi index.") + ("index,i", po::value()->required(), "salmon index.") + ("gencode", po::bool_switch(&gencodeRef)->default_value(false), "This flag will expect the input transcript fasta to be in GENCODE " "format, and will split the transcript name at the first \'|\' character. " "These reduced names will be used in the output and when looking for these " "transcripts in a gene to transcript " - "GTF.")("keepDuplicates", - po::bool_switch(&idxOpt.keep_duplicates)->default_value(false), - "This flag will disable the default indexing behavior of " - "discarding sequence-identical duplicate " - "transcripts. 
If this flag is passed, then duplicate " - "transcripts that appear in the input will be " - "retained and quantified separately.")( - "threads,p", po::value(&idxOpt.p)->default_value(2)->required(), - "Number of threads to use during indexing.")( - "keepFixedFasta", - po::bool_switch(&idxOpt.keep_fixed_fasta)->default_value(false), + "GTF.") + ("features", po::bool_switch(&featuresRef)->default_value(false), + "This flag will expect the input reference to be in the TSV file " + "format, and will split the feature name at the first \'tab\' character. " + "These reduced names will be used in the output and when looking for the " + "sequence of the features.") + ("keepDuplicates",po::bool_switch(&idxOpt.keep_duplicates)->default_value(false), + "This flag will disable the default indexing behavior of " + "discarding sequence-identical duplicate " + "transcripts. If this flag is passed, then duplicate " + "transcripts that appear in the input will be " + "retained and quantified separately.") + ("threads,p", po::value(&idxOpt.p)->default_value(2)->required(), + "Number of threads to use during indexing.") + ("keepFixedFasta",po::bool_switch(&idxOpt.keep_fixed_fasta)->default_value(false), "Retain the fixed fasta file (without short transcripts and duplicates, " - "clipped, etc.) generated during indexing")( - "filterSize,f", po::value(&idxOpt.filt_size)->default_value(-1), + "clipped, etc.) generated during indexing") + ("filterSize,f", po::value(&idxOpt.filt_size)->default_value(-1), "The size of the Bloom filter that will be used by TwoPaCo during indexing. " "The filter will be of size 2^{filterSize}. The default value of -1 " "means that the filter size will be automatically set based on the number of " - "distinct k-mers in the input, as estimated by nthll.")( - "tmpdir", - po::value(&idxOpt.twopaco_tmp_dir)->default_value(""), + "distinct k-mers in the input, as estimated by nthll.") + ("tmpdir",po::value(&idxOpt.twopaco_tmp_dir)->default_value(""), "The directory location that will be used for TwoPaCo temporary files; " "it will be created if need be and be removed prior to indexing completion. " "The default value will cause a (temporary) subdirectory of the salmon index " - "directory to be used for this purpose.")( - "sparse", po::bool_switch(&idxOpt.isSparse)->default_value(false), + "directory to be used for this purpose.") + ("sparse", po::bool_switch(&idxOpt.isSparse)->default_value(false), "Build the index using a sparse sampling of k-mer positions " "This will require less memory (especially during quantification), but " - "will take longer to construct and can slow down mapping / alignment")( - "decoys,d", po::value(&idxOpt.decoy_file), + "will take longer to construct and can slow down mapping / alignment") + ("decoys,d", po::value(&idxOpt.decoy_file), "Treat these sequences ids from the reference as the decoys that may " "have sequence homologous to some known transcript. for example in " "case of the genome, provide a list of chromosome name --- one per line")( - "type", + ("type", po::value(&indexTypeStr)->default_value("puff")->required(), "The type of index to build; the only option is \"puff\" in this version " "of salmon."); @@ -203,9 +208,19 @@ Creates a salmon index. idxOpt.k = 31; } + // give the user a warning if they are not using any decoy file + if (idxOpt.decoy_file.empty()) { + jointLog->warn("The salmon index is being built without any decoy sequences. 
It is recommended that " + "decoy sequence (either computed auxiliary decoy sequence or the genome of the organism) " + "be provided during indexing. Further details can be found at " + "https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode."); + } + if (gencodeRef) { idxOpt.header_sep = "|"; } + + if (featuresRef) { idxOpt.featuresRef = true; } sidx.reset(new SalmonIndex(jointLog, SalmonIndexType::PUFF)); } else { jointLog->error("This version of salmon does not support FMD or " diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a41a48b38..5ea2f4146 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,44 +88,6 @@ if(HAS_IPO AND (NOT NO_IPO)) set_property(TARGET ksw2pp PROPERTY INTERPROCEDURAL_OPTIMIZATION True) endif() - -if (NOT BUILD_PUFF_FOR_SALMON) -message("BUILDING OUR OWN KSW2PP") -message("BUILD_PUFF_FOR_SALMON IS = ${BUILD_PUFF_FOR_SALMON}") -set (KSW2PP_BASIC_LIB_SRCS -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/kalloc.c -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/ksw2_extd.c -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/ksw2_extz.c -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/ksw2_gg.c -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/ksw2_gg2.c -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/ksw2_gg2_sse.c -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/KSW2Aligner.cpp -) - -set (KSW2PP_ADVANCED_LIB_SRCS -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/ksw2_extd2_sse.c -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/ksw2_extf2_sse.c -${GAT_SOURCE_DIR}/external/install/src/rapmap/ksw2pp/ksw2_extz2_sse.c -) - -add_library(ksw2pp_sse2 OBJECT ${KSW2PP_ADVANCED_LIB_SRCS}) -add_library(ksw2pp_sse4 OBJECT ${KSW2PP_ADVANCED_LIB_SRCS}) -add_library(ksw2pp_basic OBJECT ${KSW2PP_BASIC_LIB_SRCS}) - -set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_FLAGS "-O3 -msse -msse2 -mno-sse4.1") -set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;KSW_SSE2_ONLY;HAVE_KALLOC") -set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_FLAGS "-O3 -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1") -set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC") -set_target_properties(ksw2pp_basic PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC") - -set_target_properties(ksw2pp_basic PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/external/install/include/rapmap) -set_target_properties(ksw2pp_sse4 PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/external/install/include/rapmap) - -# Build the ksw2pp library -add_library(ksw2pp STATIC $ $ $) -set_target_properties(ksw2pp PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC") -endif() - set ( UNIT_TESTS_SRCS ${GAT_SOURCE_DIR}/tests/UnitTests.cpp FragmentLengthDistribution.cpp @@ -175,6 +137,7 @@ RAPMAP_SALMON_SUPPORT=1 PUFFERFISH_SALMON_SUPPORT=1 HAVE_ANSI_TERM=1 HAVE_SSTREAM=1 +STX_NO_STD_STRING_VIEW=1 span_FEATURE_MAKE_SPAN_TO_STD=14 ) target_compile_options(salmon_core PUBLIC "$<$:${TGT_DEBUG_FLAGS}>") diff --git a/src/CollapsedCellOptimizer.cpp b/src/CollapsedCellOptimizer.cpp index dc154ad1d..bbdefc781 100644 --- a/src/CollapsedCellOptimizer.cpp +++ b/src/CollapsedCellOptimizer.cpp @@ -1040,6 +1040,7 @@ bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, } if( skippedCBcount > 0 ) { + aopt.numNoMapCB = skippedCBcount; aopt.jointLog->warn("Skipped {} barcodes due to No mapped read", skippedCBcount); auto lowRegionCutoffIdx = numCells - 
numLowConfidentBarcode; @@ -1080,17 +1081,16 @@ bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, "Can't performing whitelisting; Skipping", numLowConfidentBarcode, aopt.lowRegionMinNumBarcodes); - + aopt.intelligentCutoff = trueBarcodes.size(); } else if (trueBarcodes.size() - numLowConfidentBarcode < 90) { aopt.jointLog->warn("Num High confidence barcodes too less {} < 90." "Can't performing whitelisting; Skipping", trueBarcodes.size() - numLowConfidentBarcode); + aopt.intelligentCutoff = trueBarcodes.size(); } else { aopt.jointLog->info("Starting white listing of {} cells", trueBarcodes.size()); bool whitelistingSuccess = alevin::whitelist::performWhitelisting(aopt, - umiCount, trueBarcodes, - freqCounter, useRibo, useMito, numLowConfidentBarcode); @@ -1113,104 +1113,7 @@ bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, } if (aopt.dumpMtx){ - aopt.jointLog->info("Starting dumping cell v gene counts in mtx format"); - boost::filesystem::path qFilePath = aopt.outputDirectory / "quants_mat.mtx.gz"; - - boost::iostreams::filtering_ostream qFile; - qFile.push(boost::iostreams::gzip_compressor(6)); - qFile.push(boost::iostreams::file_sink(qFilePath.string(), - std::ios_base::out | std::ios_base::binary)); - - // mtx header - qFile << "%%MatrixMarket\tmatrix\tcoordinate\treal\tgeneral" << std::endl - << numCells << "\t" << numGenes << "\t" << totalExpGeneCounts << std::endl; - - { - - auto popcount = [](uint8_t n) { - size_t count {0}; - while (n) { - n &= n-1; - ++count; - } - return count; - }; - - uint32_t zerod_cells {0}; - size_t numFlags = std::ceil(numGenes/8.0); - std::vector alphasFlag (numFlags, 0); - size_t flagSize = sizeof(decltype(alphasFlag)::value_type); - - std::vector alphasSparse; - alphasSparse.reserve(numFlags/2); - size_t elSize = sizeof(decltype(alphasSparse)::value_type); - - auto countMatFilename = aopt.outputDirectory / "quants_mat.gz"; - if(not boost::filesystem::exists(countMatFilename)){ - std::cout<<"ERROR: Can't import Binary file quants.mat.gz, it doesn't exist" << std::flush; - exit(84); - } - - boost::iostreams::filtering_istream countMatrixStream; - countMatrixStream.push(boost::iostreams::gzip_decompressor()); - countMatrixStream.push(boost::iostreams::file_source(countMatFilename.string(), - std::ios_base::in | std::ios_base::binary)); - - for (size_t cellCount=0; cellCount(alphasFlag.data()), flagSize * numFlags); - - size_t numExpGenes {0}; - std::vector indices; - for (size_t j=0; j> i)) { - indices.emplace_back( i+(8*j) ); - } - } - } - - if (indices.size() != numExpGenes) { - aopt.jointLog->error("binary format reading error {}: {}: {}", - indices.size(), numExpGenes); - aopt.jointLog->flush(); - exit(84); - } - - - alphasSparse.clear(); - alphasSparse.resize(numExpGenes); - countMatrixStream.read(reinterpret_cast(alphasSparse.data()), elSize * numExpGenes); - - float readCount {0.0}; - readCount += std::accumulate(alphasSparse.begin(), alphasSparse.end(), 0.0); - if (readCount > 1000000) { - aopt.jointLog->warn("A cell has more 1M count, Possible error"); - aopt.jointLog->flush(); - } - - for(size_t i=0; i 0) { - aopt.jointLog->warn("Found {} cells with 0 counts", zerod_cells); - } - } - - boost::iostreams::close(qFile); - aopt.jointLog->info("Finished dumping counts into mtx"); + gzw.writeMtx(aopt.jointLog, aopt.outputDirectory, numGenes, numCells, totalExpGeneCounts); } return true; @@ -1229,6 +1132,16 @@ bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, CFreqMapT& freqCounter, size_t numLowConfidentBarcode); template +bool 
CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, + spp::sparse_hash_map& txpToGeneMap, + spp::sparse_hash_map& geneIdxMap, + AlevinOpts& aopt, + GZipWriter& gzw, + std::vector& trueBarcodes, + std::vector& umiCount, + CFreqMapT& freqCounter, + size_t numLowConfidentBarcode); +template bool CollapsedCellOptimizer::optimize(EqMapT& fullEqMap, spp::sparse_hash_map& txpToGeneMap, spp::sparse_hash_map& geneIdxMap, diff --git a/src/CollapsedGibbsSampler.cpp b/src/CollapsedGibbsSampler.cpp index 7d8ef8830..60f30f6ad 100644 --- a/src/CollapsedGibbsSampler.cpp +++ b/src/CollapsedGibbsSampler.cpp @@ -339,7 +339,18 @@ bool CollapsedGibbsSampler::sample( } bool perTranscriptPrior = (sopt.useVBOpt) ? sopt.perTranscriptPrior : true; - double priorValue = (sopt.useVBOpt) ? sopt.vbPrior : 1e-4; + // for Gibbs sampling, we really want a notion of the uncertainty, and we + // should steer away from sparsity more than we do when computing an MLE/MAP. + // we set the minimum prior to 1 if it is per-transcript, or to 1e-3 if it is per-nucleotide + double prior = 1e-3; // prior to use if the main algorithm was EM + if (sopt.useVBOpt) { + if (perTranscriptPrior) { + prior = (sopt.vbPrior < 1.0) ? 1.0 : sopt.vbPrior; + } else { // per-nucleotide + prior = (sopt.vbPrior < 1e-3) ? 1e-3 : sopt.vbPrior; + } + } + double priorValue = prior; std::vector priorAlphas = populatePriorAlphasGibbs_( transcripts, effLens, priorValue, perTranscriptPrior); /** DEBUG diff --git a/src/GZipWriter.cpp b/src/GZipWriter.cpp index c090bf6a1..1f696c497 100644 --- a/src/GZipWriter.cpp +++ b/src/GZipWriter.cpp @@ -13,6 +13,7 @@ #include "ReadExperiment.hpp" #include "ReadPair.hpp" #include "SalmonOpts.hpp" +#include "SalmonUtils.hpp" #include "UnpairedRead.hpp" #include "TranscriptGroup.hpp" #include "SingleCellProtocols.hpp" @@ -294,7 +295,6 @@ bool GZipWriter::writeBFH(boost::filesystem::path& outDir, auto count = umiIt.second; std::string s = umiObj.toStr(); - std::reverse(s.begin(), s.end()); equivFile << "\t" << s << "\t" << count; } } @@ -376,6 +376,7 @@ template bool GZipWriter::writeEmptyMeta(const SalmonOpts& opts, const ExpT& experiment, std::vector& errors) { namespace bfs = boost::filesystem; + using salmon::utils::DuplicateTargetStatus; bfs::path auxDir = path_ / opts.auxDir; bool auxSuccess = boost::filesystem::create_directories(auxDir); @@ -407,6 +408,19 @@ bool GZipWriter::writeEmptyMeta(const SalmonOpts& opts, const ExpT& experiment, std::string mapTypeStr = opts.alnMode ? 
"alignment" : "mapping"; oa(cereal::make_nvp("mapping_type", mapTypeStr)); + auto has_dups = experiment.index_retains_duplicates(); + switch(has_dups) { + case DuplicateTargetStatus::RETAINED_DUPLICATES: + oa(cereal::make_nvp("keep_duplicates", true)); + break; + case DuplicateTargetStatus::REMOVED_DUPLICATES: + oa(cereal::make_nvp("keep_duplicates", false)); + break; + case DuplicateTargetStatus::UNKNOWN: + default: + break; + } + auto numValidTargets = transcripts.size(); auto numDecoys = experiment.getNumDecoys(); oa(cereal::make_nvp("num_targets", numValidTargets)); @@ -483,6 +497,9 @@ bool GZipWriter::writeMetaAlevin(const AlevinOpts& opts, oa(cereal::make_nvp("low_conf_cbs", opts.totalLowConfidenceCBs)); oa(cereal::make_nvp("num_features", opts.numFeatures)); } + if(opts.numNoMapCB > 0) { + oa(cereal::make_nvp("no_read_mapping_cbs", opts.numNoMapCB)); + } oa(cereal::make_nvp("final_num_cbs", opts.intelligentCutoff)); if (opts.numBootstraps > 0) { @@ -511,6 +528,7 @@ template bool GZipWriter::writeMeta(const SalmonOpts& opts, const ExpT& experiment, const MappingStatistics& mstats) { namespace bfs = boost::filesystem; + using salmon::utils::DuplicateTargetStatus; bfs::path auxDir = path_ / opts.auxDir; bool auxSuccess = boost::filesystem::create_directories(auxDir); @@ -758,6 +776,19 @@ bool GZipWriter::writeMeta(const SalmonOpts& opts, const ExpT& experiment, const std::string mapTypeStr = opts.alnMode ? "alignment" : "mapping"; oa(cereal::make_nvp("mapping_type", mapTypeStr)); + auto has_dups = experiment.index_retains_duplicates(); + switch(has_dups) { + case DuplicateTargetStatus::RETAINED_DUPLICATES: + oa(cereal::make_nvp("keep_duplicates", true)); + break; + case DuplicateTargetStatus::REMOVED_DUPLICATES: + oa(cereal::make_nvp("keep_duplicates", false)); + break; + case DuplicateTargetStatus::UNKNOWN: + default: + break; + } + auto numValidTargets = transcripts.size(); auto numDecoys = experiment.getNumDecoys(); oa(cereal::make_nvp("num_valid_targets", numValidTargets)); @@ -1080,8 +1111,6 @@ bool GZipWriter::writeSparseBootstraps(std::string& bcName, return true; } -// FIXME(@k3yavi): The dumpUmiGraph parameter is un-used so I commented out the name -// should we be doing something with it? bool GZipWriter::writeSparseAbundances(std::string& bcName, std::string& features, std::string& arboData, @@ -1089,7 +1118,7 @@ bool GZipWriter::writeSparseAbundances(std::string& bcName, std::vector& alphas, std::vector& tiers, bool dumpArborescences, - bool /*dumpUmiGraph*/){ + bool dumpUmiGraph){ // construct the output vectors outside of the critical section // since e.g. this is more non-trivial work than in the dense case. 
@@ -1183,8 +1212,9 @@ bool GZipWriter::writeSparseAbundances(std::string& bcName, } else if (featureCode != 0) { std::cerr<<"Error: Wrong feature code: " << featureCode << std::flush; exit(74); - } - header += "\tArborescenceCount\n"; + } else if (dumpUmiGraph) { + header += "\tArborescenceCount"; + } header += "\n"; bcFeaturesStream_->write(header.c_str(), header.size()); } @@ -1481,6 +1511,110 @@ bool GZipWriter::writeUmiGraph(alevin::graph::Graph& g, return true; } +void GZipWriter::writeMtx(std::shared_ptr& jointLog, + boost::filesystem::path& outputDirectory, + size_t numGenes, size_t numCells, size_t totalExpGeneCounts) { + jointLog->info("Starting to dump cell v gene counts in mtx format"); + boost::filesystem::path qFilePath = outputDirectory / "quants_mat.mtx.gz"; + + boost::iostreams::filtering_ostream qFile; + qFile.push(boost::iostreams::gzip_compressor(6)); + qFile.push(boost::iostreams::file_sink(qFilePath.string(), + std::ios_base::out | std::ios_base::binary)); + + // mtx header + qFile << "%%MatrixMarket\tmatrix\tcoordinate\treal\tgeneral" << std::endl + << numCells << "\t" << numGenes << "\t" << totalExpGeneCounts << std::endl; + + { + + auto popcount = [](uint8_t n) { + size_t count {0}; + while (n) { + n &= n-1; + ++count; + } + return count; + }; + + uint32_t zerod_cells {0}; + size_t numFlags = std::ceil(numGenes/8.0); + std::vector alphasFlag (numFlags, 0); + size_t flagSize = sizeof(decltype(alphasFlag)::value_type); + + std::vector alphasSparse; + alphasSparse.reserve(numFlags/2); + size_t elSize = sizeof(decltype(alphasSparse)::value_type); + + auto countMatFilename = outputDirectory / "quants_mat.gz"; + if(not boost::filesystem::exists(countMatFilename)){ + jointLog->error("Can't import binary file quants_mat.gz; it doesn't exist"); + jointLog->flush(); + exit(84); + } + + boost::iostreams::filtering_istream countMatrixStream; + countMatrixStream.push(boost::iostreams::gzip_decompressor()); + countMatrixStream.push(boost::iostreams::file_source(countMatFilename.string(), + std::ios_base::in | std::ios_base::binary)); + + for (size_t cellCount=0; cellCount(alphasFlag.data()), flagSize * numFlags); + + size_t numExpGenes {0}; + std::vector indices; + for (size_t j=0; j> i)) { + indices.emplace_back( i+(8*j) ); + } + } + } + + if (indices.size() != numExpGenes) { + jointLog->error("binary format reading error {}: {}", + indices.size(), numExpGenes); + jointLog->flush(); + exit(84); + } + + + alphasSparse.clear(); + alphasSparse.resize(numExpGenes); + countMatrixStream.read(reinterpret_cast(alphasSparse.data()), elSize * numExpGenes); + + float readCount {0.0}; + readCount += std::accumulate(alphasSparse.begin(), alphasSparse.end(), 0.0); + if (readCount > 1000000) { + jointLog->warn("A cell has more than 1M counts; possible error"); + jointLog->flush(); + } + + for(size_t i=0; i 0) { + jointLog->warn("Found {} cells with 0 counts", zerod_cells); + } + } + + boost::iostreams::close(qFile); + jointLog->info("Finished dumping counts into mtx"); +} + using SCExpT = ReadExperiment>; using BulkExpT = ReadExperiment>; template @@ -1574,6 +1708,10 @@ bool GZipWriter::writeEquivCounts( const AlevinOpts& aopts, SCExpT& readExp); template +bool GZipWriter::writeEquivCounts( + const AlevinOpts& aopts, + SCExpT& readExp); +template bool GZipWriter::writeEquivCounts( const AlevinOpts& aopts, SCExpT& readExp); @@ -1609,6 +1747,9 @@ template bool GZipWriter::writeMetaAlevin(const AlevinOpts& opts, boost::filesystem::path aux_dir); template bool +GZipWriter::writeMetaAlevin(const 
AlevinOpts& opts, + boost::filesystem::path aux_dir); +template bool GZipWriter::writeMetaAlevin(const AlevinOpts& opts, boost::filesystem::path aux_dir); template bool diff --git a/src/ProgramOptionsGenerator.cpp b/src/ProgramOptionsGenerator.cpp index cdecaf073..3087a8a4e 100644 --- a/src/ProgramOptionsGenerator.cpp +++ b/src/ProgramOptionsGenerator.cpp @@ -17,7 +17,9 @@ namespace salmon { ("seqBias", po::bool_switch(&(sopt.biasCorrect))->default_value(salmon::defaults::seqBiasCorrect), "Perform sequence-specific bias correction.") ("gcBias", po::bool_switch(&(sopt.gcBiasCorrect))->default_value(salmon::defaults::gcBiasCorrect), - "[beta for single-end reads] Perform fragment GC bias correction") + "[beta for single-end reads] Perform fragment GC bias correction.") + ("posBias", po::bool_switch(&(sopt.posBiasCorrect))->default_value(salmon::defaults::posBiasCorrect), + "Perform positional bias correction.") ("threads,p", po::value(&(sopt.numThreads))->default_value(sopt.numThreads), "The number of threads to use concurrently.") @@ -82,26 +84,25 @@ namespace salmon { mapspec.add_options() ("discardOrphansQuasi", po::bool_switch(&(sopt.discardOrphansQuasi))->default_value(salmon::defaults::discardOrphansQuasi), - "[Quasi-mapping mode only] : Discard orphan mappings in quasi-mapping " + "[selective-alignment mode only] : Discard orphan mappings in quasi-mapping " "mode. If this flag is passed " "then only paired mappings will be considered toward quantification " "estimates. The default behavior is " "to consider orphan mappings if no valid paired mappings exist. This " "flag is independent of the option to " "write the orphaned mappings to file (--writeOrphanLinks).") + /* ("noSA", po::bool_switch(&(sopt.disableSA))->default_value(salmon::defaults::disableSA), - "[Quasi-mapping mode only] : Disable selective-alignment in favor of basic quasi-mapping. " + "[Not currently supported] : Disable selective-alignment in favor of basic quasi-mapping. " "If this flag is passed, selective-alignment and alignment scoring of reads will be disabled." - ) + )*/ ("validateMappings", po::bool_switch(&(sopt.validateMappings))->default_value(salmon::defaults::validateMappings), - "[Quasi-mapping mode only] : Validate mappings using alignment-based verifcation. " - "If this flag is passed, mappings will be validated to ensure that they could give " - "rise to a reasonable alignment before they are further used for quantification.") + "[*deprecated* (no effect; selective-alignment is the default)]") ("consensusSlack", po::value(&(sopt.consensusSlack))->default_value(salmon::defaults::consensusSlack), - "[Quasi-mapping mode only] : The amount of slack allowed in the quasi-mapping consensus " + "[selective-alignment mode only] : The amount of slack allowed in the quasi-mapping consensus " "mechanism. Normally, a transcript must cover all hits to be considered for mapping. " "If this is set to a fraction, X, greater than 0 (and in [0,1)), then a transcript can fail to cover up to " "(100 * X)% of the hits before it is discounted as a mapping candidate. The default value of this option " @@ -109,68 +110,86 @@ namespace salmon { ) ("scoreExp", po::value(&sopt.scoreExp)->default_value(salmon::defaults::scoreExp), - "[Quasi-mapping mode (w / mapping validation) only] : The factor by which sub-optimal alignment scores are " + "[selective-alignment mode only] : The factor by which sub-optimal alignment scores are " "downweighted to produce a probability. 
If the best alignment score for the current read is S, and the score " "for a particular alignment is w, then the probability will be computed porportional to exp( - scoreExp * (S-w) )." ) ("minScoreFraction", po::value(&sopt.minScoreFraction), - "[Quasi-mapping mode (w / mapping validation) only] : The fraction of the optimal possible alignment score that a " + "[selective-alignment mode only] : The fraction of the optimal possible alignment score that a " "mapping must achieve in order to be considered \"valid\" --- should be in (0,1].\n" "Salmon Default 0.65 and Alevin Default 0.87" ) + ("disableChainingHeuristic", + po::bool_switch(&(sopt.disableChainingHeuristic))->default_value(salmon::defaults::disableChainingHeuristic), + "[selective-alignment mode only] : By default, the heuristic of (Li 2018) is implemented, which terminates " + "the chaining DP once a given number of valid backpointers are found. This speeds up the seed (MEM) " + "chaining step, but may result in sub-optimal chains in complex situations (e.g. sequences with many repeats and " + "overlapping repeats). Passing this flag will disable the chaining heuristic, and perform the full chaining " + "dynamic program, guaranteeing the optimal chain is found in this step." + ) ("decoyThreshold", po::value(&sopt.decoyThreshold)->default_value(salmon::defaults::decoyThreshold), - "For an alignemnt to an annotated transcript to be considered invalid, it must have an alignment " + "[selective-alignment mode only] : For an alignment to an annotated transcript to be considered invalid, it must have an alignment " "score < (decoyThreshold * bestDecoyScore). A value of 1.0 means that any alignment strictly worse than " "the best decoy alignment will be discarded. A smaller value will allow reads to be allocated to transcripts " "even if they strictly align better to the decoy sequence." ) ("ma", po::value(&sopt.matchScore)->default_value(salmon::defaults::matchScore), - "[Quasi-mapping mode (w / mapping validation) only] : The value given to a match between read and reference nucleotides " + "[selective-alignment mode only] : The value given to a match between read and reference nucleotides " "in an alignment." ) ("mp", po::value(&sopt.mismatchPenalty)->default_value(salmon::defaults::mismatchPenalty), - "[Quasi-mapping mode (w / mapping validation) only] : The value given to a mis-match between read and reference nucleotides " + "[selective-alignment mode only] : The value given to a mis-match between read and reference nucleotides " "in an alignment." ) ("go", po::value(&sopt.gapOpenPenalty)->default_value(salmon::defaults::gapOpenPenalty), - "[Quasi-mapping mode (w / mapping validation) only] : The value given to a gap opening in an alignment." + "[selective-alignment mode only] : The value given to a gap opening in an alignment." ) ("ge", po::value(&sopt.gapExtendPenalty)->default_value(salmon::defaults::gapExtendPenalty), - "[Quasi-mapping mode (w / mapping validation) only] : The value given to a gap extension in an alignment." + "[selective-alignment mode only] : The value given to a gap extension in an alignment." ) ("bandwidth", po::value(&sopt.dpBandwidth)->default_value(salmon::defaults::dpBandwidth), - "[Quasi-mapping mode (w / mapping validation) only] : The value used for the bandwidth passed to ksw2. A smaller " + "[selective-alignment mode only] : The value used for the bandwidth passed to ksw2. A smaller " "bandwidth can make the alignment verification run more quickly, but could possibly miss valid alignments." 
) ("allowDovetail", po::bool_switch(&(sopt.allowDovetail))->default_value(salmon::defaults::allowDovetail), - "[Quasi-mapping mode only] : allow dovetailing mappings." + "[selective-alignment mode only] : allow dovetailing mappings." ) ("recoverOrphans", po::bool_switch(&(sopt.recoverOrphans))->default_value(salmon::defaults::recoverOrphans), - "[Quasi-mapping mode only] : Attempt to recover the mates of orphaned reads. This uses edlib for " + "[selective-alignment mode only] : Attempt to recover the mates of orphaned reads. This uses edlib for " "orphan recovery, and so introduces some computational overhead, but it can improve sensitivity." ) ("mimicBT2", // horrible flag name, think of something better po::bool_switch(&(sopt.mimicBT2))->default_value(salmon::defaults::mimicBT2), - "[Quasi-mapping mode (w / mapping validation) only] : Set flags to mimic parameters similar to " + "[selective-alignment mode only] : Set flags to mimic parameters similar to " "Bowtie2 with --no-discordant and --no-mixed flags. This increases disallows dovetailing reads, and " "discards orphans. Note, this does not impose the very strict parameters assumed by RSEM+Bowtie2, " "like gapless alignments. For that behavior, use the --mimiStrictBT2 flag below." ) ("mimicStrictBT2", // horrible flag name, think of something better po::bool_switch(&(sopt.mimicStrictBT2))->default_value(salmon::defaults::mimicStrictBT2), - "[Quasi-mapping mode (w / mapping validation) only] : Set flags to mimic the very strict parameters used by " + "[selective-alignment mode only] : Set flags to mimic the very strict parameters used by " "RSEM+Bowtie2. This increases --minScoreFraction to 0.8, disallows dovetailing reads, " "discards orphans, and disallows gaps in alignments." ) + ("softclip", + po::bool_switch(&(sopt.softclip))->default_value(salmon::defaults::softclip), + "[selective-alignment mode only (experimental)] : Allos soft-clipping of reads during selective-alignment. If this " + "option is provided, then regions at the beginning or end of the read can be withheld from alignment " + "without any effect on the resulting score (i.e. neither adding nor removing from the score). This " + "will drastically reduce the penalty if there are mismatches at the beginning or end of the read " + "due to e.g. low-quality bases or adapters. NOTE: Even with soft-clipping enabled, the read must still " + "achieve a score of at least minScoreFraction * maximum achievable score, where the maximum achievable " + "score is computed based on the full (un-clipped) read length." + ) ("softclipOverhangs", po::bool_switch(&(sopt.softclipOverhangs))->default_value(salmon::defaults::softclipOverhangs), "[selective-alignment mode only] : Allow soft-clipping of reads that overhang the beginning or ends " @@ -185,11 +204,12 @@ namespace salmon { "from the (approximate) initial mapping location and using extension alignment. This is in contrast with the " "default behavior which is to only perform alignment between the MEMs in the optimal chain (and before the " "first and after the last MEM if applicable). The default strategy forces the MEMs to belong to the alignment, " - "but has the benefit that it can discover indels prior to the first hit shared between the read and reference." + "but has the benefit that it can discover indels prior to the first hit shared between the read and reference. Except in " + "very rare circumstances, the default mode should be more accurate." 
) ("hardFilter", po::bool_switch(&(sopt.hardFilter))->default_value(salmon::defaults::hardFilter), - "[Quasi-mapping mode (w / mapping validation) only] : Instead of weighting mappings by their alignment score, " + "[selective-alignemnt mode only] : Instead of weighting mappings by their alignment score, " "this flag will discard any mappings with sub-optimal alignment score. The default option of soft-filtering " "(i.e. weighting mappings by their alignment score) usually yields slightly more accurate abundance estimates " "but this flag may be desirable if you want more accurate 'naive' equivalence classes, rather " @@ -204,16 +224,6 @@ namespace salmon { "final quantifications. Filtering such alignments reduces the number of variables that need " "to be considered and can result in slightly faster inference and 'cleaner' equivalence classes." ) - /* - ("allowOrphansFMD", - po::bool_switch(&(sopt.allowOrphans))->default_value(salmon::defaults::allowOrphansFMD), - "[FMD-mapping mode only] : Consider orphaned reads as valid hits when " - "performing lightweight-alignment. This option will increase " - "sensitivity (allow more reads to map and " - "more transcripts to be detected), but may decrease specificity as " - "orphaned alignments are more likely " - "to be spurious.") - */ ("writeMappings,z", po::value(&sopt.qmFileName) ->default_value(salmon::defaults::quasiMappingDefaultFile) ->implicit_value(salmon::defaults::quasiMappingImplicitFile), @@ -222,38 +232,17 @@ namespace salmon { "format. By default, output will be directed to " "stdout, but an alternative file name can be " "provided instead.") - /* - ("consistentHits,c", - po::bool_switch(&(sopt.consistentHits))->default_value(salmon::defaults::consistentHits), - "Force hits gathered during " - "quasi-mapping to be \"consistent\" (i.e. co-linear and " - "approximately the right distance apart).") - */ - ("fasterMapping", - po::bool_switch(&(sopt.fasterMapping))->default_value(salmon::defaults::fasterMapping), - "[Developer]: Disables some extra checks during quasi-mapping. This " - "may make mapping a little bit faster at the potential cost of " - "missing some difficult alignments.") - /* - ("quasiCoverage,x", - po::value(&(sopt.quasiCoverage))->default_value(salmon::defaults::quasiCoverage), - "[Experimental]: The fraction of the read that must be covered by " - "MMPs (of length >= 31) if " - "this read is to be considered as \"mapped\". This may help to " - "avoid \"spurious\" mappings. " - "A value of 0 (the default) denotes no coverage threshold (a single " - "31-mer can yield a mapping). " - "Since coverage by exact matching, large, MMPs is a rather strict " - "condition, this value should likely " - "be set to something low, if used.") - */ ("hitFilterPolicy", po::value(&sopt.hitFilterPolicyStr)->default_value(salmon::defaults::hitFilterPolicyStr), - "Determines the policy by which hits are filtered in selective alignment. Filtering hits after " + "[selective-alignment mode only] : Determines the policy by which hits are filtered in selective alignment. Filtering hits after " "chaining (the default) is more sensitive, but more computationally intensive, because it performs " "the chaining dynamic program for all hits. Filtering before chaining is faster, but some true hits " "may be missed. The options are BEFORE, AFTER, BOTH and NONE." 
); + + sopt.disableSA = salmon::defaults::disableSA; + sopt.fasterMapping = salmon::defaults::fasterMapping; + return mapspec; } @@ -287,7 +276,7 @@ namespace salmon { alignin.add_options() ("discardOrphans", po::bool_switch(&(sopt.discardOrphansAln))->default_value(salmon::defaults::discardOrphansAln), - "[Alignment-based mode only] : Discard orphan alignments in the input " + "[alignment-based mode only] : Discard orphan alignments in the input " ". If this flag is passed, then only paired alignments will be " "considered toward quantification estimates. The default behavior is " "to consider orphan alignments if no valid paired mappings exist.") @@ -363,6 +352,9 @@ namespace salmon { ( "gemcode", po::bool_switch()->default_value(alevin::defaults::isGemcode), "Use 10x gemcode v1 Single Cell protocol for the library.") + ( + "citeseq", po::bool_switch()->default_value(alevin::defaults::isCITESeq), + "Use CITE-seq Single Cell protocol for the library (16-base CB, 12-base UMI, and features).") ( "celseq", po::bool_switch()->default_value(alevin::defaults::isCELSeq), "Use CEL-Seq Single Cell protocol for the library.") @@ -375,6 +367,12 @@ namespace salmon { ( "whitelist", po::value(), "File containing white-list barcodes") + ( + "featureStart", po::value(), + "This flag should be used with citeseq and specifies the starting index of the feature barcode on Read2.") + ( + "featureLength", po::value(), + "This flag should be used with citeseq and specifies the length of the feature barcode.") ( "noQuant", po::bool_switch()->default_value(alevin::defaults::noQuant), "Don't run downstream barcode-salmon model.") @@ -773,8 +771,6 @@ namespace salmon { po::options_description testing("\n" "testing options"); testing.add_options() - ("posBias", po::bool_switch(&(sopt.posBiasCorrect))->default_value(salmon::defaults::posBiasCorrect), - "[experimental] Perform positional bias correction") ("noRichEqClasses", po::bool_switch(&(sopt.noRichEqClasses))->default_value(salmon::defaults::noRichEqClasses), "[TESTING OPTION]: Disable \"rich\" equivalent classes. If this flag is " @@ -788,6 +784,10 @@ "account the " "goodness-of-fit of an alignment with the empirical fragment length " "distribution") + ("disableAlignmentCache", + po::bool_switch(&(sopt.disableAlignmentCache))->default_value(salmon::defaults::disableAlignmentCache), + "[TESTING OPTION]: Turn off the alignment cache. 
This will hurt performance but " + "can help debug any issues that might result from caching") ("rankEqClasses", po::bool_switch(&(sopt.rankEqClasses))->default_value(salmon::defaults::rankEqClasses), "[TESTING OPTION]: Keep separate equivalence classes for each distinct " diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index 7864f4811..88a2b3bd0 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -455,6 +455,7 @@ void processReadsQuasi( bool mimicStrictBT2 = salmonOpts.mimicStrictBT2; bool mimicBT2 = salmonOpts.mimicBT2; bool noDovetail = !salmonOpts.allowDovetail; + bool useChainingHeuristic = !salmonOpts.disableChainingHeuristic; pufferfish::util::HitCounters hctr; salmon::utils::MappingType mapType{salmon::utils::MappingType::UNMAPPED}; @@ -488,6 +489,7 @@ void processReadsQuasi( std::string readSubSeq; ////////////////////// + bool tryAlign{salmonOpts.validateMappings}; auto rg = parser->getReadGroup(); while (parser->refill(rg)) { rangeSize = rg.size(); @@ -500,6 +502,8 @@ void processReadsQuasi( std::exit(1); } + LibraryFormat expectedLibraryFormat = rl.format(); + for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch auto& rp = rg[i]; readLenLeft = rp.first.seq.length(); @@ -528,7 +532,8 @@ void processReadsQuasi( nonstd::optional barcodeIdx; bool seqOk; - if (alevinOpts.protocol.end == bcEnd::FIVE){ + if (alevinOpts.protocol.end == bcEnd::FIVE || + alevinOpts.protocol.end == bcEnd::THREE){ barcode = aut::extractBarcode(rp.first.seq, alevinOpts.protocol); seqOk = (barcode.has_value()) ? aut::sequenceCheck(*barcode, Sequence::BARCODE) : false; @@ -597,7 +602,7 @@ void processReadsQuasi( //auto rh = hitCollector(readSubSeq, saSearcher, hcInfo); } } else { - readSubSeq = rp.second.seq; + aut::getReadSequence(alevinOpts.protocol, rp.second.seq, readSubSeq); auto rh = tooShortRight ? false : memCollector(readSubSeq, qc, true, // isLeft false // verbose @@ -606,7 +611,7 @@ void processReadsQuasi( memCollector.findChains(readSubSeq, hits, salmonOpts.fragLenDistMax, MateStatus::PAIRED_END_RIGHT, - true, // heuristic chaining + useChainingHeuristic, // heuristic chaining true, // isLeft false // verbose ); @@ -644,7 +649,6 @@ void processReadsQuasi( } // adding validate mapping code - bool tryAlign{salmonOpts.validateMappings}; if (tryAlign and !jointHits.empty()) { puffaligner.clear(); bestScorePerTranscript.clear(); @@ -660,7 +664,7 @@ void processReadsQuasi( */ salmon::mapping_utils::MappingScoreInfo msi = {invalidScore, invalidScore, invalidScore, decoyThreshold}; - std::vector scores(jointHits.size(), 0); + std::vector scores(jointHits.size(), invalidScore); size_t idx{0}; bool isMultimapping = (jointHits.size() > 1); @@ -671,7 +675,17 @@ void processReadsQuasi( bool validScore = (hitScore != invalidScore); numMappingsDropped += validScore ? 0 : 1; auto tid = qidx->getRefId(jointHit.tid); + + // NOTE: Here, we know that the read arising from the transcriptome is the "right" + // read (read 2). So we interpret compatibility in that context. + // TODO: Make this code more generic and modular; account for the possibility of different library + // protocols or setups where the reads are not always "paired-end" and the transcriptomic read is not + // always read 2 (@k3yavi). 
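+ // Put differently (a summary of the check just below, assuming salmon's
+ // usual library-type encoding in which ReadStrandedness::SA corresponds to
+ // ISF and ReadStrandedness::AS to ISR): because the transcriptomic read is
+ // read 2 here, a forward hit is compatible with an ISR-style (AS) library,
+ // a reverse-complement hit with an ISF-style (SA) library, and an
+ // unstranded (U) library accepts either orientation.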
+ bool isCompat = (expectedLibraryFormat.strandedness == ReadStrandedness::U) or + (jointHit.orphanClust()->isFw and (expectedLibraryFormat.strandedness == ReadStrandedness::AS)) or + (!jointHit.orphanClust()->isFw and (expectedLibraryFormat.strandedness == ReadStrandedness::SA)); + + salmon::mapping_utils::updateRefMappings(tid, hitScore, isCompat, idx, transcripts, invalidScore, msi, //bestScore, secondBestScore, bestDecoyScore, scores, bestScorePerTranscript, perm); @@ -698,10 +712,14 @@ void processReadsQuasi( bestDecoyScore, */ jointAlignments); + if (!jointAlignments.empty()) { + mapType = salmon::utils::MappingType::SINGLE_MAPPED; + } } else { numDecoyFrags += bestHitDecoy ? 1 : 0; ++numDropped; jointHitGroup.clearAlignments(); + mapType = (bestHitDecoy) ? salmon::utils::MappingType::DECOY : salmon::utils::MappingType::UNMAPPED; } } //end-if validate mapping @@ -713,7 +731,7 @@ void processReadsQuasi( */ } - if (writeUnmapped and mapType == salmon::utils::MappingType::UNMAPPED) { + if (writeUnmapped and mapType != salmon::utils::MappingType::SINGLE_MAPPED) { // If we have no mappings --- then there's nothing to do // unless we're outputting names for un-mapped reads unmappedNames << rp.first.name << ' ' << salmon::utils::str(mapType) << '\n'; @@ -783,24 +801,28 @@ void processReadsQuasi( template void processReadLibrary( - ReadExperimentT& readExp, ReadLibrary& rl, SalmonIndex* sidx, - std::vector& transcripts, ClusterForest& clusterForest, - std::atomic& - numObservedFragments, // total number of reads we've looked at - std::atomic& - numAssignedFragments, // total number of assigned reads + ReadExperimentT& readExp, + ReadLibrary& rl, + SalmonIndex* sidx, + std::vector& transcripts, + ClusterForest& clusterForest, + std::atomic& numObservedFragments, // total number of reads we've looked at + std::atomic& numAssignedFragments, // total number of assigned reads std::atomic& upperBoundHits, // upper bound on # of mapped frags std::atomic& smallSeqs, std::atomic& nSeqs, bool initialRound, - std::atomic& burnedIn, ForgettingMassCalculator& fmCalc, + std::atomic& burnedIn, + ForgettingMassCalculator& fmCalc, FragmentLengthDistribution& fragLengthDist, SalmonOpts& salmonOpts, - std::mutex& iomutex, size_t numThreads, + std::mutex& iomutex, + size_t numThreads, std::vector>& structureVec, AlevinOpts& alevinOpts, SoftMapT& barcodeMap, - spp::sparse_hash_map& trBcs, MappingStatistics& mstats) { + spp::sparse_hash_map& trBcs, + MappingStatistics& mstats) { std::vector threads; @@ -1416,6 +1438,16 @@ int alevinQuant(AlevinOpts& aopt, CFreqMapT& freqCounter, size_t numLowConfidentBarcode); template +int alevinQuant(AlevinOpts& aopt, + SalmonOpts& sopt, + SoftMapT& barcodeMap, + TrueBcsT& trueBarcodes, + spp::sparse_hash_map& txpToGeneMap, + spp::sparse_hash_map& geneIdxMap, + boost::program_options::parsed_options& orderedOptions, + CFreqMapT& freqCounter, + size_t numLowConfidentBarcode); +template int alevinQuant(AlevinOpts& aopt, SalmonOpts& sopt, SoftMapT& barcodeMap, diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp index d2c6efd3b..16fe0f252 100644 --- a/src/SalmonQuantify.cpp +++ b/src/SalmonQuantify.cpp @@ -18,7 +18,6 @@ along with Salmon. If not, see .
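The SalmonQuantify.cpp hunks below add the analogous compatibility test for bulk paired-end mappings. As a condensed restatement of the decision rules that the inline comments further down spell out (a sketch only: the helper name is invented here, the enum subset mirrors salmon's ReadStrandedness, and the SA = ISF / AS = ISR reading follows those comments):

enum class ReadStrandedness { U, SA, AS }; // illustrative subset of salmon's enum

// Is a properly paired mapping compatible with the expected library format?
bool pairedEndCompatible(bool leftIsFw, bool rightIsFw, ReadStrandedness expected) {
  switch (expected) {
    case ReadStrandedness::U:  return leftIsFw != rightIsFw;  // unstranded: mates on opposite strands
    case ReadStrandedness::SA: return leftIsFw && !rightIsFw; // ISF: read 1 forward, read 2 reverse
    case ReadStrandedness::AS: return !leftIsFw && rightIsFw; // ISR: read 1 reverse, read 2 forward
  }
  return false;
}

Orphan mappings relax this to the strand expected for whichever mate was observed, exactly as the branch structure in the new code reads.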
#include #include @@ -828,6 +827,7 @@ void processReads( bool mimicStrictBT2 = salmonOpts.mimicStrictBT2; bool mimicBT2 = salmonOpts.mimicBT2; bool noDovetail = !salmonOpts.allowDovetail; + bool useChainingHeuristic = !salmonOpts.disableChainingHeuristic; size_t numOrphansRescued{0}; phmap::flat_hash_map> bestScorePerTranscript; uint64_t firstDecoyIndex = qidx->firstDecoyIndex(); @@ -874,6 +874,8 @@ void processReads( spdlog::drop_all(); std::exit(1); } + + LibraryFormat expectedLibraryFormat = rl.format(); bool tryAlign{salmonOpts.validateMappings}; for (size_t i = 0; i < rangeSize; ++i) { // For all the reads in this batch @@ -925,7 +927,7 @@ leftHits, salmonOpts.fragLenDistMax, MateStatus::PAIRED_END_LEFT, - true, // heuristic chaining + useChainingHeuristic, // heuristic chaining true, // isLeft false // verbose ); @@ -933,7 +935,7 @@ rightHits, salmonOpts.fragLenDistMax, MateStatus::PAIRED_END_RIGHT, - true, // heuristic chaining + useChainingHeuristic, // heuristic chaining false, // isLeft false // verbose ); @@ -1043,9 +1045,12 @@ if (!jointHits.empty()) { bool isPaired = jointHits.front().mateStatus == MateStatus::PAIRED_END_PAIRED; + /* if (isPaired) { mapType = salmon::utils::MappingType::PAIRED_MAPPED; } + */ + // If we are ignoring orphans if (!salmonOpts.allowOrphans) { // If the mappings for the current read are not properly-paired (i.e. @@ -1069,7 +1074,7 @@ */ salmon::mapping_utils::MappingScoreInfo msi = {invalidScore, invalidScore, invalidScore, decoyThreshold}; - std::vector scores(jointHits.size(), 0); + std::vector scores(jointHits.size(), invalidScore); size_t idx{0}; bool isMultimapping = (jointHits.size() > 1); @@ -1078,13 +1083,95 @@ for bool validScore = (hitScore != invalidScore); numMappingsDropped += validScore ? 0 : 1; auto tid = qidx->getRefId(jointHit.tid); - salmon::mapping_utils::updateRefMappings(tid, hitScore, idx, transcripts, invalidScore, msi, + + // ----- compatibility determination + // This code is to determine if a given PE mapping is _compatible_ or not with + // the expected library format. This is to remove stochasticity in mapping. + // Specifically, if there are equally highest-scoring alignments to a target + // we will always prefer the one that is compatible. + + bool isUnstranded = expectedLibraryFormat.strandedness == ReadStrandedness::U; + bool isOrphan = jointHit.isOrphan(); + + // if the protocol is unstranded: + // (1) an orphan is always compatible + // (2) a paired-end mapping is compatible if the ends are on separate strands + bool isCompat = isUnstranded ? (isOrphan ? true : (jointHit.leftClust->isFw != jointHit.rightClust->isFw)) : false; + + // if the mapping hasn't been determined to be compatible yet + if (!isCompat) { + // if this is an orphan mapping + if (isOrphan) { + bool isLeft = jointHit.isLeftAvailable(); + // ISF + // if the expectation is ISF, then this read is compatible if + // (1) we observed the left read and it is forward + // (2) we observed the right read and it is not forward + + // ISR + // if the expectation is ISR, then this read is compatible if + // (1) we observed the left read and it is not forward + // (2) we observed the right read and it is forward + isCompat = (expectedLibraryFormat.strandedness == ReadStrandedness::SA) ? + ((isLeft and jointHit.leftClust->isFw) or (!isLeft and !jointHit.rightClust->isFw)) : + ((expectedLibraryFormat.strandedness == ReadStrandedness::AS) ? 
+                 ((isLeft and !jointHit.leftClust->isFw) or (!isLeft and jointHit.rightClust->isFw)) : false);
+            } else {
+              bool leftIsFw = jointHit.leftClust->isFw;
+              bool rightIsFw = jointHit.rightClust->isFw;
+              // paired-end paired
+              isCompat = (expectedLibraryFormat.strandedness == ReadStrandedness::SA) ?
+                (leftIsFw and !rightIsFw) :
+                ((expectedLibraryFormat.strandedness == ReadStrandedness::AS) ?
+                 (!leftIsFw and rightIsFw) : false);
+            }
+          }
+          // ----- end compatibility determination
+
+          // alternative compat
+          /**
+          LibraryFormat lf(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U);
+          MateStatus ms = isOrphan ?
+            (jointHit.isLeftAvailable() ? MateStatus::PAIRED_END_LEFT : MateStatus::PAIRED_END_RIGHT) :
+            MateStatus::PAIRED_END_PAIRED;
+          switch (ms) {
+            case MateStatus::PAIRED_END_LEFT:
+            case MateStatus::PAIRED_END_RIGHT: {
+              lf = salmon::utils::hitType(jointHit.orphanClust()->getTrFirstHitPos(), jointHit.orphanClust()->isFw);
+            } break;
+            case MateStatus::PAIRED_END_PAIRED: {
+              uint32_t end1Pos = (jointHit.leftClust->isFw) ? jointHit.leftClust->getTrFirstHitPos() :
+                jointHit.leftClust->getTrFirstHitPos() + jointHit.leftClust->readLen;
+              uint32_t end2Pos = (jointHit.rightClust->isFw) ? jointHit.rightClust->getTrFirstHitPos() :
+                jointHit.rightClust->getTrFirstHitPos() + jointHit.rightClust->readLen;
+              bool canDovetail = false;
+              lf = salmon::utils::hitType(end1Pos, jointHit.leftClust->isFw, jointHit.leftClust->readLen, end2Pos,
+                                          jointHit.rightClust->isFw, jointHit.rightClust->readLen, canDovetail);
+            } break;
+            case MateStatus::SINGLE_END: {
+              // do nothing
+            } break;
+            default:
+              break;
+          }
+
+          auto p = isOrphan ? jointHit.orphanClust()->getTrFirstHitPos() : jointHit.leftClust->getTrFirstHitPos();
+          auto fw = isOrphan ? jointHit.orphanClust()->isFw : jointHit.leftClust->isFw;
+          bool isCompatRef = salmon::utils::isCompatible(lf, expectedLibraryFormat, static_cast<int32_t>(p), fw, ms);
+
+          if (isCompat != isCompatRef) {
+            std::cerr << "\n\n\nERROR: simple implementation says compatible is [" << isCompat << "], but ref. implementation says [" << isCompatRef << "]\n\n";
+          }
+          **/
+          // end of alternative compat
+
+          salmon::mapping_utils::updateRefMappings(tid, hitScore, isCompat, idx, transcripts, invalidScore, msi,
                                                   //bestScore, secondBestScore, bestDecoyScore,
                                                   scores, bestScorePerTranscript, perm);
          ++idx;
        }
-        //bool bestHitDecoy = (msi.bestScore < msi.bestDecoyScore);
        bool bestHitDecoy = msi.haveOnlyDecoyMappings();
        if (msi.bestScore > invalidScore and !bestHitDecoy) {
          salmon::mapping_utils::filterAndCollectAlignments(jointHits,
@@ -1104,13 +1191,35 @@ void processReads(
                                           bestDecoyScore, */
                                        jointAlignments);
+          // if we have alignments
+          if (!jointAlignments.empty()) {
+            // choose the mapType based on the mate status
+            auto& h = jointAlignments.front();
+            switch (h.mateStatus) {
+            case pufferfish::util::MateStatus::PAIRED_END_PAIRED:
+              mapType = salmon::utils::MappingType::PAIRED_MAPPED;
+              break;
+            case pufferfish::util::MateStatus::PAIRED_END_LEFT:
+              mapType = salmon::utils::MappingType::LEFT_ORPHAN;
+              break;
+            case pufferfish::util::MateStatus::PAIRED_END_RIGHT:
+              mapType = salmon::utils::MappingType::RIGHT_ORPHAN;
+              break;
+            default:
+              mapType = salmon::utils::MappingType::UNMAPPED;
+              break;
+            }
+          }
+        } else {
+          // if we had decoy hits, our type is decoy, otherwise it's unmapped
+          mapType = bestHitDecoy ? salmon::utils::MappingType::DECOY : salmon::utils::MappingType::UNMAPPED;
          numDecoyFrags += bestHitDecoy ? 1 : 0;
          ++numFragsDropped;
          jointAlignmentGroup.clearAlignments();
        }
      } else if (isPaired and noDovetail) {
-        salmonOpts.jointLog->warn("HAVE NOT THOUGHT ABOUT THIS CODE-PATH YET!");
+        salmonOpts.jointLog->critical("This code path is not yet implemented!");
        jointAlignments.erase(
            std::remove_if(jointAlignments.begin(), jointAlignments.end(),
                           [](const QuasiAlignment& h) -> bool {
@@ -1441,6 +1550,7 @@ void processReads(
  bool mimicStrictBT2 = salmonOpts.mimicStrictBT2;
  bool mimicBT2 = salmonOpts.mimicBT2;
  bool noDovetail = !salmonOpts.allowDovetail;
+  bool useChainingHeuristic = !salmonOpts.disableChainingHeuristic;
  size_t numOrphansRescued{0};
  //*******
@@ -1480,6 +1590,8 @@ void processReads(
      std::exit(1);
    }

+    LibraryFormat expectedLibraryFormat = rl.format();
+
    bool tryAlign{salmonOpts.validateMappings};
    for (size_t i = 0; i < rangeSize; ++i) { // For all the reads in this batch
      auto& rp = rg[i];
@@ -1489,6 +1601,7 @@ void processReads(
      jointHitGroup.clearAlignments();
      auto& jointAlignments = jointHitGroup.alignments();

+      mapType = salmon::utils::MappingType::UNMAPPED;
      perm.clear();
      hits.clear();
      jointHits.clear();
@@ -1507,7 +1620,7 @@ void processReads(
                         hits,
                         salmonOpts.fragLenDistMax,
                         MateStatus::SINGLE_END,
-                         true, // heuristic chaining
+                         useChainingHeuristic, // heuristic chaining
                         true, // isLeft
                         false // verbose
                         );
@@ -1530,6 +1643,7 @@ void processReads(
                         readLen, memCollector.getConsensusFraction());
      hctr.peHits += jointHits.size();
+
      if (initialRound) {
        upperBoundHits += (jointHits.size() > 0);
      }
@@ -1562,7 +1676,7 @@ void processReads(
       */
      salmon::mapping_utils::MappingScoreInfo msi = {invalidScore, invalidScore, invalidScore, decoyThreshold};
-      std::vector<int32_t> scores(jointHits.size(), 0);
+      std::vector<int32_t> scores(jointHits.size(), invalidScore);
      size_t idx{0};
      bool isMultimapping = (jointHits.size() > 1);
@@ -1571,7 +1685,12 @@ void processReads(
        bool validScore = (hitScore != invalidScore);
        numMappingsDropped += validScore ? 0 : 1;
        auto tid = qidx->getRefId(jointHit.tid);
-        salmon::mapping_utils::updateRefMappings(tid, hitScore, idx, transcripts, invalidScore,
+
+        bool isCompat = (expectedLibraryFormat.strandedness == ReadStrandedness::U) or
+                        (jointHit.orphanClust()->isFw and (expectedLibraryFormat.strandedness == ReadStrandedness::S)) or
+                        (!jointHit.orphanClust()->isFw and (expectedLibraryFormat.strandedness == ReadStrandedness::A));
+
+        salmon::mapping_utils::updateRefMappings(tid, hitScore, isCompat, idx, transcripts, invalidScore,
                                                 msi,
                                                 //bestScore, secondBestScore, bestDecoyScore,
                                                 scores, bestScorePerTranscript, perm);
@@ -1598,7 +1717,14 @@ void processReads(
                                         bestDecoyScore, */
                                      jointAlignments);
+        // if we have any alignments, then they are
+        // just single mapped.
+        if (!jointAlignments.empty()) {
+          mapType = salmon::utils::MappingType::SINGLE_MAPPED;
+        }
      } else {
+        // if we had decoy hits, our type is decoy, otherwise it's unmapped
+        mapType = (bestHitDecoy) ? salmon::utils::MappingType::DECOY : salmon::utils::MappingType::UNMAPPED;
        numDecoyFrags += bestHitDecoy ? 1 : 0;
        ++numFragsDropped;
        jointHitGroup.clearAlignments();
@@ -1665,7 +1791,7 @@ void processReads(
        writeAlignmentsToStreamSingle(rp, formatter, jointAlignments, sstream, false, true);
      }

-      if (writeUnmapped and jointHits.empty()) {
+      if (writeUnmapped and mapType != salmon::utils::MappingType::SINGLE_MAPPED) {
        // If we have no mappings --- then there's nothing to do
        // unless we're outputting names for un-mapped reads
        unmappedNames << rp.name << " u\n";
diff --git a/src/SalmonUtils.cpp b/src/SalmonUtils.cpp
index 091f4b15f..770030696 100644
--- a/src/SalmonUtils.cpp
+++ b/src/SalmonUtils.cpp
@@ -69,6 +69,8 @@ std::string str(const MappingType& mt) {
    return "mp";
  case MappingType::SINGLE_MAPPED:
    return "ms";
+  case MappingType::DECOY:
+    return "d";
  }
  // should never get here!
  return "E";
@@ -1942,7 +1944,7 @@ bool processQuantOptions(SalmonOpts& sopt,
    jointLog->critical("Note: Alignment-free mapping (i.e. mapping without subsequent selective-alignment) "
                       "has not yet been thoroughly tested under the pufferfish-based index and using the "
                       "pufferfish-based mapping strategies. Thus, disabling of selective-alignment "
-                       "is not currently allowed. We will explore re-enabling this option in future "
+                       "is not currently allowed. We may potentially explore re-enabling this option in future "
                       "versions of salmon.");
    jointLog->flush();
    return false;
@@ -1989,6 +1991,13 @@ bool processQuantOptions(SalmonOpts& sopt,
    jointLog->info("Using per-nucleotide prior with the default VB prior. Setting the default prior to {}", sopt.vbPrior);
  }

+  // If maxHashResizeThreads was left at its default, set it equal to the regular
+  // number of threads.
+  if (vm["maxHashResizeThreads"].defaulted()) {
+    sopt.maxHashResizeThreads = sopt.numThreads;
+    jointLog->info("setting maxHashResizeThreads to {}", sopt.maxHashResizeThreads);
+  }
+
  {
    try {
      conflicting_options(vm, "alignments", "eqclasses");
@@ -3219,6 +3228,20 @@ void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
  }
  */
}
+
+double compute_1_edit_threshold(int32_t l, const SalmonOpts& sopt) {
+  int32_t match = sopt.matchScore;
+  int32_t mismatch = (sopt.mismatchPenalty < 0) ? sopt.mismatchPenalty : -sopt.mismatchPenalty;
+  int32_t go = (sopt.gapOpenPenalty < 0) ? sopt.gapOpenPenalty : -sopt.gapOpenPenalty;
+  int32_t ge = (sopt.gapExtendPenalty < 0) ? sopt.gapExtendPenalty : -sopt.gapExtendPenalty;
+
+  // cost of a substitution = mismatch penalty - the match score it displaces
+  // cost of a 1-base deletion = gap open + gap extend - the match score it displaces
+  int32_t edit_cost = std::min(mismatch - match, go + ge - match);
+  int32_t max_score = l * match;
+  return (static_cast<double>(max_score + edit_cost) - 0.5) / max_score;
+}
+
} // namespace utils
} // namespace salmon
diff --git a/src/VersionChecker.cpp b/src/VersionChecker.cpp
index debec5b11..64d2d0e4d 100644
--- a/src/VersionChecker.cpp
+++ b/src/VersionChecker.cpp
@@ -26,9 +26,10 @@ VersionChecker::VersionChecker(boost::asio::io_service& io_service,
  request_stream << "Accept: */*\r\n";
  request_stream << "Connection: close\r\n\r\n";

+
+  deadline_.expires_from_now(boost::posix_time::seconds(1));
  deadline_.async_wait(boost::bind(&VersionChecker::cancel_upgrade_check,
                                   this, boost::asio::placeholders::error));
-  deadline_.expires_from_now(boost::posix_time::seconds(1));

  // Start an asynchronous resolve to translate the server and service names
  // into a list of endpoints.
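A note on the compatibility determination added to SalmonQuantify.cpp above: the nested ternaries are easiest to sanity-check in isolation. Below is a minimal, self-contained sketch of the same predicate. The enum and struct are simplified stand-ins, not salmon's actual pufferfish types; per the comments in the diff, ReadStrandedness::SA corresponds to an ISF library and ReadStrandedness::AS to ISR.

// Minimal, self-contained sketch of the compatibility predicate above.
// The types here are simplified stand-ins for salmon's internal structures.
#include <iostream>

enum class Strandedness { U, SA, AS }; // U = unstranded, SA ~ ISF, AS ~ ISR

struct PairedHit {
  bool hasLeft, hasRight; // which ends mapped (an orphan has exactly one)
  bool leftFw, rightFw;   // orientation of each mapped end, if present
};

bool isCompatible(const PairedHit& h, Strandedness expected) {
  bool isOrphan = (h.hasLeft != h.hasRight);
  // unstranded: orphans are always compatible; pairs must lie on opposite strands
  if (expected == Strandedness::U) {
    return isOrphan ? true : (h.leftFw != h.rightFw);
  }
  if (isOrphan) {
    // ISF (SA): left end forward, or right end reverse
    // ISR (AS): left end reverse, or right end forward
    bool fw = h.hasLeft ? h.leftFw : h.rightFw;
    return (expected == Strandedness::SA) ? (h.hasLeft ? fw : !fw)
                                          : (h.hasLeft ? !fw : fw);
  }
  // properly paired ends
  return (expected == Strandedness::SA) ? (h.leftFw && !h.rightFw)
                                        : (!h.leftFw && h.rightFw);
}

int main() {
  PairedHit pair{true, true, true, false}; // left mate forward, right mate reverse
  std::cout << isCompatible(pair, Strandedness::SA) << '\n'; // 1 (compatible with ISF)
  std::cout << isCompatible(pair, Strandedness::AS) << '\n'; // 0 (incompatible with ISR)
}

Enumerating the eight paired orientations against this function reproduces the case analysis spelled out in the diff's comments, which is the point of keeping the predicate pure and side-effect free.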
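To make the arithmetic in compute_1_edit_threshold (SalmonUtils.cpp above) concrete, here is a standalone restatement with one worked case. The scoring values passed in main are hypothetical, chosen only for illustration; they are not necessarily salmon's defaults.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Standalone restatement of the compute_1_edit_threshold arithmetic; the
// SalmonOpts plumbing is replaced by plain parameters.
double one_edit_threshold(int32_t l, int32_t match, int32_t mismatch,
                          int32_t go, int32_t ge) {
  // normalize penalties to be negative, as the original does
  mismatch = (mismatch < 0) ? mismatch : -mismatch;
  go = (go < 0) ? go : -go;
  ge = (ge < 0) ? ge : -ge;
  // score lost to one substitution vs. one 1-base deletion; take the more
  // negative of the two so the threshold admits either kind of single edit
  int32_t edit_cost = std::min(mismatch - match, go + ge - match);
  int32_t max_score = l * match;
  // the extra 0.5 keeps a score of exactly one edit below perfect above the cut
  return (static_cast<double>(max_score + edit_cost) - 0.5) / max_score;
}

int main() {
  // hypothetical scoring: match=2, mismatch=-4, gap open=5, gap extend=3
  // edit_cost = min(-4 - 2, -5 - 3 - 2) = -10; max_score = 100 * 2 = 200
  // threshold = (200 - 10 - 0.5) / 200 = 0.9475
  std::cout << one_edit_threshold(100, 2, -4, 5, 3) << '\n'; // 0.9475
}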
diff --git a/src/WhiteList.cpp b/src/WhiteList.cpp
index 3b5798246..1b29d8f13 100644
--- a/src/WhiteList.cpp
+++ b/src/WhiteList.cpp
@@ -166,9 +166,8 @@ namespace alevin {
    template
    bool performWhitelisting(AlevinOpts& aopt,
-                             std::vector<uint32_t>& /*umiCount*/,
                             std::vector<std::string>& trueBarcodes,
-                             CFreqMapT& /*freqCounter*/, bool useRibo, bool useMito,
+                             bool useRibo, bool useMito,
                             size_t numLowConfidentBarcode){
      size_t numCells = trueBarcodes.size();
      size_t numFeatures {5};
@@ -254,49 +253,44 @@ namespace alevin {
      return true;
    }
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
+                                      size_t numLowConfidentBarcode);
+    template bool performWhitelisting(AlevinOpts& aopt,
+                                      std::vector<std::string>& trueBarcodes,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
    template bool performWhitelisting(AlevinOpts& aopt,
-                                      std::vector<uint32_t>& umiCount,
                                      std::vector<std::string>& trueBarcodes,
-                                      CFreqMapT& freqCounter, bool useRibo, bool useMito,
+                                      bool useRibo, bool useMito,
                                      size_t numLowConfidentBarcode);
  }
}
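On the VersionChecker.cpp reordering above: for boost::asio deadline timers, expires_from_now() cancels any asynchronous wait that is already pending (the cancelled handler fires with operation_aborted), so the old code armed a one-second wait and then immediately cancelled it on the next line. A minimal sketch of the corrected ordering, using the same io_service-era API the file uses:

#include <boost/asio.hpp>
#include <boost/bind/bind.hpp>
#include <iostream>

void on_deadline(const boost::system::error_code& ec) {
  // operation_aborted here would mean the wait was cancelled, not expired
  std::cout << (ec ? "wait cancelled" : "deadline expired") << '\n';
}

int main() {
  boost::asio::io_service io;
  boost::asio::deadline_timer deadline(io);

  // correct order: set the expiry first, then arm the asynchronous wait
  deadline.expires_from_now(boost::posix_time::seconds(1));
  deadline.async_wait(boost::bind(&on_deadline, boost::asio::placeholders::error));

  io.run(); // prints "deadline expired" after about one second
}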
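Finally, regarding the block of near-identical template instantiations in the WhiteList.cpp hunk above: performWhitelisting is a function template defined in a .cpp file, so every protocol specialization that other translation units link against must be explicitly instantiated there, and dropping the umiCount/freqCounter parameters therefore has to be repeated once per protocol. A toy illustration of the pattern, with hypothetical types (in a single file the instantiations are redundant, but across translation units they are what prevents undefined-reference link errors):

#include <iostream>

// stand-in for performWhitelisting, defined in a .cpp-style translation unit
template <typename ProtocolT>
bool performCheck(const ProtocolT& p) {
  return p.valid;
}

struct ProtoA { bool valid = true;  }; // hypothetical protocol tag types
struct ProtoB { bool valid = false; };

// explicit instantiations: these emit the specializations into this object
// file so that callers in other translation units can link against them
template bool performCheck<ProtoA>(const ProtoA&);
template bool performCheck<ProtoB>(const ProtoB&);

int main() {
  std::cout << performCheck(ProtoA{}) << ' ' << performCheck(ProtoB{}) << '\n'; // 1 0
}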