diff --git a/fsst.cpp b/fsst.cpp index e708482..d113184 100644 --- a/fsst.cpp +++ b/fsst.cpp @@ -16,16 +16,20 @@ // // You can contact the authors via the FSST source repository : https://github.com/cwida/fsst #ifdef FSST12 + #include "fsst12.h" // the official FSST API -- also usable by C mortals + #else #include "fsst.h" // the official FSST API -- also usable by C mortals #endif + #include #include #include #include #include #include + using namespace std; // Utility to compress and decompress (-d) data with FSST (using stdin and stdout). @@ -45,48 +49,54 @@ using namespace std; namespace { class BinarySemaphore { - private: +private: mutex m; condition_variable cv; bool value; - public: +public: explicit BinarySemaphore(bool initialValue = false) : value(initialValue) {} + void wait() { unique_lock lock(m); while (!value) cv.wait(lock); value = false; } + void post() { - { unique_lock lock(m); value = true; } + { + unique_lock lock(m); + value = true; + } cv.notify_one(); } }; bool stopThreads = false; BinarySemaphore srcDoneIO[2], dstDoneIO[2], srcDoneCPU[2], dstDoneCPU[2]; -unsigned char *srcBuf[2] = { NULL, NULL }; -unsigned char *dstBuf[2] = { NULL, NULL }; -unsigned char *dstMem[2] = { NULL, NULL }; -size_t srcLen[2] = { 0, 0 }; -size_t dstLen[2] = { 0, 0 }; +unsigned char *srcBuf[2] = {NULL, NULL}; +unsigned char *dstBuf[2] = {NULL, NULL}; +unsigned char *dstMem[2] = {NULL, NULL}; +size_t srcLen[2] = {0, 0}; +size_t dstLen[2] = {0, 0}; #define FSST_MEMBUF (1ULL<<22) int decompress = 0; -size_t blksz = FSST_MEMBUF-(1+FSST_MAXHEADER/2); // block size of compression (max compressed size must fit 3 bytes) +size_t blksz = + FSST_MEMBUF - (1 + FSST_MAXHEADER / 2); // block size of compression (max compressed size must fit 3 bytes) #define DESERIALIZE(p) (((unsigned long long) (p)[0]) << 16) | (((unsigned long long) (p)[1]) << 8) | ((unsigned long long) (p)[2]) -#define SERIALIZE(l,p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; } +#define SERIALIZE(l, p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; } -void reader(ifstream& src) { - for(int swap=0; true; swap = 1-swap) { +void reader(ifstream &src) { + for (int swap = 0; true; swap = 1 - swap) { srcDoneCPU[swap].wait(); if (stopThreads) break; - src.read((char*) srcBuf[swap], blksz); + src.read((char *) srcBuf[swap], blksz); srcLen[swap] = (unsigned long) src.gcount(); if (decompress) { if (blksz && srcLen[swap] == blksz) { - blksz = DESERIALIZE(srcBuf[swap]+blksz-3); // read size of next block + blksz = DESERIALIZE(srcBuf[swap] + blksz - 3); // read size of next block srcLen[swap] -= 3; // cut off size bytes } else { blksz = 0; @@ -96,33 +106,33 @@ void reader(ifstream& src) { } } -void writer(ofstream& dst) { - for(int swap=0; true; swap = 1-swap) { +void writer(ofstream &dst) { + for (int swap = 0; true; swap = 1 - swap) { dstDoneCPU[swap].wait(); if (!dstLen[swap]) break; - dst.write((char*) dstBuf[swap], dstLen[swap]); + dst.write((char *) dstBuf[swap], dstLen[swap]); dstDoneIO[swap].post(); } - for(int swap=0; swap<2; swap++) + for (int swap = 0; swap < 2; swap++) dstDoneIO[swap].post(); } } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { size_t srcTot = 0, dstTot = 0; if (argc < 2 || argc > 4 || (argc == 4 && (argv[1][0] != '-' || argv[1][1] != 'd' || argv[1][2]))) { cerr << "usage: " << argv[0] << " -d infile outfile" << endl; - cerr << " " << argv[0] << " infile outfile" << endl; - cerr << " " << argv[0] << " infile" << endl; + cerr << " " << argv[0] << " 
infile outfile" << endl; + cerr << " " << argv[0] << " infile" << endl; return -1; } decompress = (argc == 4); - string srcfile(argv[1+decompress]), dstfile; + string srcfile(argv[1 + decompress]), dstfile; if (argc == 2) { dstfile = srcfile + ".fsst"; } else { - dstfile = argv[2+decompress]; + dstfile = argv[2 + decompress]; } ifstream src; ofstream dst; @@ -132,28 +142,28 @@ int main(int argc, char* argv[]) { dst.exceptions(ios_base::badbit); src.exceptions(ios_base::badbit); if (decompress) { - unsigned char tmp[3]; - src.read((char*) tmp, 3); - if (src.gcount() != 3) { - cerr << "failed to open input." << endl; - return -1; - } - blksz = DESERIALIZE(tmp); // read first block size + unsigned char tmp[3]; + src.read((char *) tmp, 3); + if (src.gcount() != 3) { + cerr << "failed to open input." << endl; + return -1; + } + blksz = DESERIALIZE(tmp); // read first block size } - vector buffer(FSST_MEMBUF*6); + vector buffer(FSST_MEMBUF * 6); srcBuf[0] = buffer.data(); - srcBuf[1] = srcBuf[0] + (FSST_MEMBUF*(1ULL+decompress)); - dstMem[0] = srcBuf[1] + (FSST_MEMBUF*(1ULL+decompress)); - dstMem[1] = dstMem[0] + (FSST_MEMBUF*(2ULL-decompress)); + srcBuf[1] = srcBuf[0] + (FSST_MEMBUF * (1ULL + decompress)); + dstMem[0] = srcBuf[1] + (FSST_MEMBUF * (1ULL + decompress)); + dstMem[1] = dstMem[0] + (FSST_MEMBUF * (2ULL - decompress)); - for(int swap=0; swap<2; swap++) { + for (int swap = 0; swap < 2; swap++) { srcDoneCPU[swap].post(); // input buffer is not being processed initially dstDoneIO[swap].post(); // output buffer is not being written initially } - thread readerThread([&src]{ reader(src); }); - thread writerThread([&dst]{ writer(dst); }); + thread readerThread([&src] { reader(src); }); + thread writerThread([&dst] { writer(dst); }); - for(int swap=0; true; swap = 1-swap) { + for (int swap = 0; true; swap = 1 - swap) { srcDoneIO[swap].wait(); // wait until input buffer is available (i.e. 
done reading) dstDoneIO[swap].wait(); // wait until output buffer is ready writing hence free for use if (srcLen[swap] == 0) { @@ -161,33 +171,37 @@ int main(int argc, char* argv[]) { break; } if (decompress) { - fsst_decoder_t decoder; - size_t hdr = fsst_import(&decoder, srcBuf[swap]); - dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, dstBuf[swap] = dstMem[swap]); + fsst_decoder_t decoder; + size_t hdr = fsst_import(&decoder, srcBuf[swap]); + dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, + dstBuf[swap] = dstMem[swap]); } else { unsigned char tmp[FSST_MAXHEADER]; - fsst_encoder_t* encoder = fsst_create(1, &srcLen[swap], const_cast(&srcBuf[swap]), 0); + fsst_encoder_t *encoder = fsst_create(1, &srcLen[swap], const_cast(&srcBuf[swap]), + 0); size_t hdr = fsst_export(encoder, tmp); if (fsst_compress(encoder, 1, &srcLen[swap], const_cast(&srcBuf[swap]), FSST_MEMBUF * 2, dstMem[swap] + FSST_MAXHEADER + 3, &dstLen[swap], &dstBuf[swap]) < 1) return -1; dstLen[swap] += 3 + hdr; - dstBuf[swap] -= 3 + hdr; - SERIALIZE(dstLen[swap],dstBuf[swap]); // block starts with size - copy(tmp, tmp+hdr, dstBuf[swap]+3); // then the header (followed by the compressed bytes which are already there) - fsst_destroy(encoder); + dstBuf[swap] -= 3 + hdr; + SERIALIZE(dstLen[swap], dstBuf[swap]); // block starts with size + copy(tmp, tmp + hdr, + dstBuf[swap] + 3); // then the header (followed by the compressed bytes which are already there) + fsst_destroy(encoder); } srcTot += srcLen[swap]; dstTot += dstLen[swap]; srcDoneCPU[swap].post(); // input buffer may be re-used by the reader for the next block dstDoneCPU[swap].post(); // output buffer is ready for writing out } - cerr << (decompress?"Dec":"C") << "ompressed " << srcTot << " bytes into " << dstTot << " bytes ==> " << (int) ((100*dstTot)/srcTot) << "%" << endl; + cerr << (decompress ? "Dec" : "C") << "ompressed " << srcTot << " bytes into " << dstTot << " bytes ==> " + << (int) ((100 * dstTot) / srcTot) << "%" << endl; // force wait until all background writes finished stopThreads = true; - for(int swap=0; swap<2; swap++) { + for (int swap = 0; swap < 2; swap++) { srcDoneCPU[swap].post(); dstDoneCPU[swap].post(); } diff --git a/fsst_avx512.cpp b/fsst_avx512.cpp index a2b7b5e..9ce0ab4 100644 --- a/fsst_avx512.cpp +++ b/fsst_avx512.cpp @@ -21,23 +21,30 @@ #include #ifdef _WIN32 +namespace libfsst { bool fsst_hasAVX512() { int info[4]; __cpuidex(info, 0x00000007, 0); return (info[1]>>16)&1; } +} #else #include -bool fsst_hasAVX512() { - int info[4]; - __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); - return (info[1]>>16)&1; +namespace libfsst { + bool fsst_hasAVX512() { + int info[4]; + __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); + return (info[1] >> 16) & 1; + } } #endif #else +namespace libfsst { bool fsst_hasAVX512() { return false; } +} #endif +namespace libfsst { // BULK COMPRESSION OF STRINGS // // In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. @@ -70,14 +77,15 @@ bool fsst_hasAVX512() { return false; } // This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. // Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. 
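//
// A hedged sketch of the caller contract (it mirrors the call site in compressSIMD()
// in libfsst.cpp from this same patch; no names below are new):
//
//   SIMDjob input[512], output[512];  // cur/end: string offsets, pos: job id, out: output offset
//   size_t done = fsst_compressAVX512(symbolTable, codeBase, symbolBase, input, output, n, 3);
//   for (; done < n; done++)          // the kernel may return early (processed <= n)
//      output[done] = input[done];    // unfinished jobs fall through to scalar finishing
//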
-size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { - size_t processed = 0; - // define some constants (all_x means that all 8 lanes contain 64-bits value X) + size_t fsst_compressAVX512(SymbolTable &symbolTable, u8 *codeBase, u8 *symbolBase, SIMDjob *input, SIMDjob *output, + size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) #ifdef __AVX512F__ - //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c - __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); - __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); - __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); #define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) #define all_ONE _mm512_srli_epi64(all_MASK, 63) #define all_M19 _mm512_srli_epi64(all_MASK, 45) @@ -87,54 +95,56 @@ size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBas #define all_FFFF _mm512_srli_epi64(all_MASK, 48) #define all_FF _mm512_srli_epi64(all_MASK, 56) - SIMDjob *inputEnd = input+n; - assert(n >= unroll*8 && n <= 512); // should be close to 512 - __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 - __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 2b loaded new strings bitmask per unroll - u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll - if (unroll >= 4) { - while (input+delta1+delta2+delta3+delta4 < inputEnd) { - #include "fsst_avx512_unroll4.inc" - } - } else if (unroll == 3) { - while (input+delta1+delta2+delta3 < inputEnd) { - #include "fsst_avx512_unroll3.inc" - } - } else if (unroll == 2) { - while (input+delta1+delta2 < inputEnd) { - #include "fsst_avx512_unroll2.inc" - } - } else { - while (input+delta1 < inputEnd) { - #include "fsst_avx512_unroll1.inc" - } - } + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { +#include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { +#include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { +#include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { +#include "fsst_avx512_unroll1.inc" + } + } - // flush the job states of the unfinished strings at the 
end of output[] - processed = n - (inputEnd - input); - u32 unfinished = 0; - if (unroll > 1) { - if (unroll > 2) { - if (unroll > 3) { - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); - unfinished += _mm_popcnt_u32((int) loadmask4); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); - unfinished += _mm_popcnt_u32((int) loadmask3); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); - unfinished += _mm_popcnt_u32((int) loadmask2); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); #else - (void) symbolTable; - (void) codeBase; - (void) symbolBase; - (void) input; - (void) output; - (void) n; - (void) unroll; + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; #endif - return processed; + return processed; + } } + diff --git a/libfsst.cpp b/libfsst.cpp index 5285173..d6836bc 100644 --- a/libfsst.cpp +++ b/libfsst.cpp @@ -17,20 +17,23 @@ // You can contact the authors via the FSST source repository : https://github.com/cwida/fsst #include "libfsst.hpp" -Symbol concat(Symbol a, Symbol b) { - Symbol s; - u32 length = a.length()+b.length(); - if (length > Symbol::maxLength) length = Symbol::maxLength; - s.set_code_len(FSST_CODE_MASK, length); - s.val.num = (b.val.num << (8*a.length())) | a.val.num; - return s; +namespace libfsst { + Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length() + b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.val.num = (b.val.num << (8 * a.length())) | a.val.num; + return s; } +} + namespace std { template <> -class hash { +class hash { public: - size_t operator()(const QSymbol& q) const { + size_t operator()(const libfsst::QSymbol& q) const { uint64_t k = q.symbol.val.num; const uint64_t m = 0xc6a4a7935bd1e995; const int r = 47; @@ -48,572 +51,605 @@ class hash { }; } -bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } +namespace libfsst { + + bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + + std::ostream &operator<<(std::ostream &out, const Symbol &s) { + for (u32 i = 0; i < s.length(); i++) + out << s.val.str[i]; + return out; + } + + SymbolTable * + buildSymbolTable(Counters &counters, vector line, const size_t len[], bool zeroTerminated = false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for (size_t i = 0; i < line.size(); i++) { + const u8 *cur = line[i]; + const u8 *end = cur + len[i]; + while (cur < end) byteHisto[*cur++]++; + } + u32 minSize = FSST_SAMPLEMAXSZ, i = st->terminator = 256; + while (i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); -std::ostream& operator<<(std::ostream& out, const Symbol& s) { - for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { - SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); - int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) - size_t sampleFrac = 128; - - // start by determining the terminator. We use the (lowest) most infrequent byte as terminator - st->zeroTerminated = zeroTerminated; - if (zeroTerminated) { - st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency - } else { - u16 byteHisto[256]; - memset(byteHisto, 0, sizeof(byteHisto)); - for(size_t i=0; iterminator = 256; - while(i-- > 0) { - if (byteHisto[i] > minSize) continue; - st->terminator = i; - minSize = byteHisto[i]; - } - } - assert(st->terminator != 256); - - // a random number between 0 and 128 - auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; - - // compress sample, and compute (pair-)frequencies - auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain - int gain = 0; - - for(size_t i=0; i sampleFrac) continue; - } - if (cur < end) { - u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); - cur += st->symbols[code1].length(); - gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); - while (true) { - // count single symbol (i.e. an option is not extending it) - counters.count1Inc(code1); - - // as an alternative, consider just using the next byte.. - if (st->symbols[code1].length() != 1) // .. but do not count single byte symbols doubly - counters.count1Inc(*start); - - if (cur==end) { - break; - } - - // now match a new symbol - start = cur; - if (curhashTabSize-1); - Symbol s = st->hashTab[idx]; - code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; - word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) { - code2 = s.code(); - cur += s.length(); - } else if (code2 >= FSST_CODE_BASE) { - cur += 2; - } else { - code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; - cur += 1; - } - } else { - code2 = st->findLongestSymbol(cur, end); - cur += st->symbols[code2].length(); - } - - // compute compressed output size - gain += ((int) (cur-start))-(1+isEscapeCode(code2)); - - if (sampleFrac < 128) { // no need to count pairs in final round - // consider the symbol that is the concatenation of the two last symbols - counters.count2Inc(code1, code2); - - // as an alternative, consider just extending with the next byte.. 
- if ((cur-start) > 1) // ..but do not count single byte extensions doubly - counters.count2Inc(code1, *start); - } - code1 = code2; - } - } - } - return gain; - }; - - auto makeTable = [&](SymbolTable *st, Counters &counters) { - // hashmap of c (needed because we can generate duplicate candidates) - unordered_set cands; - - // artificially make terminater the most frequent symbol so it gets included - u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; - counters.count1Set(terminator,65535); - - auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { - if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! - QSymbol q; - q.symbol = s; - q.gain = count * s.length(); - auto it = cands.find(q); - if (it != cands.end()) { - q.gain += (*it).gain; - cands.erase(*it); - } - cands.insert(q); - }; - - // add candidate symbols based on counted frequency - for (u32 pos1=0; pos1nSymbols; pos1++) { - u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! - if (!cnt1) continue; - - // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed - Symbol s1 = st->symbols[pos1]; - addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); - - if (sampleFrac >= 128 || // last round we do not create new (combined) symbols - s1.length() == Symbol::maxLength || // symbol cannot be extended - s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte - continue; - } - for (u32 pos2=0; pos2nSymbols; pos2++) { - u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! - if (!cnt2) continue; - - // create a new symbol - Symbol s2 = st->symbols[pos2]; - Symbol s3 = concat(s1, s2); - if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte - addOrInc(cands, s3, cnt2); - } - } - - // insert candidates into priority queue (by gain) - auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); }; - priority_queue,decltype(cmpGn)> pq(cmpGn); - for (auto& q : cands) - pq.push(q); - - // Create new symbol map using best candidates - st->clear(); - while (st->nSymbols < 255 && !pq.empty()) { - QSymbol q = pq.top(); - pq.pop(); - st->add(q.symbol); - } - }; - - u8 bestCounters[512*sizeof(u16)]; -#ifdef NONOPT_FSST - for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { - sampleFrac = frac; -#else - for(sampleFrac=8; true; sampleFrac += 30) { -#endif - memset(&counters, 0, sizeof(Counters)); - long gain = compressCount(st, counters); - if (gain >= bestGain) { // a new best solution! 
- counters.backup1(bestCounters); - *bestTable = *st; bestGain = gain; - } - if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) - makeTable(st, counters); - } - delete st; - counters.restore1(bestCounters); - makeTable(bestTable, counters); - bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression - return bestTable; -} + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; -static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { - size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; - u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings - SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer - SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) - size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) - - while (curLine < nlines && outOff <= (1<<19)) { - size_t prevLine = curLine, chunk, curOff = 0; - - // bail out if the output buffer cannot hold the compressed next string fully - if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 - else budget -= (len[curLine]-curOff)*2; - - strOut[curLine] = (u8*) 0; - lenOut[curLine] = 0; - - do { - do { - chunk = len[curLine] - curOff; - if (chunk > 511) { - chunk = 511; // large strings need to be chopped up into segments of 511 bytes - } - // create a job in this batch - SIMDjob job; - job.cur = inOff; - job.end = job.cur + chunk; - job.pos = batchPos; - job.out = outOff; - - // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) - outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes. - if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk - - // register job in this batch - input[batchPos] = job; - jobLine[batchPos] = curLine; - - if (chunk == 0) { - empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out - } else { - // copy string chunk into temp buffer - memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); - inOff += chunk; - curOff += chunk; - symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + for (size_t i = 0; i < line.size(); i++) { + const u8 *cur = line[i], *start = cur; + const u8 *end = cur + len[i]; + + if (sampleFrac < 128) { + // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x) + if (rnd128(i) > sampleFrac) continue; + } + if (cur < end) { + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length() - (1 + isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur == end) { + break; + } + + // now match a new symbol + start = cur; + if (cur < end - 7) { + u64 word = fsst_unaligned_load(cur); + size_t code = word & 0xFFFFFF; + size_t idx = FSST_HASH(code) & (st->hashTabSize - 1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur - start)) - (1 + isEscapeCode(code2)); + + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur - start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } } - if (++batchPos == 512) break; - } while(curOff < len[curLine]); - - if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines)) { // cannot accumulate more? - if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) - // radix-sort jobs on length (longest string first) - // -- this provides best load balancing and allows to skip empty jobs at the end - u16 sortpos[513]; - memset(sortpos, 0, sizeof(sortpos)); - - // calculate length histo - for(size_t i=0; i cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols ? FSST_CODE_BASE : st->terminator; + counters.count1Set(terminator, 65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5 * sampleFrac) / 128) + return; // improves both compression speed (less candidates), but also quality!! + QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); + if (it != cands.end()) { + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1 = 0; pos1 < FSST_CODE_BASE + (size_t) st->nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length() == 1) ? 8LL : 1LL) * cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2 = 0; pos2 < FSST_CODE_BASE + (size_t) st->nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); } - // finally.. 
SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..) - for(size_t done = fsst_compressAVX512(symbolTable, codeBase, symbolBase, inputOrdered, output, batchPos-empty, unroll); - done < batchPos; done++) output[done] = inputOrdered[done]; - } else { - memcpy(output, input, batchPos*sizeof(SIMDjob)); } - - // finish encoding (unfinished strings in process, plus the few last strings not yet processed) - for(size_t i=0; i> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { - *out++ = (u8) s.code(); cur += s.length(); - } else { - // could be a 2-byte or 1-byte code, or miss - // handle everything with predication - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); - cur += (code>>FSST_LEN_BITS); - } - } - job.out = out - codeBase; - } - // postprocess job info - job.cur = 0; - job.end = job.out - input[job.pos].out; // misuse .end field as compressed size - job.out = input[job.pos].out; // reset offset to start of encoded string - input[job.pos] = job; + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol &q1, const QSymbol &q2) { + return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); + }; + priority_queue, decltype(cmpGn)> pq(cmpGn); + for (auto &q: cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); } - - // copy out the result data - for(size_t i=0; i= bestGain) { // a new best solution! + counters.backup1(bestCounters); + *bestTable = *st; + bestGain = gain; } - - // go for the next batch of 512 chunks - inOff = outOff = batchPos = empty = 0; - budget = (size_t) (lim - dst); - } - } while (curLine == prevLine && outOff <= (1<<19)); - } - return curLine; -} + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; + } + + static inline size_t + compressSIMD(SymbolTable &symbolTable, u8 *symbolBase, size_t nlines, const size_t len[], const u8 *line[], + size_t size, u8 *dst, size_t lenOut[], u8 *strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1 << 18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1 << 19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine] - curOff) * 2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine] - curOff) * 2; + + strOut[curLine] = (u8 *) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out 
= outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2 * (size_t) (job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes. + if (outOff > (1 << 19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while (curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1 << 19)) || (++curLine >= nlines)) { // cannot accumulate more? + if (batchPos - empty >= + 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for (size_t i = 0; i < batchPos; i++) { + size_t len = input[i].end - input[i].cur; + sortpos[512UL - len]++; + } + // calculate running sum + for (size_t i = 1; i <= 512; i++) + sortpos[i] += sortpos[i - 1]; + + // move jobs to their final destination + SIMDjob inputOrdered[512]; + for (size_t i = 0; i < batchPos; i++) { + size_t len = input[i].end - input[i].cur; + size_t pos = sortpos[511UL - len]++; + inputOrdered[pos] = input[i]; + } + // finally.. SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..) 
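+                    // note: jobs the kernel did not finish (its return value can be less than
+                    // batchPos-empty), plus the empty jobs radix-sorted to the tail, are copied
+                    // through unchanged so the scalar finishing loop below completes them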
+ for (size_t done = fsst_compressAVX512(symbolTable, codeBase, symbolBase, inputOrdered, output, + batchPos - empty, unroll); + done < batchPos; done++) + output[done] = inputOrdered[done]; + } else { + memcpy(output, input, batchPos * sizeof(SIMDjob)); + } + + // finish encoding (unfinished strings in process, plus the few last strings not yet processed) + for (size_t i = 0; i < batchPos; i++) { + SIMDjob job = output[i]; + if (job.cur < job.end) { // finish encoding this string with scalar code + u8 *cur = symbolBase + job.cur; + u8 *end = symbolBase + job.end; + u8 *out = codeBase + job.out; + while (cur < end) { + u64 word = fsst_unaligned_load(cur); + size_t code = symbolTable.shortCodes[word & 0xFFFF]; + size_t pos = word & 0xFFFFFF; + size_t idx = FSST_HASH(pos) & (symbolTable.hashTabSize - 1); + Symbol s = symbolTable.hashTab[idx]; + out[1] = (u8) word; // speculatively write out escaped byte + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + *out++ = (u8) s.code(); + cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1 + ((code & FSST_CODE_BASE) >> 8); + cur += (code >> FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for (size_t i = 0; i < batchPos; i++) { + size_t lineNr = jobLine[i]; // the sort must be order-preserving, as we concatenate results string in order + size_t sz = input[i].end; // had stored compressed lengths here + if (!strOut[lineNr]) strOut[lineNr] = dst; // first segment will be the strOut pointer + lenOut[lineNr] += sz; // add segment (lenOut starts at 0 for this reason) + memcpy(dst, codeBase + input[i].out, sz); + dst += sz; + } + + // go for the next batch of 512 chunks + inOff = outOff = batchPos = empty = 0; + budget = (size_t) (lim - dst); + } + } while (curLine == prevLine && outOff <= (1 << 19)); + } + return curLine; + } // optimized adaptive *scalar* compression method -static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, const size_t lenIn[], const u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) { - const u8 *cur = NULL, *end = NULL, *lim = out + size; - size_t curLine, suffixLim = symbolTable.suffixLim; - u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0]; - - u8 buf[512+8] = {}; /* +8 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */ - - // three variants are possible. 
dead code falls away since the bool arguments are constants - auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) { - while (cur < end) { - u64 word = fsst_unaligned_load(cur); - size_t code = symbolTable.shortCodes[word & 0xFFFF]; - if (noSuffixOpt && ((u8) code) < suffixLim) { - // 2 byte code without having to worry about longer matches - *out++ = (u8) code; cur += 2; - } else { - size_t pos = word & 0xFFFFFF; - size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1); - Symbol s = symbolTable.hashTab[idx]; - out[1] = (u8) word; // speculatively write out escaped byte - word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { - *out++ = (u8) s.code(); cur += s.length(); - } else if (avoidBranch) { - // could be a 2-byte or 1-byte code, or miss - // handle everything with predication - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); - cur += (code>>FSST_LEN_BITS); - } else if ((u8) code < byteLim) { - // 2 byte code after checking there is no longer pattern - *out++ = (u8) code; cur += 2; - } else { - // 1 byte code or miss. - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse - cur++; + static inline size_t + compressBulk(SymbolTable &symbolTable, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *out, + size_t lenOut[], u8 *strOut[], bool noSuffixOpt, bool avoidBranch) { + const u8 *cur = NULL, *end = NULL, *lim = out + size; + size_t curLine, suffixLim = symbolTable.suffixLim; + u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0]; + + u8 buf[512 + 8] = {}; /* +8 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */ + + // three variants are possible. dead code falls away since the bool arguments are constants + auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) { + while (cur < end) { + u64 word = fsst_unaligned_load(cur); + size_t code = symbolTable.shortCodes[word & 0xFFFF]; + if (noSuffixOpt && ((u8) code) < suffixLim) { + // 2 byte code without having to worry about longer matches + *out++ = (u8) code; + cur += 2; + } else { + size_t pos = word & 0xFFFFFF; + size_t idx = FSST_HASH(pos) & (symbolTable.hashTabSize - 1); + Symbol s = symbolTable.hashTab[idx]; + out[1] = (u8) word; // speculatively write out escaped byte + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + *out++ = (u8) s.code(); + cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1 + ((code & FSST_CODE_BASE) >> 8); + cur += (code >> FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; + cur += 2; + } else { + // 1 byte code or miss. 
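+                            // note: the miss entry installed by finalize() keeps the
+                            // FSST_CODE_BASE bit set, so 'out' advances 2 bytes on a miss
+                            // (escape code here, escaped byte already speculated into out[1])
+                            // and 1 byte for a true 1-byte code; 'cur' advances 1 either way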
+ *out = (u8) code; + out += 1 + ((code & FSST_CODE_BASE) + >> 8); // predicated - tested with a branch, that was always worse + cur++; + } + } } - } - } - }; - - for(curLine=0; curLine 511) { - chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST - } - if ((2*chunk+7) > (size_t) (lim-out)) { - return curLine; // out of memory - } - // copy the string to the 511-byte buffer - memcpy(buf, cur, chunk); - buf[chunk] = (u8) symbolTable.terminator; - cur = buf; - end = cur + chunk; - - // based on symboltable stats, choose a variant that is nice to the branch predictor - if (noSuffixOpt) { - compressVariant(true,false); - } else if (avoidBranch) { - compressVariant(false,true); - } else { - compressVariant(false, false); - } - } while((curOff += chunk) < lenIn[curLine]); - lenOut[curLine] = (size_t) (out - strOut[curLine]); - } - return curLine; -} + }; + + for (curLine = 0; curLine < nlines; curLine++) { + size_t chunk, curOff = 0; + strOut[curLine] = out; + do { + cur = strIn[curLine] + curOff; + chunk = lenIn[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2 * chunk + 7) > (size_t) (lim - out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true, false); + } else if (avoidBranch) { + compressVariant(false, true); + } else { + compressVariant(false, false); + } + } while ((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; + } #define FSST_SAMPLELINE ((size_t) 512) // quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes -vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { - size_t totSize = 0; - const size_t *lenIn = *lenRef; - vector sample; - - for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample - Encoder *encoder = new Encoder(); - encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); - if (sampleLen != lenIn) delete[] sampleLen; - delete[] sampleBuf; - return (fsst_encoder_t*) encoder; -} + vector makeSample(u8 *sampleBuf, const u8 *strIn[], const size_t **lenRef, size_t nlines) { + size_t totSize = 0; + const size_t *lenIn = *lenRef; + vector sample; + + for (size_t i = 0; i < nlines; i++) + totSize += lenIn[i]; + + if (totSize < FSST_SAMPLETARGET) { + for (size_t i = 0; i < nlines; i++) + sample.push_back(strIn[i]); + } else { + size_t sampleRnd = FSST_HASH(4637947); + const u8 *sampleLim = sampleBuf + FSST_SAMPLETARGET; + size_t *sampleLen = new size_t[nlines + FSST_SAMPLEMAXSZ / FSST_SAMPLELINE]; + *lenRef = sampleLen; + size_t *sampleLenLim = sampleLen + nlines + FSST_SAMPLEMAXSZ / FSST_SAMPLELINE; + + while (sampleBuf < sampleLim && sampleLen < sampleLenLim) { + // choose a non-empty line + sampleRnd = FSST_HASH(sampleRnd); + size_t linenr = sampleRnd % nlines; + while (lenIn[linenr] == 0) + if (++linenr == nlines) linenr = 0; + + // choose a chunk + size_t chunks = 1 + ((lenIn[linenr] - 1) / FSST_SAMPLELINE); + sampleRnd = 
FSST_HASH(sampleRnd); + size_t chunk = FSST_SAMPLELINE * (sampleRnd % chunks); + + // add the chunk to the sample + size_t len = min(lenIn[linenr] - chunk, FSST_SAMPLELINE); + memcpy(sampleBuf, strIn[linenr] + chunk, len); + sample.push_back(sampleBuf); + sampleBuf += *sampleLen++ = len; + } + } + return sample; + } + + extern "C" fsst_encoder_t *fsst_create(size_t n, const size_t lenIn[], const u8 *strIn[], int zeroTerminated) { + u8 *sampleBuf = new u8[FSST_SAMPLEMAXSZ]; + const size_t *sampleLen = lenIn; + vector sample = makeSample(sampleBuf, strIn, &sampleLen, n ? n + : 1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr( + buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t *) encoder; + } /* create another encoder instance, necessary to do multi-threaded encoding using the same symbol table */ -extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { - Encoder *e = new Encoder(); - e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr - return (fsst_encoder_t*) e; -} + extern "C" fsst_encoder_t *fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder *) encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t *) e; + } // export a symbol table in compact format. -extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { - Encoder *e = (Encoder*) encoder; - // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. - // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t - // (such functionality could be useful to append compressed data to an existing block). - // - // However, the hash function in the encoder hash table is endian-sensitive, and given its - // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. - // Doing a endian-conversion during hashing will be slow and self-defeating. - // - // Overall, we could support reconstructing an encoder for incremental compression, but - // should enforce equal-endianness. Bit of a bummer. Not going there now. - // - // The version field is now there just for future-proofness, but not used yet - - // version allows keeping track of fsst versions, track endianness, and encoder reconstruction - u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 - (((u64) e->symbolTable->suffixLim) << 24) | - (((u64) e->symbolTable->terminator) << 16) | - (((u64) e->symbolTable->nSymbols) << 8) | - FSST_ENDIAN_MARKER; // least significant byte is nonzero - - /* do not assume unaligned reads here */ - memcpy(buf, &version, 8); - buf[8] = e->symbolTable->zeroTerminated; - for(u32 i=0; i<8; i++) - buf[9+i] = (u8) e->symbolTable->lenHisto[i]; - u32 pos = 17; - - // emit only the used bytes of the symbols - for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) - for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) - buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes - - return pos; // length of what was serialized -} + extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder *) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. 
+ // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. + // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for (u32 i = 0; i < 8; i++) + buf[9 + i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for (u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for (u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized + } #define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ -extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) { - u64 version = 0; - u32 code, pos = 17; - u8 lenHisto[8]; - - // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) - memcpy(&version, buf, 8); - if ((version>>32) != FSST_VERSION) return 0; - decoder->zeroTerminated = buf[8]&1; - memcpy(lenHisto, buf+9, 8); - - // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) - decoder->len[0] = 1; - decoder->symbol[0] = 0; - - // we use lenHisto[0] as 1-byte symbol run length (at the end) - code = decoder->zeroTerminated; - if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end - - // now get all symbols from the buffer - for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ - for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { - decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ - decoder->symbol[code] = 0; - for(u32 j=0; jlen[code]; j++) - ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols - } - } - if (decoder->zeroTerminated) lenHisto[0]++; + extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + if ((version >> 32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8] & 1; + memcpy(lenHisto, buf + 9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) 
lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end + + // now get all symbols from the buffer + for (u32 l = 1; l <= 8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for (u32 i = 0; i < lenHisto[(l & 7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { + decoder->len[code] = (l & 7) + 1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for (u32 j = 0; j < decoder->len[code]; j++) + ((u8 *) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; - // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). - while(code<255) { - decoder->symbol[code] = FSST_CORRUPT; - decoder->len[code++] = 8; - } - return pos; -} + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while (code < 255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; + } // runtime check for simd -inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + inline size_t + _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, + size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { #ifndef NONOPT_FSST - if (simd && fsst_hasAVX512()) - return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); #endif - (void) simd; - return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); -} -size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { - return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); -} + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, + avoidBranch); + } + + size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, + size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); + } // adaptive choosing of scalar compression method based on symbol length histogram -inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { - bool avoidBranch = false, noSuffixOpt = false; - if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { - noSuffixOpt = true; - } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && - (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && - (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { - avoidBranch = true; - } - return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); -} -size_t 
compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { - return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); + inline size_t + _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, + size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100 * e->symbolTable->lenHisto[1] > 65 * e->symbolTable->nSymbols && + 100 * e->symbolTable->suffixLim > 95 * e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || + e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); + } + + size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, + size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); + } } // the main compression function (everything automatic) @@ -621,12 +657,12 @@ extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const si // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) size_t totLen = accumulate(lenIn, lenIn+nlines, 0); int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); - return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); + return _compressAuto((libfsst::Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); } /* deallocate encoder */ extern "C" void fsst_destroy(fsst_encoder_t* encoder) { - Encoder *e = (Encoder*) encoder; + libfsst::Encoder *e = (libfsst::Encoder*) encoder; delete e; } @@ -639,3 +675,4 @@ extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; return decoder; } + diff --git a/libfsst.hpp b/libfsst.hpp index 6a38ab4..cb04bd7 100644 --- a/libfsst.hpp +++ b/libfsst.hpp @@ -51,62 +51,85 @@ typedef uint64_t u64; // we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length #define FSST_LEN_BITS 12 -#define FSST_CODE_BITS 9 +#define FSST_CODE_BITS 9 #define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ #define FSST_CODE_MAX (1UL<=8) { - len = 8; - memcpy(val.str, input, 8); + if (len >= 8) { + len = 8; + memcpy(val.str, input, 8); } else { - memcpy(val.str, input, len); + memcpy(val.str, input, len); } set_code_len(FSST_CODE_MAX, len); } - void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + void set_code_len(u32 code, u32 len) { icl = (len << 28) | (code << 16) | ((8 - len) * 8); } u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } - u8 first() const { assert( length() >= 1); return 0xFF & val.num; } - u16 first2() const { assert( length() >= 2); return 0xFFFF & val.num; } + u8 first() const { + assert(length() >= 1); + return 0xFF & val.num; + } -#define FSST_HASH_LOG2SIZE 10 + u16 first2() const { + assert(length() >= 2); + return 0xFFFF & val.num; + } + +#define FSST_HASH_LOG2SIZE 10 #define FSST_HASH_PRIME 2971215073LL #define FSST_SHIFT 15 #define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) - size_t hash() const { size_t v = 0xFFFFFF & val.num; return FSST_HASH(v); } // hash on the next 3 bytes + + size_t hash() const { + size_t v = 0xFFFFFF & val.num; + return FSST_HASH(v); + } // hash on the next 3 bytes }; // Symbol that can be put in a queue, ordered on gain -struct QSymbol{ +struct QSymbol { Symbol symbol; mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols - bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } + bool operator==(const QSymbol &other) const { + return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); + } }; // we construct FSST symbol tables using a random sample of about 16KB (1<<14) @@ -144,7 +167,7 @@ struct QSymbol{ // the gain field is only used in the symbol queue that sorts symbols on gain struct SymbolTable { - static const u32 hashTabSize = 1<> (u8) s.icl); return true; } + bool add(Symbol s) { assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); u32 len = s.length(); s.set_code_len(FSST_CODE_BASE + nSymbols, len); if (len == 1) { - byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { - return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + size_t idx = s.hash() & (hashTabSize - 1); + if (hashTab[idx].icl <= s.icl && + hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl >> 16) & FSST_CODE_MASK; // matched a long symbol } if (s.length() >= 2) { - u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; - if (code >= FSST_CODE_BASE) return code; + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; } return byteCodes[s.first()] & FSST_CODE_MASK; } - u16 findLongestSymbol(const u8* cur, const u8* end) const { - return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + + u16 findLongestSymbol(const u8 *cur, const u8 *end) const { + return findLongestSymbol(Symbol(cur, end)); // represent the string as a temporary symbol } // rationale for finalize: @@ -275,53 +303,53 @@ struct SymbolTable { // finally, shortCodes[] is modified 
 // Symbol that can be put in a queue, ordered on gain
-struct QSymbol{
+struct QSymbol {
    Symbol symbol;
    mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols
-   bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); }
+   bool operator==(const QSymbol &other) const {
+      return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length();
+   }
 };
 
 // we construct FSST symbol tables using a random sample of about 16KB (1<<14)
@@ -144,7 +167,7 @@ struct QSymbol{
 // the gain field is only used in the symbol queue that sorts symbols on gain
 struct SymbolTable {
-   static const u32 hashTabSize = 1<<FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
+   static const u32 hashTabSize = 1 << FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
 
    // lookup table using the next two bytes (65536 codes), or just the next single byte
    u16 shortCodes[65536];
 
    // lookup table (only used during symbolTable construction, not during normal text compression)
    u16 byteCodes[256];
 
    // 'symbols' is the current symbol table: symbols[code] is the symbol for a given code
    Symbol symbols[FSST_CODE_MAX];
 
    // replicate long symbols in hashTab (avoid indirection)
    Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes
 
    u16 nSymbols;                 // amount of symbols in the map (max 255)
    u16 suffixLim;                // codes higher than this do not have a longer suffix
    u16 terminator;               // code of 1-byte symbol that can be used as a terminator during compression
    bool zeroTerminated;          // whether we are expecting zero-terminated strings
    u16 lenHisto[FSST_CODE_BITS]; // lenHisto[x] is the amount of symbols of byte-length (x+1)
 
    bool hashInsert(Symbol s) {
       size_t idx = s.hash() & (hashTabSize - 1);
       bool taken = (hashTab[idx].icl < FSST_ICL_FREE);
       if (taken) return false; // collision in hash table
       hashTab[idx].icl = s.icl;
       hashTab[idx].val.num = s.val.num & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
       return true;
    }
+
    bool add(Symbol s) {
       assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX);
       u32 len = s.length();
       s.set_code_len(FSST_CODE_BASE + nSymbols, len);
       if (len == 1) {
-         byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<<FSST_LEN_BITS); // len=1
+         byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1 << FSST_LEN_BITS); // len=1
       } else if (len == 2) {
-         shortCodes[s.first2()] = FSST_CODE_BASE + nSymbols + (2<<FSST_LEN_BITS); // len=2
+         shortCodes[s.first2()] = FSST_CODE_BASE + nSymbols + (2 << FSST_LEN_BITS); // len=2
       } else if (!hashInsert(s)) {
          return false;
       }
       symbols[FSST_CODE_BASE + nSymbols++] = s;
       lenHisto[len - 1]++;
       return true;
    }
+
    u16 findLongestSymbol(Symbol s) const {
-      size_t idx = s.hash()&(hashTabSize-1);
-      if (hashTab[idx].icl <= s.icl && hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) {
-         return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol
+      size_t idx = s.hash() & (hashTabSize - 1);
+      if (hashTab[idx].icl <= s.icl &&
+          hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) {
+         return (hashTab[idx].icl >> 16) & FSST_CODE_MASK; // matched a long symbol
       }
       if (s.length() >= 2) {
-         u16 code = shortCodes[s.first2()] & FSST_CODE_MASK;
-         if (code >= FSST_CODE_BASE) return code;
+         u16 code = shortCodes[s.first2()] & FSST_CODE_MASK;
+         if (code >= FSST_CODE_BASE) return code;
       }
       return byteCodes[s.first()] & FSST_CODE_MASK;
    }
-   u16 findLongestSymbol(const u8* cur, const u8* end) const {
-      return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol
+
+   u16 findLongestSymbol(const u8 *cur, const u8 *end) const {
+      return findLongestSymbol(Symbol(cur, end)); // represent the string as a temporary symbol
    }
 
    // rationale for finalize:
@@ -275,53 +303,53 @@ struct SymbolTable {
    // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore).
    //
    void finalize(u8 zeroTerminated) {
-      assert(nSymbols <= 255);
-      u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated);
-
-      // compute running sum of code lengths (starting offsets for each length)
-      rsum[0] = byteLim; // 1-byte codes are highest
-      rsum[1] = zeroTerminated;
-      for(u32 i=1; i<7; i++)
-         rsum[i+1] = rsum[i] + lenHisto[i];
-
-      // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim)
-      suffixLim = rsum[1];
-      symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only)
-
-      for(u32 i=zeroTerminated, j=rsum[2]; i<nSymbols; i++) {
-         Symbol s1 = symbols[FSST_CODE_BASE+i];
-         u32 len = s1.length(), opt = (len == 2)*nSymbols;
-         if (opt) {
-            u16 first2 = s1.first2();
-            for(u32 k=0; k<opt; k++) {
-               Symbol s2 = symbols[FSST_CODE_BASE+k];
-               if (k != i && s2.length() > 1 && first2 == s2.first2()) // test if symbol k is a suffix of s
-                  opt = 0;
-            }
-            newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim
-         } else
-            newCode[i] = rsum[len-1]++;
-         s1.set_code_len(newCode[i],len);
-         symbols[newCode[i]] = s1;
-      }
-      // renumber the codes in byteCodes[]
-      for(u32 i=0; i<256; i++)
-         if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
-            byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS);
-         else
-            byteCodes[i] = 511 + (1 << FSST_LEN_BITS);
-
-      // renumber the codes in shortCodes[]
-      for(u32 i=0; i<65536; i++)
-         if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
-            shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS));
-         else
-            shortCodes[i] = byteCodes[i&0xFF];
-
-      // replace the symbols in the hash table
-      for(u32 i=0; i<hashTabSize; i++)
-         if (hashTab[i].icl < FSST_ICL_FREE)
-            hashTab[i] = symbols[newCode[(u8) hashTab[i].code()]];
+      assert(nSymbols <= 255);
+      u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated);
+
+      // compute running sum of code lengths (starting offsets for each length)
+      rsum[0] = byteLim; // 1-byte codes are highest
+      rsum[1] = zeroTerminated;
+      for (u32 i = 1; i < 7; i++)
+         rsum[i + 1] = rsum[i] + lenHisto[i];
+
+      // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim)
+      suffixLim = rsum[1];
+      symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only)
+
+      for (u32 i = zeroTerminated, j = rsum[2]; i < nSymbols; i++) {
+         Symbol s1 = symbols[FSST_CODE_BASE + i];
+         u32 len = s1.length(), opt = (len == 2) * nSymbols;
+         if (opt) {
+            u16 first2 = s1.first2();
+            for (u32 k = 0; k < opt; k++) {
+               Symbol s2 = symbols[FSST_CODE_BASE + k];
+               if (k != i && s2.length() > 1 && first2 == s2.first2()) // test if symbol k is a suffix of s
+                  opt = 0;
+            }
+            newCode[i] = opt ? suffixLim++ : --j; // symbols without a larger suffix have a code < suffixLim
+         } else
+            newCode[i] = rsum[len - 1]++;
+         s1.set_code_len(newCode[i], len);
+         symbols[newCode[i]] = s1;
+      }
+      // renumber the codes in byteCodes[]
+      for (u32 i = 0; i < 256; i++)
+         if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
+            byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS);
+         else
+            byteCodes[i] = 511 + (1 << FSST_LEN_BITS);
+
+      // renumber the codes in shortCodes[]
+      for (u32 i = 0; i < 65536; i++)
+         if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
+            shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS));
+         else
+            shortCodes[i] = byteCodes[i & 0xFF];
+
+      // replace the symbols in the hash table
+      for (u32 i = 0; i < hashTabSize; i++)
+         if (hashTab[i].icl < FSST_ICL_FREE)
+            hashTab[i] = symbols[newCode[(u8) hashTab[i].code()]];
    }
 };
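The rsum[] computation in finalize() is the classic counting-sort step: the length histogram is turned into starting offsets, so codes come out grouped by symbol length. A reduced sketch with hypothetical histogram values (it leaves out the suffixLim split of the 2-byte class, which the real code layers on top):

   #include <cstdint>
   #include <cstdio>

   // Counting-sort flavor of finalize(): turn a length histogram into the
   // first new code of each length class, then hand out codes in order.
   int main() {
       uint8_t lenHisto[8] = {10, 20, 5, 3, 0, 0, 0, 2}; // hypothetical: 10 1-byte symbols, 20 2-byte, ...
       uint8_t rsum[8];
       uint8_t nSymbols = 40, zeroTerminated = 0;
       rsum[0] = nSymbols - (lenHisto[0] - zeroTerminated); // 1-byte codes are placed highest
       rsum[1] = zeroTerminated;                            // multi-byte codes start at the bottom
       for (int i = 1; i < 7; i++)
           rsum[i + 1] = rsum[i] + lenHisto[i];
       for (int i = 0; i < 8; i++)
           printf("len=%d codes start at %d\n", i + 1, rsum[i]);
       return 0;
   }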
@@ -353,6 +381,7 @@ struct Counters {
    }
 };
 #else
+
 // we keep two counters count1[pos] and count2[pos1][pos2] of resp 16 and 12-bits. Both are split into two columns for performance reasons
 // first reason is to make the column we update the most during symbolTable construction (the low bits) thinner, thus reducing CPU cache pressure.
 // second reason is that when scanning the array, after seeing a 64-bits 0 in the high bits column, we can quickly skip over many codes (15 or 7)
@@ -360,28 +389,33 @@ struct Counters {
 // high arrays come before low arrays, because our GetNext() methods may overrun their 64-bits reads a few bytes
    u8 count1High[FSST_CODE_MAX]; // array to count frequency of symbols as they occur in the sample (16-bits)
    u8 count1Low[FSST_CODE_MAX];  // it is split into a low and a high byte: cnt = count1High*256 + count1Low
-   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
+   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX /
+                                2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
    u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX]; // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2)
    // 385KB -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB)
-
-   void count1Set(u32 pos1, u16 val) {
-      count1Low[pos1] = val&255;
-      count1High[pos1] = val>>8;
+
+   void count1Set(u32 pos1, u16 val) {
+      count1Low[pos1] = val & 255;
+      count1High[pos1] = val >> 8;
    }
-   void count1Inc(u32 pos1) {
+
+   void count1Inc(u32 pos1) {
      if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
        count1High[pos1]++;  //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)...
    }
-   void count2Inc(u32 pos1, u32 pos2) {
-      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
-         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively
-         count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample)
+
+   void count2Inc(u32 pos1, u32 pos2) {
+      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
+         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, respectively
+         count2High[pos1][(pos2) >> 1] +=
+            1 << (((pos2) & 1) << 2); // we take our chances with overflow.. (4K maxval, on an 8K sample)
    }
+
    u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range
       // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
       u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7]
 
-      u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes
+      u32 zero = high ? (__builtin_ctzl(high) >> 3) : 7UL; // number of zero bytes
       high = (high >> (zero << 3)) & 255; // advance to nonzero counter
       if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos1
          return 0; // all zero
@@ -390,12 +424,14 @@ struct Counters {
       if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
       return (u32) ((high << 8) + low);
    }
+
    u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range
       // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros
-      u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15]
-      high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters
+      u64 high = fsst_unaligned_load(
+         &count2High[pos1][pos2 >> 1]); // note: this reads 16 subsequent counters [pos2..pos2+15]
+      high >>= ((pos2 & 1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters
 
-      u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters
+      u32 zero = high ? (__builtin_ctzl(high) >> 2) : (15UL - (pos2 & 1UL)); // number of zero 4-bits counters
       high = (high >> (zero << 2)) & 15; // advance to nonzero counter
       if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2
          return 0UL; // all zero
@@ -404,15 +440,18 @@ struct Counters {
       if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
       return (u32) ((high << 8) + low);
    }
+
    void backup1(u8 *buf) {
       memcpy(buf, count1High, FSST_CODE_MAX);
-      memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
+      memcpy(buf + FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
    }
+
    void restore1(u8 *buf) {
       memcpy(count1High, buf, FSST_CODE_MAX);
-      memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX);
+      memcpy(count1Low, buf + FSST_CODE_MAX, FSST_CODE_MAX);
    }
-};
+};
+
 #endif
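The "increment high early" trick is subtle: the high byte is bumped when the low byte wraps *from* 0, so a nonzero high byte is exactly equivalent to a nonzero counter, which is what makes the 8-counters-at-a-time zero scan valid. A standalone toy version of the same invariant (same increment and read-back arithmetic, simplified to a single counter):

   #include <cassert>
   #include <cstdint>

   uint8_t lo = 0, hi = 0;

   void inc() {          // mirrors count1Inc(): increment high early
       if (!lo++) hi++;  // (0,0)->(1,1)->...->(255,1)->(0,1)->(1,2)->...
   }

   uint32_t get() {      // mirrors the read-back in count1GetNext()
       uint32_t h = hi;
       if (lo) h--;      // high was incremented early; undo unless low is 0
       return (h << 8) + lo;
   }

   int main() {
       for (int i = 0; i < 1000; i++) { assert(get() == (uint32_t) i); inc(); }
       return 0;
   }

Since hi==0 if and only if the counter is 0, count1GetNext() can load eight high bytes at once and use __builtin_ctzl to jump straight to the first live code.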
@@ -429,22 +468,26 @@ struct Encoder {
 
 // job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call)
 struct SIMDjob {
-   u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB)
+   u64 out: 19, pos: 9, end: 18, cur: 18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB)
 };
 
-extern bool
+extern bool
 fsst_hasAVX512(); // runtime check for avx512 capability
 
-extern size_t
+extern size_t
 fsst_compressAVX512(
-   SymbolTable &symbolTable,
-   u8* codeBase,    // IN: base address for codes, i.e. compression output (points to simdbuf+256KB)
-   u8* symbolBase,  // IN: base address for string bytes, i.e. compression input (points to simdbuf)
-   SIMDjob* input,  // IN: input array (size n) with job information: what to encode, where to store it.
-   SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer.
-   size_t n,        // IN: size of arrays input and output (should be max 512)
-   size_t unroll);  // IN: degree of SIMD unrolling
+   SymbolTable &symbolTable,
+   u8 *codeBase,    // IN: base address for codes, i.e. compression output (points to simdbuf+256KB)
+   u8 *symbolBase,  // IN: base address for string bytes, i.e. compression input (points to simdbuf)
+   SIMDjob *input,  // IN: input array (size n) with job information: what to encode, where to store it.
+   SIMDjob *output, // OUT: output array (size n) with job information: how much got encoded, end output pointer.
+   size_t n,        // IN: size of arrays input and output (should be max 512)
+   size_t unroll);  // IN: degree of SIMD unrolling
 
 // C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree)
-size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
-size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd);
+size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut,
+                    u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
+
+size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut,
+                    u8 *strOut[], int simd);
+}
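The SIMDjob bit-field is worth a second look: 19 + 9 + 18 + 18 sums to exactly 64 bits, so one job descriptor fits one SIMD lane. A quick sanity sketch (not part of the patch) of the capacities those widths imply:

   #include <cstdint>
   #include <cstdio>

   struct SIMDjob {                        // same field widths as above
       uint64_t out: 19, pos: 9, end: 18, cur: 18;
   };

   int main() {
       static_assert(sizeof(SIMDjob) == 8, "one 64-bit SIMD lane");
       printf("max output offset : %u (512KB window)\n", (1u << 19) - 1);
       printf("max strings/call  : %u\n", 1u << 9);
       printf("max input offset  : %u (256KB window)\n", (1u << 18) - 1);
       return 0;
   }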
diff --git a/libfsst12.cpp b/libfsst12.cpp
index fa68451..3c86b37 100644
--- a/libfsst12.cpp
+++ b/libfsst12.cpp
@@ -19,23 +19,25 @@
 #include
 #include
 
+namespace libfsst {
 Symbol concat(Symbol a, Symbol b) {
    Symbol s;
-   u32 length = min(8, a.length()+b.length());
+   u32 length = min(8, a.length() + b.length());
    s.set_code_len(FSST_CODE_MASK, length);
-   *(u64*) s.symbol = ((*(u64*) b.symbol) << (8*a.length())) | *(u64*) a.symbol;
+   *(u64 *) s.symbol = ((*(u64 *) b.symbol) << (8 * a.length())) | *(u64 *) a.symbol;
    return s;
 }
+}
 
 namespace std {
-template <>
-class hash<Symbol> {
-  public:
-   size_t operator()(const Symbol& s) const {
-      uint64_t k = *(u64*) s.symbol;
+template<>
+class hash<libfsst::Symbol> {
+public:
+   size_t operator()(const libfsst::Symbol &s) const {
+      uint64_t k = *(u64 *) s.symbol;
       const uint64_t m = 0xc6a4a7935bd1e995;
       const int r = 47;
-      uint64_t h = 0x8445d61a4e774912 ^ (8*m);
+      uint64_t h = 0x8445d61a4e774912 ^ (8 * m);
       k *= m;
       k ^= k >> r;
       k *= m;
@@ -49,41 +51,46 @@ class hash {
 };
 }
 
-std::ostream& operator<<(std::ostream& out, const Symbol& s) {
-   for (u32 i=0; i<s.length(); i++)
-      out << s.symbol[i];
-   return out;
+std::ostream &operator<<(std::ostream &out, const Symbol &s) {
+   for (u32 i = 0; i < s.length(); i++)
+      out << s.symbol[i];
+   return out;
 }
 
-SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& sample, const ulong len[], const u8* line[]) {
-   ulong sampleSize = max(sampleParam, FSST_SAMPLEMAXSZ); // if sampleParam is negative, we need to ignore part of the last line
+SymbolMap *
+buildSymbolMap(Counters &counters, long sampleParam, vector<ulong> &sample, const ulong len[], const u8 *line[]) {
+   ulong sampleSize = max(sampleParam,
+                          FSST_SAMPLEMAXSZ); // if sampleParam is negative, we need to ignore part of the last line
    SymbolMap *st = new SymbolMap(), *bestMap = new SymbolMap();
    long bestGain = -sampleSize; // worst case (everything exception)
    ulong sampleFrac = 128;
 
-      for(ulong i=0; i<sample.size(); i++) {
-         const u8 *cur = line[sample[i]];
-         const u8 *end = cur + len[sample[i]];
-         if (sampleParam < 0 && i+1 == sample.size()) {
-            end += sampleParam; // sampleParam<0: ignore part of the last line
-            if ((end-cur) > 500) end = cur + ((end-cur)*sampleFrac)/128; // shorten long lines to the sample fraction
+      for (ulong i = 0; i < sample.size(); i++) {
+         const u8 *cur = line[sample[i]];
+         const u8 *end = cur + len[sample[i]];
+         if (sampleParam < 0 && i + 1 == sample.size()) {
+            end += sampleParam; // sampleParam<0: ignore part of the last line
+            if ((end - cur) > 500)
+               end = cur + ((end - cur) * sampleFrac) /
+                           128; // shorten long lines to the sample fraction
          } else if (sampleFrac < 128) {
            // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
            if (rnd128(i) > sampleFrac) continue;
@@ -94,32 +101,33 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
          cur += pos1 >> 12;
          pos1 &= FSST_CODE_MASK;
          while (true) {
-            const u8 *old = cur;
+            const u8 *old = cur;
             counters.count1Inc(pos1);
-            if (cur<end-7) {
+            if (cur < end - 7) {
                u64 word = fsst_unaligned_load(cur);
                ulong pos = (u32) word;
-               ulong idx = FSST_HASH(pos)&(st->hashTabSize-1);
+               ulong idx = FSST_HASH(pos) & (st->hashTabSize - 1);
                Symbol s = st->hashTab[idx];
                pos2 = st->shortCodes[word & 0xFFFF];
                word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
-               if ((s.gcl < FSST_GCL_FREE) && (*(u64*) s.symbol == word)) {
-                  pos2 = s.code(); cur += s.length();
+               if ((s.gcl < FSST_GCL_FREE) && (*(u64 *) s.symbol == word)) {
+                  pos2 = s.code();
+                  cur += s.length();
                } else {
                   cur += (pos2 >> 12);
                   pos2 &= FSST_CODE_MASK;
                }
-            } else if (cur==end) {
+            } else if (cur == end) {
                break;
             } else {
-               assert(cur<end);
+               assert(cur < end);
                pos2 = st->findExpansion(Symbol(cur, end));
                cur += pos2 >> 12;
                pos2 &= FSST_CODE_MASK;
             }
 
             // compute compressed output size (later divide by 2)
-            gain += 2*(cur-old)-3;
+            gain += 2 * (cur - old) - 3;
 
            // now count the subsequent two symbols we encode as an extension possibility
            if (sampleFrac < 128) { // no need to count pairs in final round
@@ -129,7 +137,7 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
            }
         }
      }
-     return gain;
+     return gain;
   };
 
   auto makeMap = [&](SymbolMap *st, Counters &counters) {
@@ -138,7 +146,7 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
      auto addOrInc = [&](unordered_set<Symbol> &cands, Symbol s, u32 count) {
         auto it = cands.find(s);
-        s.gain = s.length()*count;
+        s.gain = s.length() * count;
         if (it != cands.end()) {
            s.gain += (*it).gain;
            cands.erase(*it);
@@ -147,7 +155,7 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
      };
 
      // add candidate symbols based on counted frequency
-     for (u32 pos1=0; pos1<st->symbolCount; pos1++) {
+     for (u32 pos1 = 0; pos1 < st->symbolCount; pos1++) {
         u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
        if (!cnt1) continue;
@@ -160,7 +168,7 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
             s1.length() == Symbol::maxLength) { // symbol cannot be extended
           continue;
        }
-        for (u32 pos2=0; pos2<st->symbolCount; pos2++) {
+        for (u32 pos2 = 0; pos2 < st->symbolCount; pos2++) {
           u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
           if (!cnt2) continue;
@@ -172,9 +180,9 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
      }
 
      // insert candidates into priority queue (by gain)
-     auto cmpGn = [](const Symbol& q1, const Symbol& q2) { return q1.gain < q2.gain; };
-     priority_queue<Symbol,vector<Symbol>,decltype(cmpGn)> pq(cmpGn);
-     for (auto& q : cands)
+     auto cmpGn = [](const Symbol &q1, const Symbol &q2) { return q1.gain < q2.gain; };
+     priority_queue<Symbol, vector<Symbol>, decltype(cmpGn)> pq(cmpGn);
+     for (auto &q: cands)
        pq.push(q);
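The candidate-selection pattern above is an ordinary max-heap keyed on gain = length × count. A reduced sketch of the same pattern with hypothetical sample counts (Cand stands in for the Symbol candidates; not the library's actual types):

   #include <cstdint>
   #include <cstdio>
   #include <queue>
   #include <string>
   #include <vector>

   struct Cand { std::string s; uint32_t gain; };

   int main() {
       // hypothetical counts: gain = length * count, as in addOrInc()
       std::vector<Cand> cands = {{"http", 4 * 100}, {"e", 1 * 500}, {"tion", 4 * 80}};
       auto cmp = [](const Cand &a, const Cand &b) { return a.gain < b.gain; };
       std::priority_queue<Cand, std::vector<Cand>, decltype(cmp)> pq(cmp);
       for (auto &c : cands) pq.push(c);
       while (!pq.empty()) {               // pops "e", "http", "tion" -- highest gain first
           printf("%s (gain %u)\n", pq.top().s.c_str(), pq.top().gain);
           pq.pop();
       }
       return 0;
   }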
 
      // Create new symbol map using best candidates
@@ -190,13 +198,14 @@
    for(ulong frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) {
       sampleFrac = frac;
 #else
-   for(sampleFrac=14; true; sampleFrac = sampleFrac + 38) {
+   for (sampleFrac = 14; true; sampleFrac = sampleFrac + 38) {
 #endif
       memset(&counters, 0, sizeof(Counters));
       long gain = compressCount(st, counters);
       if (gain >= bestGain) { // a new best solution!
-         *bestMap = *st; bestGain = gain;
-      }
+         *bestMap = *st;
+         bestGain = gain;
+      }
       if (sampleFrac >= 128) break; // we do 4 rounds (sampleFrac=14,52,90,128)
       makeMap(st, counters);
    }
@@ -205,21 +214,23 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
 }
 
 // optimized adaptive *scalar* compression method
-static inline ulong compressBulk(SymbolMap &symbolMap, ulong nlines, const ulong lenIn[], const u8* strIn[], ulong size, u8* out, ulong lenOut[], u8* strOut[]) {
+static inline ulong
+compressBulk(SymbolMap &symbolMap, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *out,
+             ulong lenOut[], u8 *strOut[]) {
    u8 *lim = out + size;
    ulong curLine;
 
-   for(curLine=0; curLine<nlines; curLine++) {
-      const u8 *cur = strIn[curLine];
-      const u8 *end = cur + lenIn[curLine];
-      strOut[curLine] = out;
-      while (cur+16 <= end && (lim-out) >= 8) {
+   for (curLine = 0; curLine < nlines; curLine++) {
+      const u8 *cur = strIn[curLine];
+      const u8 *end = cur + lenIn[curLine];
+      strOut[curLine] = out;
+
+      while (cur + 16 <= end && (lim - out) >= 8) {
        u64 word = fsst_unaligned_load(cur);
        ulong code = symbolMap.shortCodes[word & 0xFFFF];
        ulong pos = (u32) word; // key is first 4 bytes
-       ulong idx = FSST_HASH(pos)&(symbolMap.hashTabSize-1);
+       ulong idx = FSST_HASH(pos) & (symbolMap.hashTabSize - 1);
        Symbol s = symbolMap.hashTab[idx];
        word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
-       if ((s.gcl < FSST_GCL_FREE) && *(ulong*) s.symbol == word) {
+       if ((s.gcl < FSST_GCL_FREE) && *(ulong *) s.symbol == word) {
          code = s.gcl >> 16;
        }
        cur += (code >> 12);
@@ -227,37 +238,37 @@ static inline ulong compressBulk(SymbolMap &symbolMap, ulong nlines, const ulong
        word = fsst_unaligned_load(cur);
        code = symbolMap.shortCodes[word & 0xFFFF];
        pos = (u32) word; // key is first 4 bytes
-       idx = FSST_HASH(pos)&(symbolMap.hashTabSize-1);
+       idx = FSST_HASH(pos) & (symbolMap.hashTabSize - 1);
        s = symbolMap.hashTab[idx];
        word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
-       if ((s.gcl < FSST_GCL_FREE) && *(ulong*) s.symbol == word) {
-         code = s.gcl >> 16;
+       if ((s.gcl < FSST_GCL_FREE) && *(ulong *) s.symbol == word) {
+         code = s.gcl >> 16;
        }
        cur += (code >> 12);
-       res |= (code&FSST_CODE_MASK) << 12;
+       res |= (code & FSST_CODE_MASK) << 12;
        memcpy(out, &res, sizeof(u64));
-       out += 3;
+       out += 3;
      }
 
      while (cur < end) {
        ulong code = symbolMap.findExpansion(Symbol(cur, end));
-       u32 res = (code&FSST_CODE_MASK);
-       if (out+8 > lim) {
-         return curLine; // u32 write would be out of bounds (out of output memory)
+       u32 res = (code & FSST_CODE_MASK);
+       if (out + 8 > lim) {
+         return curLine; // u32 write would be out of bounds (out of output memory)
        }
        cur += code >> 12;
        if (cur >= end) {
          memcpy(out, &res, sizeof(u64));
-         out += 2;
+         out += 2;
          break;
        }
        code = symbolMap.findExpansion(Symbol(cur, end));
-       res |= (code&FSST_CODE_MASK) << 12;
+       res |= (code & FSST_CODE_MASK) << 12;
        cur += code >> 12;
        memcpy(out, &res, sizeof(u64));
-       out += 3;
-     }
+       out += 3;
+     }
      lenOut[curLine] = out - strOut[curLine];
-   }
+   }
    return curLine;
 }
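compressBulk() sidesteps the awkward 12-bit code width by always emitting codes in pairs: two codes land in the low 24 bits of one word, stored with an overshooting memcpy while the output pointer advances only 3 bytes. A standalone illustration of that packing (assumes a little-endian target, like the fast path above; sizes are shrunk to the minimum):

   #include <cassert>
   #include <cstdint>
   #include <cstring>

   int main() {
       uint32_t code1 = 0x123, code2 = 0xABC;  // two hypothetical 12-bit codes
       uint8_t buf[16] = {0};
       uint8_t *out = buf;

       uint32_t res = code1 | (code2 << 12);   // the pair fits in 24 bits
       memcpy(out, &res, sizeof(uint32_t));    // store wide, claim narrow:
       out += 3;                               // pointer advances only 3 bytes

       assert((buf[0] | (buf[1] << 8) | (buf[2] << 16)) == (int) (code1 | (code2 << 12)));
       return 0;
   }

This is why the loop guards check for 8 spare output bytes even though each step consumes only 2 or 3: the trailing bytes of the wide store are scratch that the next pair overwrites.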
@@ -265,56 +276,57 @@
 long makeSample(vector<ulong> &sample, ulong nlines, const ulong len[]) {
    ulong i, sampleRnd = 1, sampleProb = 256, sampleSize = 0, totSize = 0;
    ulong sampleTarget = FSST_SAMPLETARGET;
 
-   for(i=0; i<nlines; i++)
-      totSize += len[i];
-   if (totSize > FSST_SAMPLETARGET) {
-      // if the batch is larger than the sampletarget, sample this fraction
-      sampleProb = max(((ulong) 4),(256*sampleTarget) / totSize);
+   for (i = 0; i < nlines; i++)
+      totSize += len[i];
+
+   if (totSize > FSST_SAMPLETARGET) {
+      // if the batch is larger than the sampletarget, sample this fraction
+      sampleProb = max(((ulong) 4), (256 * sampleTarget) / totSize);
    } else {
       // too little data. But ok, do not include lines multiple times, just use everything once
-      sampleTarget = totSize; // sampleProb will be 256/256 (aka 100%)
-   }
+      sampleTarget = totSize; // sampleProb will be 256/256 (aka 100%)
+   }
    do { // if nlines is very large and strings are small (< 8, so we need 4K lines), we still expect 4K*256/4 iterations total worst case
-      for(i=0; i<nlines; i++) {
-         sampleRnd = FSST_HASH(sampleRnd);
-         if ((sampleRnd & 255) < sampleProb) {
-            sample.push_back(i);
-            sampleSize += len[i];
-            if (sampleSize >= sampleTarget) // enough?
-               i = nlines; // break out of both loops;
+      for (i = 0; i < nlines; i++) {
+         sampleRnd = FSST_HASH(sampleRnd);
+         if ((sampleRnd & 255) < sampleProb) {
+            sample.push_back(i);
+            sampleSize += len[i];
+            if (sampleSize >= sampleTarget) // enough?
+               i = nlines; // break out of both loops;
          }
       }
       sampleProb *= 4; //accelerate the selection process at expense of front-bias (4,16,64,256: 4 passes max)
-   } while(i <= nlines); // basically continue until we have enough
+   } while (i <= nlines); // basically continue until we have enough
 
    // if the last line (only line?) is excessively long, return a negative samplesize (the amount of front bytes to skip)
    long sampleLong = (long) sampleSize;
    assert(sampleLong > 0);
-   return (sampleLong < FSST_SAMPLEMAXSZ)?sampleLong:FSST_SAMPLEMAXSZ-sampleLong;
+   return (sampleLong < FSST_SAMPLEMAXSZ) ? sampleLong : FSST_SAMPLEMAXSZ - sampleLong;
 }
 
-extern "C" fsst_encoder_t* fsst_create(ulong n, const ulong lenIn[], const u8 *strIn[], int dummy) {
+extern "C" fsst_encoder_t *fsst_create(ulong n, const ulong lenIn[], const u8 *strIn[], int dummy) {
    vector<ulong> sample;
    (void) dummy;
-   long sampleSize = makeSample(sample, n?n:1, lenIn); // careful handling of input to get a right-size and representative sample
+   long sampleSize = makeSample(sample, n ? n : 1,
+                                lenIn); // careful handling of input to get a right-size and representative sample
    Encoder *encoder = new Encoder();
    encoder->symbolMap = shared_ptr<SymbolMap>(buildSymbolMap(encoder->counters, sampleSize, sample, lenIn, strIn));
-   return (fsst_encoder_t*) encoder;
+   return (fsst_encoder_t *) encoder;
 }
 
 /* create another encoder instance, necessary to do multi-threaded encoding using the same dictionary */
-extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) {
+extern "C" fsst_encoder_t *fsst_duplicate(fsst_encoder_t *encoder) {
    Encoder *e = new Encoder();
-   e->symbolMap = ((Encoder*)encoder)->symbolMap; // it is a shared_ptr
-   return (fsst_encoder_t*) e;
+   e->symbolMap = ((Encoder *) encoder)->symbolMap; // it is a shared_ptr
+   return (fsst_encoder_t *) e;
 }
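Since fsst_duplicate() only copies the shared_ptr to the symbol map, the intended multi-threaded pattern is cheap: build one encoder from a sample, then clone one handle per thread. A hedged usage sketch against the C API above (error handling and real data omitted; the shard function and its contents are illustrative, not library code):

   #include <thread>
   #include <vector>
   #include "fsst12.h" // the official FSST API header referenced in this file

   void compress_shard(fsst_encoder_t *enc) {
       // each thread owns its duplicate; all duplicates share the immutable symbol table
       fsst_encoder_t *mine = fsst_duplicate(enc);
       // ... call fsst_compress(mine, ...) on this thread's strings ...
       fsst_destroy(mine);
   }

   int main() {
       const unsigned char *str[2] = {(const unsigned char *) "hello", (const unsigned char *) "hellish"};
       unsigned long len[2] = {5, 7};
       fsst_encoder_t *enc = fsst_create(2, len, str, 0); // dictionary built once, from a sample
       std::vector<std::thread> pool;
       for (int t = 0; t < 4; t++) pool.emplace_back(compress_shard, enc);
       for (auto &th : pool) th.join();
       fsst_destroy(enc);
       return 0;
   }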
- // + // // The version field is now there just for future-proofness, but not used yet - + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction u64 version = (FSST_VERSION << 32) | FSST_ENDIAN_MARKER; // least significant byte is nonzero /* do not assume unaligned reads here */ memcpy(buf, &version, 8); - memcpy(buf+8, e->symbolMap->lenHisto, 16); // serialize the lenHisto + memcpy(buf + 8, e->symbolMap->lenHisto, 16); // serialize the lenHisto u32 pos = 24; - // emit only the used bytes of the symbols - for(u32 i = 0; i < e->symbolMap->symbolCount; i++) { + // emit only the used bytes of the symbols + for (u32 i = 0; i < e->symbolMap->symbolCount; i++) { buf[pos++] = e->symbolMap->symbols[i].length(); - for(u32 j = 0; j < e->symbolMap->symbols[i].length(); j++) { - buf[pos++] = ((u8*) &e->symbolMap->symbols[i].symbol)[j]; // serialize used symbol bytes + for (u32 j = 0; j < e->symbolMap->symbols[i].length(); j++) { + buf[pos++] = ((u8 *) &e->symbolMap->symbols[i].symbol)[j]; // serialize used symbol bytes } } return pos; // length of what was serialized @@ -355,57 +367,66 @@ extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) { // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) memcpy(&version, buf, 8); - if ((version>>32) != FSST_VERSION) return 0; - memcpy(lenHisto, buf+8, 16); + if ((version >> 32) != FSST_VERSION) return 0; + memcpy(lenHisto, buf + 8, 16); - for(u32 i=0; i<8; i++) - symbolCount += lenHisto[i]; + for (u32 i = 0; i < 8; i++) + symbolCount += lenHisto[i]; - for(u32 i = 0; i < symbolCount; i++) { + for (u32 i = 0; i < symbolCount; i++) { u32 len = decoder->len[i] = buf[pos++]; - for(u32 j = 0; j < len; j++) { - ((u8*) &decoder->symbol[i])[j] = buf[pos++]; + for (u32 j = 0; j < len; j++) { + ((u8 *) &decoder->symbol[i])[j] = buf[pos++]; } } // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). 
@@ -355,57 +367,66 @@ extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) {
    // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
    memcpy(&version, buf, 8);
-   if ((version>>32) != FSST_VERSION) return 0;
-   memcpy(lenHisto, buf+8, 16);
+   if ((version >> 32) != FSST_VERSION) return 0;
+   memcpy(lenHisto, buf + 8, 16);
 
-   for(u32 i=0; i<8; i++)
-      symbolCount += lenHisto[i];
+   for (u32 i = 0; i < 8; i++)
+      symbolCount += lenHisto[i];
 
-   for(u32 i = 0; i < symbolCount; i++) {
+   for (u32 i = 0; i < symbolCount; i++) {
       u32 len = decoder->len[i] = buf[pos++];
-      for(u32 j = 0; j < len; j++) {
-         ((u8*) &decoder->symbol[i])[j] = buf[pos++];
+      for (u32 j = 0; j < len; j++) {
+         ((u8 *) &decoder->symbol[i])[j] = buf[pos++];
       }
    }
    // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
-   while(symbolCount<4096) {
-      decoder->symbol[symbolCount] = FSST_CORRUPT;
-      decoder->len[symbolCount++] = 8;
+   while (symbolCount < 4096) {
+      decoder->symbol[symbolCount] = FSST_CORRUPT;
+      decoder->len[symbolCount++] = 8;
    }
    return pos;
 }
 
 // runtime check for simd
-inline ulong _compressImpl(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
+inline ulong _compressImpl(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+                           ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
    (void) noSuffixOpt;
    (void) avoidBranch;
    (void) simd;
    return compressBulk(*e->symbolMap, nlines, lenIn, strIn, size, output, lenOut, strOut);
 }
-ulong compressImpl(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
+
+ulong compressImpl(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+                   ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
    return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
 }
 
 // adaptive choosing of scalar compression method based on symbol length histogram
-inline ulong _compressAuto(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], int simd) {
+inline ulong _compressAuto(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+                           ulong *lenOut, u8 *strOut[], int simd) {
    (void) simd;
    return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, false, false, false);
 }
-ulong compressAuto(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], int simd) {
+
+ulong compressAuto(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+                   ulong *lenOut, u8 *strOut[], int simd) {
    return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
 }
+}
 
 // the main compression function (everything automatic)
-extern "C" ulong fsst_compress(fsst_encoder_t *encoder, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[]) {
+extern "C" ulong
+fsst_compress(fsst_encoder_t *encoder, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+              ulong *lenOut, u8 *strOut[]) {
    // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB)
-   ulong totLen = accumulate(lenIn, lenIn+nlines, 0);
-   int simd = totLen > nlines*12 && (nlines > 64 || totLen > (ulong) 1<<15);
-   return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd);
+   ulong totLen = accumulate(lenIn, lenIn + nlines, 0);
+   int simd = totLen > nlines * 12 && (nlines > 64 || totLen > (ulong) 1 << 15);
+   return _compressAuto((libfsst::Encoder *) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3 * simd);
 }
 
 /* deallocate encoder */
-extern "C" void fsst_destroy(fsst_encoder_t* encoder) {
-   Encoder *e = (Encoder*) encoder;
+extern "C" void fsst_destroy(fsst_encoder_t *encoder) {
   libfsst::Encoder *e = (libfsst::Encoder *) encoder;
    delete e;
 }
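For completeness, the scatter/gather calling convention of fsst_compress(): parallel input arrays, one output arena, and parallel output arrays that the call fills in. A hedged usage sketch (buffer sizing simplified; real callers must check the return value, which says how many lines actually fit):

   #include <vector>
   #include "fsst12.h" // assumed to declare fsst_encoder_t and fsst_compress

   unsigned long compress_two(fsst_encoder_t *enc) {
       const unsigned char *strIn[2] = {(const unsigned char *) "tumcwitumvldb",
                                        (const unsigned char *) "tumcwitumvldb"};
       unsigned long lenIn[2] = {13, 13};
       std::vector<unsigned char> out(1 << 16); // generous output arena
       unsigned long lenOut[2];                 // filled in: compressed length per line
       unsigned char *strOut[2];                // filled in: start of each compressed line in 'out'
       return fsst_compress(enc, 2, lenIn, strIn, out.size(), out.data(), lenOut, strOut);
   }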
@@ -415,6 +436,9 @@ extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) {
    u32 cnt1 = fsst_export(encoder, buf);
    fsst_decoder_t decoder;
    u32 cnt2 = fsst_import(&decoder, buf);
-   assert(cnt1 == cnt2); (void) cnt1; (void) cnt2;
+   assert(cnt1 == cnt2);
+   (void) cnt1;
+   (void) cnt2;
    return decoder;
 }
+
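A fsst_decoder_t is then just two flat arrays (len[] and symbol[]), so expanding a 12-bit code needs no hashing at all. A sketch of the core expand step under that layout (sketch only: it assumes the caller already split the 3-byte stream into codes and that the output buffer has 8 spare bytes, as the library's own fast paths do):

   #include <cstdint>
   #include <cstring>
   #include "fsst12.h" // assumed to define fsst_decoder_t with len[] and symbol[]

   // Expand one 12-bit code into the output buffer.
   inline unsigned char *expand(const fsst_decoder_t &d, uint32_t code, unsigned char *out) {
       memcpy(out, &d.symbol[code], 8); // unconditional 8-byte store
       return out + d.len[code];        // but only len[code] bytes are actually claimed
   }

Unused codes decode to the "corrupt" filler installed by fsst_import(), which is what makes damaged code sequences detectable.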
diff --git a/libfsst12.hpp b/libfsst12.hpp
index 6a88941..0ac5358 100644
--- a/libfsst12.hpp
+++ b/libfsst12.hpp
@@ -36,7 +36,7 @@ using namespace std;
 #include "fsst12.h" // the official FSST API -- also usable by C mortals
 
 /* workhorse type for string and buffer lengths: 64-bits on 64-bits platforms and 32-bits on 32-bits platforms */
-typedef unsigned long ulong;
+typedef unsigned long ulong;
 
 /* unsigned integers */
 typedef uint8_t u8;
@@ -51,12 +51,14 @@ typedef uint64_t u64;
 // "symbols" are character sequences (up to 8 bytes)
 // A symbol is compressed into a "code" of 1.5 bytes (12 bits)
 #define FSST_CODE_MAX 4096
-#define FSST_CODE_MASK ((u16) (FSST_CODE_MAX-1))
+#define FSST_CODE_MASK ((u16) (FSST_CODE_MAX-1))
 
-inline uint64_t fsst_unaligned_load(u8 const* V) {
-   uint64_t Ret;
-   memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible)
-   return Ret;
+namespace libfsst {
+
+inline uint64_t fsst_unaligned_load(u8 const *V) {
+   uint64_t Ret;
+   memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible)
+   return Ret;
 }
 
 struct Symbol {
@@ -67,39 +69,54 @@ struct Symbol {
    mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of Symbols
 
    // the byte sequence that this symbol stands for
-   u8 symbol[maxLength];
+   u8 symbol[maxLength];
 
    Symbol() : gcl(0) {}
 
-   explicit Symbol(u8 c, u16 code) : gcl((1<<28)|(code<<16)|7) { *(u64*) symbol = c; } // single-char symbol
-   explicit Symbol(const char* input, u32 len) {
+   explicit Symbol(u8 c, u16 code) : gcl((1 << 28) | (code << 16) | 7) { *(u64 *) symbol = c; } // single-char symbol
+   explicit Symbol(const char *input, u32 len) {
       if (len < 8) {
-         *(u64*) symbol = 0;
-         for(u32 i=0; i<len; i++) symbol[i] = input[i];
+         *(u64 *) symbol = 0;
+         for (u32 i = 0; i < len; i++) symbol[i] = input[i];
       } else {
-         len = 8;
-         *(u64*) symbol = *(u64*) input;
+         len = 8;
+         *(u64 *) symbol = *(u64 *) input;
       }
       set_code_len(FSST_CODE_MASK, len);
    }
 
-   void set_code_len(u32 code, u32 len) { gcl = (len<<28)|(code<<16)|((8-len)*8); }
+   void set_code_len(u32 code, u32 len) { gcl = (len << 28) | (code << 16) | ((8 - len) * 8); }
 
-   u32 length() const { return gcl >> 28; }
+   u32 length() const { return gcl >> 28; }
+
    u16 code() const { return (gcl >> 16) & FSST_CODE_MASK; }
+
    u8 garbageBits() const { return gcl; }
 
-   u8 first() const { return 0xFF & *(u64*) symbol; }
-   u16 first2() const { assert(length() > 1); return (0xFFFF & *(u64*) symbol); }
+   u8 first() const { return 0xFF & *(u64 *) symbol; }
+
+   u16 first2() const {
+      assert(length() > 1);
+      return (0xFFFF & *(u64 *) symbol);
+   }
 
 #define FSST_HASH_LOG2SIZE 14
-#define FSST_HASH_SHIFT 15
+#define FSST_HASH_SHIFT 15
 #define FSST_HASH_PRIME1 2971215073LL
 #define FSST_HASH(w) (((w)*FSST_HASH_PRIME1)^(((w)*FSST_HASH_PRIME1)>>13))
-   ulong hash() const { uint v0 = 0xFFFFFFFF & *(ulong*) symbol; return FSST_HASH(v0); }
-   bool operator==(const Symbol& other) const { return *(u64*) symbol == *(u64*) other.symbol && length() == other.length(); }
+
+   ulong hash() const {
+      uint v0 = 0xFFFFFFFF & *(ulong *) symbol;
+      return FSST_HASH(v0);
+   }
+
+   bool operator==(const Symbol &other) const {
+      return *(u64 *) symbol == *(u64 *) other.symbol && length() == other.length();
+   }
 };
 
 // during search for the best dictionary, we probe both (in this order, first wins):
@@ -117,13 +134,13 @@ struct Symbol {
 // the gain field is only used in the symbol queue that sorts symbols on gain
 struct SymbolMap {
-   static const u32 hashTabSize = 1<<FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
+   static const u32 hashTabSize = 1 << FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
 
    u16 shortCodes[65536];       // contains code for 2-byte symbol, otherwise code for pseudo byte
    u16 symbolCount;             // amount of symbols in the map (max 4096)
    Symbol symbols[4096];        // symbols[code] is the symbol for a given code
    Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes
    u16 lenHisto[8];             // lenHisto[x] is the amount of symbols of byte-length (x+1)
 
    bool hashInsert(Symbol s) {
       ulong idx = s.hash() & (hashTabSize - 1);
       bool taken = (hashTab[idx].gcl < FSST_GCL_FREE);
       if (taken) return false; // collision in hash table
       hashTab[idx].gcl = s.gcl;
-      *(u64*) hashTab[idx].symbol = (*(u64*) s.symbol) & (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
+      *(u64 *) hashTab[idx].symbol = (*(u64 *) s.symbol) & (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
       return true;
    }
+
    bool add(Symbol s) {
       assert(symbolCount < 4096);
       u32 len = s.length();
@@ -185,24 +203,26 @@ struct SymbolMap {
          return false;
       }
       symbols[symbolCount++] = s;
-      lenHisto[len-1]++;
+      lenHisto[len - 1]++;
       return true;
    }
+
    /// Find symbol in hash table, return code
    u16 hashFind(Symbol s) const {
-      ulong idx = s.hash() & (hashTabSize-1);
-      if (hashTab[idx].gcl <= s.gcl &&
-         *(u64*) hashTab[idx].symbol == (*(u64*) s.symbol & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].gcl))))
-         return (hashTab[idx].gcl>>16); // matched a long symbol
+      ulong idx = s.hash() & (hashTabSize - 1);
+      if (hashTab[idx].gcl <= s.gcl &&
+          *(u64 *) hashTab[idx].symbol == (*(u64 *) s.symbol & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].gcl))))
+         return (hashTab[idx].gcl >> 16); // matched a long symbol
      return 0;
    }
+
    /// Find longest expansion, return code
    u16 findExpansion(Symbol s) const {
-      if (s.length() == 1) {
-         return 4096 + s.first();
+      if (s.length() == 1) {
+         return 4096 + s.first();
      }
      u16 ret = hashFind(s);
-      return ret?ret:shortCodes[s.first2()];
+      return ret ? ret : shortCodes[s.first2()];
    }
 };
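Note the convention findExpansion() relies on: the upper 4 bits of the returned value carry how many input bytes the match consumes, and a single-byte fallback comes back as pseudo code 4096 + byte, so callers can do code >> 12 and code & FSST_CODE_MASK uniformly. In sketch form (values hypothetical, FSST_CODE_MASK written as 4095):

   #include <cassert>
   #include <cstdint>

   int main() {
       // single-byte fallback: pseudo code 4096 + byte value
       uint16_t code = 4096 + 'x';      // as returned for a 1-byte match
       assert((code >> 12) == 1);       // caller advances cur by 1
       assert((code & 4095) == 'x');    // and emits the pseudo code for 'x'

       // a hypothetical 5-byte symbol with code 71 would come back as (5 << 12) | 71
       code = (5 << 12) | 71;
       assert((code >> 12) == 5 && (code & 4095) == 71);
       return 0;
   }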
@@ -235,6 +255,7 @@ struct Counters {
    }
 };
 #else
+
 // we keep two counters count1[pos] and count2[pos1][pos2] of resp 16 and 12-bits. Both are split into two columns for performance reasons
 // first reason is to make the column we update the most during symbolTable construction (the low bits) thinner, thus reducing CPU cache pressure.
 // second reason is that when scanning the array, after seeing a 64-bits 0 in the high bits column, we can quickly skip over many codes (15 or 7)
@@ -242,28 +263,33 @@ struct Counters {
 // high arrays come before low arrays, because our GetNext() methods may overrun their 64-bits reads a few bytes
    u8 count1High[FSST_CODE_MAX]; // array to count frequency of symbols as they occur in the sample (16-bits)
    u8 count1Low[FSST_CODE_MAX];  // it is split into a low and a high byte: cnt = count1High*256 + count1Low
-   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
+   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX /
+                                2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
    u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX]; // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2)
    // 385KB -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB)
-
-   void count1Set(u32 pos1, u16 val) {
-      count1Low[pos1] = val&255;
-      count1High[pos1] = val>>8;
+
+   void count1Set(u32 pos1, u16 val) {
+      count1Low[pos1] = val & 255;
+      count1High[pos1] = val >> 8;
    }
-   void count1Inc(u32 pos1) {
+
+   void count1Inc(u32 pos1) {
      if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
        count1High[pos1]++;  //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)...
    }
-   void count2Inc(u32 pos1, u32 pos2) {
-      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
-         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively
-         count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample)
+
+   void count2Inc(u32 pos1, u32 pos2) {
+      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
+         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, respectively
+         count2High[pos1][(pos2) >> 1] +=
+            1 << (((pos2) & 1) << 2); // we take our chances with overflow.. (4K maxval, on an 8K sample)
    }
+
    u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range
       // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
-      u64 high = *(u64*) &count1High[pos1]; // note: this reads 8 subsequent counters [pos1..pos1+7]
+      u64 high = *(u64 *) &count1High[pos1]; // note: this reads 8 subsequent counters [pos1..pos1+7]
 
-      u32 zero = high?(__builtin_ctzl(high)>>3):7; // number of zero bytes
+      u32 zero = high ? (__builtin_ctzl(high) >> 3) : 7; // number of zero bytes
       high = (high >> (zero << 3)) & 255; // advance to nonzero counter
       if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos1
          return 0; // all zero
@@ -272,12 +298,13 @@ struct Counters {
       if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
       return (high << 8) + low;
    }
+
    u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range
       // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros
-      u64 high = *(u64*) &count2High[pos1][pos2>>1]; // note: this reads 16 subsequent counters [pos2..pos2+15]
-      high >>= (pos2&1) << 2; // odd pos2: ignore the lowest 4 bits & we see only 15 counters
+      u64 high = *(u64 *) &count2High[pos1][pos2 >> 1]; // note: this reads 16 subsequent counters [pos2..pos2+15]
+      high >>= (pos2 & 1) << 2; // odd pos2: ignore the lowest 4 bits & we see only 15 counters
 
-      u32 zero = high?(__builtin_ctzl(high)>>2):(15-(pos2&1)); // number of zero 4-bits counters
+      u32 zero = high ? (__builtin_ctzl(high) >> 2) : (15 - (pos2 & 1)); // number of zero 4-bits counters
       high = (high >> (zero << 2)) & 15; // advance to nonzero counter
       if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2
          return 0; // all zero
@@ -286,15 +313,18 @@ struct Counters {
       if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
       return (high << 8) + low;
    }
+
    void backup1(u8 *buf) {
       memcpy(buf, count1High, FSST_CODE_MAX);
-      memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
+      memcpy(buf + FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
    }
+
    void restore1(u8 *buf) {
       memcpy(count1High, buf, FSST_CODE_MAX);
-      memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX);
+      memcpy(count1Low, buf + FSST_CODE_MAX, FSST_CODE_MAX);
    }
-};
+};
+
 #endif
 
 // an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression
@@ -306,5 +336,11 @@ struct Encoder {
 };
 
 // C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree)
-ulong compressImpl(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 * output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
-ulong compressAuto(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 * output, ulong *lenOut, u8 *strOut[], int simd);
+ulong
+compressImpl(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[],
+             bool noSuffixOpt, bool avoidBranch, int simd);
+
+ulong
+compressAuto(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[],
+             int simd);
+}