diff --git a/fsst.cpp b/fsst.cpp index e708482..d113184 100644 --- a/fsst.cpp +++ b/fsst.cpp @@ -16,16 +16,20 @@ // // You can contact the authors via the FSST source repository : https://github.com/cwida/fsst #ifdef FSST12 + #include "fsst12.h" // the official FSST API -- also usable by C mortals + #else #include "fsst.h" // the official FSST API -- also usable by C mortals #endif + #include #include #include #include #include #include + using namespace std; // Utility to compress and decompress (-d) data with FSST (using stdin and stdout). @@ -45,48 +49,54 @@ using namespace std; namespace { class BinarySemaphore { - private: +private: mutex m; condition_variable cv; bool value; - public: +public: explicit BinarySemaphore(bool initialValue = false) : value(initialValue) {} + void wait() { unique_lock lock(m); while (!value) cv.wait(lock); value = false; } + void post() { - { unique_lock lock(m); value = true; } + { + unique_lock lock(m); + value = true; + } cv.notify_one(); } }; bool stopThreads = false; BinarySemaphore srcDoneIO[2], dstDoneIO[2], srcDoneCPU[2], dstDoneCPU[2]; -unsigned char *srcBuf[2] = { NULL, NULL }; -unsigned char *dstBuf[2] = { NULL, NULL }; -unsigned char *dstMem[2] = { NULL, NULL }; -size_t srcLen[2] = { 0, 0 }; -size_t dstLen[2] = { 0, 0 }; +unsigned char *srcBuf[2] = {NULL, NULL}; +unsigned char *dstBuf[2] = {NULL, NULL}; +unsigned char *dstMem[2] = {NULL, NULL}; +size_t srcLen[2] = {0, 0}; +size_t dstLen[2] = {0, 0}; #define FSST_MEMBUF (1ULL<<22) int decompress = 0; -size_t blksz = FSST_MEMBUF-(1+FSST_MAXHEADER/2); // block size of compression (max compressed size must fit 3 bytes) +size_t blksz = + FSST_MEMBUF - (1 + FSST_MAXHEADER / 2); // block size of compression (max compressed size must fit 3 bytes) #define DESERIALIZE(p) (((unsigned long long) (p)[0]) << 16) | (((unsigned long long) (p)[1]) << 8) | ((unsigned long long) (p)[2]) -#define SERIALIZE(l,p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; } +#define SERIALIZE(l, p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; } -void reader(ifstream& src) { - for(int swap=0; true; swap = 1-swap) { +void reader(ifstream &src) { + for (int swap = 0; true; swap = 1 - swap) { srcDoneCPU[swap].wait(); if (stopThreads) break; - src.read((char*) srcBuf[swap], blksz); + src.read((char *) srcBuf[swap], blksz); srcLen[swap] = (unsigned long) src.gcount(); if (decompress) { if (blksz && srcLen[swap] == blksz) { - blksz = DESERIALIZE(srcBuf[swap]+blksz-3); // read size of next block + blksz = DESERIALIZE(srcBuf[swap] + blksz - 3); // read size of next block srcLen[swap] -= 3; // cut off size bytes } else { blksz = 0; @@ -96,33 +106,33 @@ void reader(ifstream& src) { } } -void writer(ofstream& dst) { - for(int swap=0; true; swap = 1-swap) { +void writer(ofstream &dst) { + for (int swap = 0; true; swap = 1 - swap) { dstDoneCPU[swap].wait(); if (!dstLen[swap]) break; - dst.write((char*) dstBuf[swap], dstLen[swap]); + dst.write((char *) dstBuf[swap], dstLen[swap]); dstDoneIO[swap].post(); } - for(int swap=0; swap<2; swap++) + for (int swap = 0; swap < 2; swap++) dstDoneIO[swap].post(); } } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { size_t srcTot = 0, dstTot = 0; if (argc < 2 || argc > 4 || (argc == 4 && (argv[1][0] != '-' || argv[1][1] != 'd' || argv[1][2]))) { cerr << "usage: " << argv[0] << " -d infile outfile" << endl; - cerr << " " << argv[0] << " infile outfile" << endl; - cerr << " " << argv[0] << " infile" << endl; + cerr << " " << argv[0] << " 
infile outfile" << endl; + cerr << " " << argv[0] << " infile" << endl; return -1; } decompress = (argc == 4); - string srcfile(argv[1+decompress]), dstfile; + string srcfile(argv[1 + decompress]), dstfile; if (argc == 2) { dstfile = srcfile + ".fsst"; } else { - dstfile = argv[2+decompress]; + dstfile = argv[2 + decompress]; } ifstream src; ofstream dst; @@ -132,28 +142,28 @@ int main(int argc, char* argv[]) { dst.exceptions(ios_base::badbit); src.exceptions(ios_base::badbit); if (decompress) { - unsigned char tmp[3]; - src.read((char*) tmp, 3); - if (src.gcount() != 3) { - cerr << "failed to open input." << endl; - return -1; - } - blksz = DESERIALIZE(tmp); // read first block size + unsigned char tmp[3]; + src.read((char *) tmp, 3); + if (src.gcount() != 3) { + cerr << "failed to open input." << endl; + return -1; + } + blksz = DESERIALIZE(tmp); // read first block size } - vector buffer(FSST_MEMBUF*6); + vector buffer(FSST_MEMBUF * 6); srcBuf[0] = buffer.data(); - srcBuf[1] = srcBuf[0] + (FSST_MEMBUF*(1ULL+decompress)); - dstMem[0] = srcBuf[1] + (FSST_MEMBUF*(1ULL+decompress)); - dstMem[1] = dstMem[0] + (FSST_MEMBUF*(2ULL-decompress)); + srcBuf[1] = srcBuf[0] + (FSST_MEMBUF * (1ULL + decompress)); + dstMem[0] = srcBuf[1] + (FSST_MEMBUF * (1ULL + decompress)); + dstMem[1] = dstMem[0] + (FSST_MEMBUF * (2ULL - decompress)); - for(int swap=0; swap<2; swap++) { + for (int swap = 0; swap < 2; swap++) { srcDoneCPU[swap].post(); // input buffer is not being processed initially dstDoneIO[swap].post(); // output buffer is not being written initially } - thread readerThread([&src]{ reader(src); }); - thread writerThread([&dst]{ writer(dst); }); + thread readerThread([&src] { reader(src); }); + thread writerThread([&dst] { writer(dst); }); - for(int swap=0; true; swap = 1-swap) { + for (int swap = 0; true; swap = 1 - swap) { srcDoneIO[swap].wait(); // wait until input buffer is available (i.e. 
done reading) dstDoneIO[swap].wait(); // wait until output buffer is ready writing hence free for use if (srcLen[swap] == 0) { @@ -161,33 +171,37 @@ int main(int argc, char* argv[]) { break; } if (decompress) { - fsst_decoder_t decoder; - size_t hdr = fsst_import(&decoder, srcBuf[swap]); - dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, dstBuf[swap] = dstMem[swap]); + fsst_decoder_t decoder; + size_t hdr = fsst_import(&decoder, srcBuf[swap]); + dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, + dstBuf[swap] = dstMem[swap]); } else { unsigned char tmp[FSST_MAXHEADER]; - fsst_encoder_t* encoder = fsst_create(1, &srcLen[swap], const_cast(&srcBuf[swap]), 0); + fsst_encoder_t *encoder = fsst_create(1, &srcLen[swap], const_cast(&srcBuf[swap]), + 0); size_t hdr = fsst_export(encoder, tmp); if (fsst_compress(encoder, 1, &srcLen[swap], const_cast(&srcBuf[swap]), FSST_MEMBUF * 2, dstMem[swap] + FSST_MAXHEADER + 3, &dstLen[swap], &dstBuf[swap]) < 1) return -1; dstLen[swap] += 3 + hdr; - dstBuf[swap] -= 3 + hdr; - SERIALIZE(dstLen[swap],dstBuf[swap]); // block starts with size - copy(tmp, tmp+hdr, dstBuf[swap]+3); // then the header (followed by the compressed bytes which are already there) - fsst_destroy(encoder); + dstBuf[swap] -= 3 + hdr; + SERIALIZE(dstLen[swap], dstBuf[swap]); // block starts with size + copy(tmp, tmp + hdr, + dstBuf[swap] + 3); // then the header (followed by the compressed bytes which are already there) + fsst_destroy(encoder); } srcTot += srcLen[swap]; dstTot += dstLen[swap]; srcDoneCPU[swap].post(); // input buffer may be re-used by the reader for the next block dstDoneCPU[swap].post(); // output buffer is ready for writing out } - cerr << (decompress?"Dec":"C") << "ompressed " << srcTot << " bytes into " << dstTot << " bytes ==> " << (int) ((100*dstTot)/srcTot) << "%" << endl; + cerr << (decompress ? "Dec" : "C") << "ompressed " << srcTot << " bytes into " << dstTot << " bytes ==> " + << (int) ((100 * dstTot) / srcTot) << "%" << endl; // force wait until all background writes finished stopThreads = true; - for(int swap=0; swap<2; swap++) { + for (int swap = 0; swap < 2; swap++) { srcDoneCPU[swap].post(); dstDoneCPU[swap].post(); } diff --git a/fsst_avx512.cpp b/fsst_avx512.cpp index a2b7b5e..9ce0ab4 100644 --- a/fsst_avx512.cpp +++ b/fsst_avx512.cpp @@ -21,23 +21,30 @@ #include #ifdef _WIN32 +namespace libfsst { bool fsst_hasAVX512() { int info[4]; __cpuidex(info, 0x00000007, 0); return (info[1]>>16)&1; } +} #else #include -bool fsst_hasAVX512() { - int info[4]; - __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); - return (info[1]>>16)&1; +namespace libfsst { + bool fsst_hasAVX512() { + int info[4]; + __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); + return (info[1] >> 16) & 1; + } } #endif #else +namespace libfsst { bool fsst_hasAVX512() { return false; } +} #endif +namespace libfsst { // BULK COMPRESSION OF STRINGS // // In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. @@ -70,14 +77,15 @@ bool fsst_hasAVX512() { return false; } // This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. // Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. 
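//
// A hedged sketch of the caller contract (it mirrors the call site in compressSIMD()
// in libfsst.cpp from this same patch; no names below are new):
//
//   SIMDjob input[512], output[512];  // cur/end: string offsets, pos: job id, out: output offset
//   size_t done = fsst_compressAVX512(symbolTable, codeBase, symbolBase, input, output, n, 3);
//   for (; done < n; done++)          // the kernel may return early (processed <= n)
//      output[done] = input[done];    // unfinished jobs fall through to scalar finishing
//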
-size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { - size_t processed = 0; - // define some constants (all_x means that all 8 lanes contain 64-bits value X) + size_t fsst_compressAVX512(SymbolTable &symbolTable, u8 *codeBase, u8 *symbolBase, SIMDjob *input, SIMDjob *output, + size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) #ifdef __AVX512F__ - //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c - __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); - __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); - __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); #define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) #define all_ONE _mm512_srli_epi64(all_MASK, 63) #define all_M19 _mm512_srli_epi64(all_MASK, 45) @@ -87,54 +95,56 @@ size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBas #define all_FFFF _mm512_srli_epi64(all_MASK, 48) #define all_FF _mm512_srli_epi64(all_MASK, 56) - SIMDjob *inputEnd = input+n; - assert(n >= unroll*8 && n <= 512); // should be close to 512 - __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 - __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 2b loaded new strings bitmask per unroll - u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll - if (unroll >= 4) { - while (input+delta1+delta2+delta3+delta4 < inputEnd) { - #include "fsst_avx512_unroll4.inc" - } - } else if (unroll == 3) { - while (input+delta1+delta2+delta3 < inputEnd) { - #include "fsst_avx512_unroll3.inc" - } - } else if (unroll == 2) { - while (input+delta1+delta2 < inputEnd) { - #include "fsst_avx512_unroll2.inc" - } - } else { - while (input+delta1 < inputEnd) { - #include "fsst_avx512_unroll1.inc" - } - } + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { +#include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { +#include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { +#include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { +#include "fsst_avx512_unroll1.inc" + } + } - // flush the job states of the unfinished strings at the 
end of output[] - processed = n - (inputEnd - input); - u32 unfinished = 0; - if (unroll > 1) { - if (unroll > 2) { - if (unroll > 3) { - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); - unfinished += _mm_popcnt_u32((int) loadmask4); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); - unfinished += _mm_popcnt_u32((int) loadmask3); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); - unfinished += _mm_popcnt_u32((int) loadmask2); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); #else - (void) symbolTable; - (void) codeBase; - (void) symbolBase; - (void) input; - (void) output; - (void) n; - (void) unroll; + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; #endif - return processed; + return processed; + } } + diff --git a/libfsst.cpp b/libfsst.cpp index 5285173..d6836bc 100644 --- a/libfsst.cpp +++ b/libfsst.cpp @@ -17,20 +17,23 @@ // You can contact the authors via the FSST source repository : https://github.com/cwida/fsst #include "libfsst.hpp" -Symbol concat(Symbol a, Symbol b) { - Symbol s; - u32 length = a.length()+b.length(); - if (length > Symbol::maxLength) length = Symbol::maxLength; - s.set_code_len(FSST_CODE_MASK, length); - s.val.num = (b.val.num << (8*a.length())) | a.val.num; - return s; +namespace libfsst { + Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length() + b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.val.num = (b.val.num << (8 * a.length())) | a.val.num; + return s; } +} + namespace std { template <> -class hash { +class hash { public: - size_t operator()(const QSymbol& q) const { + size_t operator()(const libfsst::QSymbol& q) const { uint64_t k = q.symbol.val.num; const uint64_t m = 0xc6a4a7935bd1e995; const int r = 47; @@ -48,572 +51,605 @@ class hash { }; } -bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } +namespace libfsst { + + bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + + std::ostream &operator<<(std::ostream &out, const Symbol &s) { + for (u32 i = 0; i < s.length(); i++) + out << s.val.str[i]; + return out; + } + + SymbolTable * + buildSymbolTable(Counters &counters, vector line, const size_t len[], bool zeroTerminated = false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for (size_t i = 0; i < line.size(); i++) { + const u8 *cur = line[i]; + const u8 *end = cur + len[i]; + while (cur < end) byteHisto[*cur++]++; + } + u32 minSize = FSST_SAMPLEMAXSZ, i = st->terminator = 256; + while (i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); -std::ostream& operator<<(std::ostream& out, const Symbol& s) { - for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { - SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); - int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) - size_t sampleFrac = 128; - - // start by determining the terminator. We use the (lowest) most infrequent byte as terminator - st->zeroTerminated = zeroTerminated; - if (zeroTerminated) { - st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency - } else { - u16 byteHisto[256]; - memset(byteHisto, 0, sizeof(byteHisto)); - for(size_t i=0; iterminator = 256; - while(i-- > 0) { - if (byteHisto[i] > minSize) continue; - st->terminator = i; - minSize = byteHisto[i]; - } - } - assert(st->terminator != 256); - - // a random number between 0 and 128 - auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; - - // compress sample, and compute (pair-)frequencies - auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain - int gain = 0; - - for(size_t i=0; i sampleFrac) continue; - } - if (cur < end) { - u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); - cur += st->symbols[code1].length(); - gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); - while (true) { - // count single symbol (i.e. an option is not extending it) - counters.count1Inc(code1); - - // as an alternative, consider just using the next byte.. - if (st->symbols[code1].length() != 1) // .. but do not count single byte symbols doubly - counters.count1Inc(*start); - - if (cur==end) { - break; - } - - // now match a new symbol - start = cur; - if (curhashTabSize-1); - Symbol s = st->hashTab[idx]; - code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; - word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) { - code2 = s.code(); - cur += s.length(); - } else if (code2 >= FSST_CODE_BASE) { - cur += 2; - } else { - code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; - cur += 1; - } - } else { - code2 = st->findLongestSymbol(cur, end); - cur += st->symbols[code2].length(); - } - - // compute compressed output size - gain += ((int) (cur-start))-(1+isEscapeCode(code2)); - - if (sampleFrac < 128) { // no need to count pairs in final round - // consider the symbol that is the concatenation of the two last symbols - counters.count2Inc(code1, code2); - - // as an alternative, consider just extending with the next byte.. 
- if ((cur-start) > 1) // ..but do not count single byte extensions doubly - counters.count2Inc(code1, *start); - } - code1 = code2; - } - } - } - return gain; - }; - - auto makeTable = [&](SymbolTable *st, Counters &counters) { - // hashmap of c (needed because we can generate duplicate candidates) - unordered_set cands; - - // artificially make terminater the most frequent symbol so it gets included - u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; - counters.count1Set(terminator,65535); - - auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { - if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! - QSymbol q; - q.symbol = s; - q.gain = count * s.length(); - auto it = cands.find(q); - if (it != cands.end()) { - q.gain += (*it).gain; - cands.erase(*it); - } - cands.insert(q); - }; - - // add candidate symbols based on counted frequency - for (u32 pos1=0; pos1nSymbols; pos1++) { - u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! - if (!cnt1) continue; - - // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed - Symbol s1 = st->symbols[pos1]; - addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); - - if (sampleFrac >= 128 || // last round we do not create new (combined) symbols - s1.length() == Symbol::maxLength || // symbol cannot be extended - s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte - continue; - } - for (u32 pos2=0; pos2nSymbols; pos2++) { - u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! - if (!cnt2) continue; - - // create a new symbol - Symbol s2 = st->symbols[pos2]; - Symbol s3 = concat(s1, s2); - if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte - addOrInc(cands, s3, cnt2); - } - } - - // insert candidates into priority queue (by gain) - auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); }; - priority_queue,decltype(cmpGn)> pq(cmpGn); - for (auto& q : cands) - pq.push(q); - - // Create new symbol map using best candidates - st->clear(); - while (st->nSymbols < 255 && !pq.empty()) { - QSymbol q = pq.top(); - pq.pop(); - st->add(q.symbol); - } - }; - - u8 bestCounters[512*sizeof(u16)]; -#ifdef NONOPT_FSST - for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { - sampleFrac = frac; -#else - for(sampleFrac=8; true; sampleFrac += 30) { -#endif - memset(&counters, 0, sizeof(Counters)); - long gain = compressCount(st, counters); - if (gain >= bestGain) { // a new best solution! 
- counters.backup1(bestCounters); - *bestTable = *st; bestGain = gain; - } - if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) - makeTable(st, counters); - } - delete st; - counters.restore1(bestCounters); - makeTable(bestTable, counters); - bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression - return bestTable; -} + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; -static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { - size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; - u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings - SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer - SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) - size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) - - while (curLine < nlines && outOff <= (1<<19)) { - size_t prevLine = curLine, chunk, curOff = 0; - - // bail out if the output buffer cannot hold the compressed next string fully - if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 - else budget -= (len[curLine]-curOff)*2; - - strOut[curLine] = (u8*) 0; - lenOut[curLine] = 0; - - do { - do { - chunk = len[curLine] - curOff; - if (chunk > 511) { - chunk = 511; // large strings need to be chopped up into segments of 511 bytes - } - // create a job in this batch - SIMDjob job; - job.cur = inOff; - job.end = job.cur + chunk; - job.pos = batchPos; - job.out = outOff; - - // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) - outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes. - if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk - - // register job in this batch - input[batchPos] = job; - jobLine[batchPos] = curLine; - - if (chunk == 0) { - empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out - } else { - // copy string chunk into temp buffer - memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); - inOff += chunk; - curOff += chunk; - symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + for (size_t i = 0; i < line.size(); i++) { + const u8 *cur = line[i], *start = cur; + const u8 *end = cur + len[i]; + + if (sampleFrac < 128) { + // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x) + if (rnd128(i) > sampleFrac) continue; + } + if (cur < end) { + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length() - (1 + isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur == end) { + break; + } + + // now match a new symbol + start = cur; + if (cur < end - 7) { + u64 word = fsst_unaligned_load(cur); + size_t code = word & 0xFFFFFF; + size_t idx = FSST_HASH(code) & (st->hashTabSize - 1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur - start)) - (1 + isEscapeCode(code2)); + + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur - start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } } - if (++batchPos == 512) break; - } while(curOff < len[curLine]); - - if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines)) { // cannot accumulate more? - if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) - // radix-sort jobs on length (longest string first) - // -- this provides best load balancing and allows to skip empty jobs at the end - u16 sortpos[513]; - memset(sortpos, 0, sizeof(sortpos)); - - // calculate length histo - for(size_t i=0; i cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols ? FSST_CODE_BASE : st->terminator; + counters.count1Set(terminator, 65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5 * sampleFrac) / 128) + return; // improves both compression speed (less candidates), but also quality!! + QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); + if (it != cands.end()) { + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1 = 0; pos1 < FSST_CODE_BASE + (size_t) st->nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length() == 1) ? 8LL : 1LL) * cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2 = 0; pos2 < FSST_CODE_BASE + (size_t) st->nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); } - // finally.. 
SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..) - for(size_t done = fsst_compressAVX512(symbolTable, codeBase, symbolBase, inputOrdered, output, batchPos-empty, unroll); - done < batchPos; done++) output[done] = inputOrdered[done]; - } else { - memcpy(output, input, batchPos*sizeof(SIMDjob)); } - - // finish encoding (unfinished strings in process, plus the few last strings not yet processed) - for(size_t i=0; i> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { - *out++ = (u8) s.code(); cur += s.length(); - } else { - // could be a 2-byte or 1-byte code, or miss - // handle everything with predication - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); - cur += (code>>FSST_LEN_BITS); - } - } - job.out = out - codeBase; - } - // postprocess job info - job.cur = 0; - job.end = job.out - input[job.pos].out; // misuse .end field as compressed size - job.out = input[job.pos].out; // reset offset to start of encoded string - input[job.pos] = job; + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol &q1, const QSymbol &q2) { + return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); + }; + priority_queue, decltype(cmpGn)> pq(cmpGn); + for (auto &q: cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); } - - // copy out the result data - for(size_t i=0; i= bestGain) { // a new best solution! + counters.backup1(bestCounters); + *bestTable = *st; + bestGain = gain; } - - // go for the next batch of 512 chunks - inOff = outOff = batchPos = empty = 0; - budget = (size_t) (lim - dst); - } - } while (curLine == prevLine && outOff <= (1<<19)); - } - return curLine; -} + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; + } + + static inline size_t + compressSIMD(SymbolTable &symbolTable, u8 *symbolBase, size_t nlines, const size_t len[], const u8 *line[], + size_t size, u8 *dst, size_t lenOut[], u8 *strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1 << 18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1 << 19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine] - curOff) * 2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine] - curOff) * 2; + + strOut[curLine] = (u8 *) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out 
= outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2 * (size_t) (job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes. + if (outOff > (1 << 19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while (curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1 << 19)) || (++curLine >= nlines)) { // cannot accumulate more? + if (batchPos - empty >= + 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for (size_t i = 0; i < batchPos; i++) { + size_t len = input[i].end - input[i].cur; + sortpos[512UL - len]++; + } + // calculate running sum + for (size_t i = 1; i <= 512; i++) + sortpos[i] += sortpos[i - 1]; + + // move jobs to their final destination + SIMDjob inputOrdered[512]; + for (size_t i = 0; i < batchPos; i++) { + size_t len = input[i].end - input[i].cur; + size_t pos = sortpos[511UL - len]++; + inputOrdered[pos] = input[i]; + } + // finally.. SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..) 
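+                    // note: jobs the kernel did not finish (its return value can be less than
+                    // batchPos-empty), plus the empty jobs radix-sorted to the tail, are copied
+                    // through unchanged so the scalar finishing loop below completes them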
+ for (size_t done = fsst_compressAVX512(symbolTable, codeBase, symbolBase, inputOrdered, output, + batchPos - empty, unroll); + done < batchPos; done++) + output[done] = inputOrdered[done]; + } else { + memcpy(output, input, batchPos * sizeof(SIMDjob)); + } + + // finish encoding (unfinished strings in process, plus the few last strings not yet processed) + for (size_t i = 0; i < batchPos; i++) { + SIMDjob job = output[i]; + if (job.cur < job.end) { // finish encoding this string with scalar code + u8 *cur = symbolBase + job.cur; + u8 *end = symbolBase + job.end; + u8 *out = codeBase + job.out; + while (cur < end) { + u64 word = fsst_unaligned_load(cur); + size_t code = symbolTable.shortCodes[word & 0xFFFF]; + size_t pos = word & 0xFFFFFF; + size_t idx = FSST_HASH(pos) & (symbolTable.hashTabSize - 1); + Symbol s = symbolTable.hashTab[idx]; + out[1] = (u8) word; // speculatively write out escaped byte + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + *out++ = (u8) s.code(); + cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1 + ((code & FSST_CODE_BASE) >> 8); + cur += (code >> FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for (size_t i = 0; i < batchPos; i++) { + size_t lineNr = jobLine[i]; // the sort must be order-preserving, as we concatenate results string in order + size_t sz = input[i].end; // had stored compressed lengths here + if (!strOut[lineNr]) strOut[lineNr] = dst; // first segment will be the strOut pointer + lenOut[lineNr] += sz; // add segment (lenOut starts at 0 for this reason) + memcpy(dst, codeBase + input[i].out, sz); + dst += sz; + } + + // go for the next batch of 512 chunks + inOff = outOff = batchPos = empty = 0; + budget = (size_t) (lim - dst); + } + } while (curLine == prevLine && outOff <= (1 << 19)); + } + return curLine; + } // optimized adaptive *scalar* compression method -static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, const size_t lenIn[], const u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) { - const u8 *cur = NULL, *end = NULL, *lim = out + size; - size_t curLine, suffixLim = symbolTable.suffixLim; - u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0]; - - u8 buf[512+8] = {}; /* +8 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */ - - // three variants are possible. 
dead code falls away since the bool arguments are constants - auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) { - while (cur < end) { - u64 word = fsst_unaligned_load(cur); - size_t code = symbolTable.shortCodes[word & 0xFFFF]; - if (noSuffixOpt && ((u8) code) < suffixLim) { - // 2 byte code without having to worry about longer matches - *out++ = (u8) code; cur += 2; - } else { - size_t pos = word & 0xFFFFFF; - size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1); - Symbol s = symbolTable.hashTab[idx]; - out[1] = (u8) word; // speculatively write out escaped byte - word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { - *out++ = (u8) s.code(); cur += s.length(); - } else if (avoidBranch) { - // could be a 2-byte or 1-byte code, or miss - // handle everything with predication - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); - cur += (code>>FSST_LEN_BITS); - } else if ((u8) code < byteLim) { - // 2 byte code after checking there is no longer pattern - *out++ = (u8) code; cur += 2; - } else { - // 1 byte code or miss. - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse - cur++; + static inline size_t + compressBulk(SymbolTable &symbolTable, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *out, + size_t lenOut[], u8 *strOut[], bool noSuffixOpt, bool avoidBranch) { + const u8 *cur = NULL, *end = NULL, *lim = out + size; + size_t curLine, suffixLim = symbolTable.suffixLim; + u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0]; + + u8 buf[512 + 8] = {}; /* +8 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */ + + // three variants are possible. dead code falls away since the bool arguments are constants + auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) { + while (cur < end) { + u64 word = fsst_unaligned_load(cur); + size_t code = symbolTable.shortCodes[word & 0xFFFF]; + if (noSuffixOpt && ((u8) code) < suffixLim) { + // 2 byte code without having to worry about longer matches + *out++ = (u8) code; + cur += 2; + } else { + size_t pos = word & 0xFFFFFF; + size_t idx = FSST_HASH(pos) & (symbolTable.hashTabSize - 1); + Symbol s = symbolTable.hashTab[idx]; + out[1] = (u8) word; // speculatively write out escaped byte + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + *out++ = (u8) s.code(); + cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1 + ((code & FSST_CODE_BASE) >> 8); + cur += (code >> FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; + cur += 2; + } else { + // 1 byte code or miss. 
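+                            // note: the miss entry installed by finalize() keeps the
+                            // FSST_CODE_BASE bit set, so 'out' advances 2 bytes on a miss
+                            // (escape code here, escaped byte already speculated into out[1])
+                            // and 1 byte for a true 1-byte code; 'cur' advances 1 either way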
+ *out = (u8) code; + out += 1 + ((code & FSST_CODE_BASE) + >> 8); // predicated - tested with a branch, that was always worse + cur++; + } + } } - } - } - }; - - for(curLine=0; curLine 511) { - chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST - } - if ((2*chunk+7) > (size_t) (lim-out)) { - return curLine; // out of memory - } - // copy the string to the 511-byte buffer - memcpy(buf, cur, chunk); - buf[chunk] = (u8) symbolTable.terminator; - cur = buf; - end = cur + chunk; - - // based on symboltable stats, choose a variant that is nice to the branch predictor - if (noSuffixOpt) { - compressVariant(true,false); - } else if (avoidBranch) { - compressVariant(false,true); - } else { - compressVariant(false, false); - } - } while((curOff += chunk) < lenIn[curLine]); - lenOut[curLine] = (size_t) (out - strOut[curLine]); - } - return curLine; -} + }; + + for (curLine = 0; curLine < nlines; curLine++) { + size_t chunk, curOff = 0; + strOut[curLine] = out; + do { + cur = strIn[curLine] + curOff; + chunk = lenIn[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2 * chunk + 7) > (size_t) (lim - out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true, false); + } else if (avoidBranch) { + compressVariant(false, true); + } else { + compressVariant(false, false); + } + } while ((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; + } #define FSST_SAMPLELINE ((size_t) 512) // quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes -vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { - size_t totSize = 0; - const size_t *lenIn = *lenRef; - vector sample; - - for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample - Encoder *encoder = new Encoder(); - encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); - if (sampleLen != lenIn) delete[] sampleLen; - delete[] sampleBuf; - return (fsst_encoder_t*) encoder; -} + vector makeSample(u8 *sampleBuf, const u8 *strIn[], const size_t **lenRef, size_t nlines) { + size_t totSize = 0; + const size_t *lenIn = *lenRef; + vector sample; + + for (size_t i = 0; i < nlines; i++) + totSize += lenIn[i]; + + if (totSize < FSST_SAMPLETARGET) { + for (size_t i = 0; i < nlines; i++) + sample.push_back(strIn[i]); + } else { + size_t sampleRnd = FSST_HASH(4637947); + const u8 *sampleLim = sampleBuf + FSST_SAMPLETARGET; + size_t *sampleLen = new size_t[nlines + FSST_SAMPLEMAXSZ / FSST_SAMPLELINE]; + *lenRef = sampleLen; + size_t *sampleLenLim = sampleLen + nlines + FSST_SAMPLEMAXSZ / FSST_SAMPLELINE; + + while (sampleBuf < sampleLim && sampleLen < sampleLenLim) { + // choose a non-empty line + sampleRnd = FSST_HASH(sampleRnd); + size_t linenr = sampleRnd % nlines; + while (lenIn[linenr] == 0) + if (++linenr == nlines) linenr = 0; + + // choose a chunk + size_t chunks = 1 + ((lenIn[linenr] - 1) / FSST_SAMPLELINE); + sampleRnd = 
FSST_HASH(sampleRnd); + size_t chunk = FSST_SAMPLELINE * (sampleRnd % chunks); + + // add the chunk to the sample + size_t len = min(lenIn[linenr] - chunk, FSST_SAMPLELINE); + memcpy(sampleBuf, strIn[linenr] + chunk, len); + sample.push_back(sampleBuf); + sampleBuf += *sampleLen++ = len; + } + } + return sample; + } + + extern "C" fsst_encoder_t *fsst_create(size_t n, const size_t lenIn[], const u8 *strIn[], int zeroTerminated) { + u8 *sampleBuf = new u8[FSST_SAMPLEMAXSZ]; + const size_t *sampleLen = lenIn; + vector sample = makeSample(sampleBuf, strIn, &sampleLen, n ? n + : 1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr( + buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t *) encoder; + } /* create another encoder instance, necessary to do multi-threaded encoding using the same symbol table */ -extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { - Encoder *e = new Encoder(); - e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr - return (fsst_encoder_t*) e; -} + extern "C" fsst_encoder_t *fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder *) encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t *) e; + } // export a symbol table in compact format. -extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { - Encoder *e = (Encoder*) encoder; - // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. - // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t - // (such functionality could be useful to append compressed data to an existing block). - // - // However, the hash function in the encoder hash table is endian-sensitive, and given its - // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. - // Doing a endian-conversion during hashing will be slow and self-defeating. - // - // Overall, we could support reconstructing an encoder for incremental compression, but - // should enforce equal-endianness. Bit of a bummer. Not going there now. - // - // The version field is now there just for future-proofness, but not used yet - - // version allows keeping track of fsst versions, track endianness, and encoder reconstruction - u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 - (((u64) e->symbolTable->suffixLim) << 24) | - (((u64) e->symbolTable->terminator) << 16) | - (((u64) e->symbolTable->nSymbols) << 8) | - FSST_ENDIAN_MARKER; // least significant byte is nonzero - - /* do not assume unaligned reads here */ - memcpy(buf, &version, 8); - buf[8] = e->symbolTable->zeroTerminated; - for(u32 i=0; i<8; i++) - buf[9+i] = (u8) e->symbolTable->lenHisto[i]; - u32 pos = 17; - - // emit only the used bytes of the symbols - for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) - for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) - buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes - - return pos; // length of what was serialized -} + extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder *) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. 
+ // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. + // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for (u32 i = 0; i < 8; i++) + buf[9 + i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for (u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for (u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized + } #define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ -extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) { - u64 version = 0; - u32 code, pos = 17; - u8 lenHisto[8]; - - // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) - memcpy(&version, buf, 8); - if ((version>>32) != FSST_VERSION) return 0; - decoder->zeroTerminated = buf[8]&1; - memcpy(lenHisto, buf+9, 8); - - // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) - decoder->len[0] = 1; - decoder->symbol[0] = 0; - - // we use lenHisto[0] as 1-byte symbol run length (at the end) - code = decoder->zeroTerminated; - if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end - - // now get all symbols from the buffer - for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ - for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { - decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ - decoder->symbol[code] = 0; - for(u32 j=0; jlen[code]; j++) - ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols - } - } - if (decoder->zeroTerminated) lenHisto[0]++; + extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + if ((version >> 32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8] & 1; + memcpy(lenHisto, buf + 9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) 
lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end + + // now get all symbols from the buffer + for (u32 l = 1; l <= 8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for (u32 i = 0; i < lenHisto[(l & 7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { + decoder->len[code] = (l & 7) + 1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for (u32 j = 0; j < decoder->len[code]; j++) + ((u8 *) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; - // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). - while(code<255) { - decoder->symbol[code] = FSST_CORRUPT; - decoder->len[code++] = 8; - } - return pos; -} + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while (code < 255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; + } // runtime check for simd -inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + inline size_t + _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, + size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { #ifndef NONOPT_FSST - if (simd && fsst_hasAVX512()) - return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); #endif - (void) simd; - return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); -} -size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { - return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); -} + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, + avoidBranch); + } + + size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, + size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); + } // adaptive choosing of scalar compression method based on symbol length histogram -inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { - bool avoidBranch = false, noSuffixOpt = false; - if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { - noSuffixOpt = true; - } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && - (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && - (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { - avoidBranch = true; - } - return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); -} -size_t 
compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { - return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); + inline size_t + _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, + size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100 * e->symbolTable->lenHisto[1] > 65 * e->symbolTable->nSymbols && + 100 * e->symbolTable->suffixLim > 95 * e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || + e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); + } + + size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, + size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); + } } // the main compression function (everything automatic) @@ -621,12 +657,12 @@ extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const si // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) size_t totLen = accumulate(lenIn, lenIn+nlines, 0); int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); - return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); + return _compressAuto((libfsst::Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); } /* deallocate encoder */ extern "C" void fsst_destroy(fsst_encoder_t* encoder) { - Encoder *e = (Encoder*) encoder; + libfsst::Encoder *e = (libfsst::Encoder*) encoder; delete e; } @@ -639,3 +675,4 @@ extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; return decoder; } + diff --git a/libfsst.hpp b/libfsst.hpp index 6a38ab4..cb04bd7 100644 --- a/libfsst.hpp +++ b/libfsst.hpp @@ -51,62 +51,85 @@ typedef uint64_t u64; // we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length #define FSST_LEN_BITS 12 -#define FSST_CODE_BITS 9 +#define FSST_CODE_BITS 9 #define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ #define FSST_CODE_MAX (1UL<=8) { - len = 8; - memcpy(val.str, input, 8); + if (len >= 8) { + len = 8; + memcpy(val.str, input, 8); } else { - memcpy(val.str, input, len); + memcpy(val.str, input, len); } set_code_len(FSST_CODE_MAX, len); } - void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + void set_code_len(u32 code, u32 len) { icl = (len << 28) | (code << 16) | ((8 - len) * 8); } u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } - u8 first() const { assert( length() >= 1); return 0xFF & val.num; } - u16 first2() const { assert( length() >= 2); return 0xFFFF & val.num; } + u8 first() const { + assert(length() >= 1); + return 0xFF & val.num; + } -#define FSST_HASH_LOG2SIZE 10 + u16 first2() const { + assert(length() >= 2); + return 0xFFFF & val.num; + } + +#define FSST_HASH_LOG2SIZE 10 #define FSST_HASH_PRIME 2971215073LL #define FSST_SHIFT 15 #define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) - size_t hash() const { size_t v = 0xFFFFFF & val.num; return FSST_HASH(v); } // hash on the next 3 bytes + + size_t hash() const { + size_t v = 0xFFFFFF & val.num; + return FSST_HASH(v); + } // hash on the next 3 bytes }; // Symbol that can be put in a queue, ordered on gain -struct QSymbol{ +struct QSymbol { Symbol symbol; mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols - bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } + bool operator==(const QSymbol &other) const { + return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); + } }; // we construct FSST symbol tables using a random sample of about 16KB (1<<14) @@ -144,7 +167,7 @@ struct QSymbol{ // the gain field is only used in the symbol queue that sorts symbols on gain struct SymbolTable { - static const u32 hashTabSize = 1<> (u8) s.icl); return true; } + bool add(Symbol s) { assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); u32 len = s.length(); s.set_code_len(FSST_CODE_BASE + nSymbols, len); if (len == 1) { - byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { - return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + size_t idx = s.hash() & (hashTabSize - 1); + if (hashTab[idx].icl <= s.icl && + hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl >> 16) & FSST_CODE_MASK; // matched a long symbol } if (s.length() >= 2) { - u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; - if (code >= FSST_CODE_BASE) return code; + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; } return byteCodes[s.first()] & FSST_CODE_MASK; } - u16 findLongestSymbol(const u8* cur, const u8* end) const { - return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + + u16 findLongestSymbol(const u8 *cur, const u8 *end) const { + return findLongestSymbol(Symbol(cur, end)); // represent the string as a temporary symbol } // rationale for finalize: @@ -275,53 +303,53 @@ struct SymbolTable { // finally, shortCodes[] is modified 
 // Symbol that can be put in a queue, ordered on gain
-struct QSymbol{
+struct QSymbol {
    Symbol symbol;
    mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols
-   bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); }
+   bool operator==(const QSymbol &other) const {
+      return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length();
+   }
 };
 
 // we construct FSST symbol tables using a random sample of about 16KB (1<<14)
@@ -144,7 +167,7 @@ struct QSymbol{
 // the gain field is only used in the symbol queue that sorts symbols on gain
 struct SymbolTable {
-   static const u32 hashTabSize = 1<<FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
+   static const u32 hashTabSize = 1 << FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
 
    // lookup table using the next two bytes (65536 codes), or just the next single byte
    u16 shortCodes[65536];
 
    // lookup table (only used during symbolTable construction, not during normal text compression)
    u16 byteCodes[256];
 
    // 'symbols' is the current symbol table: symbols[code] is the symbol for a given code
    Symbol symbols[FSST_CODE_MAX];
 
    // replicate long symbols in hashTab (avoid indirection)
    Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes
 
    u16 nSymbols;                 // amount of symbols in the map (max 255)
    u16 suffixLim;                // codes higher than this do not have a longer suffix
    u16 terminator;               // code of 1-byte symbol that can be used as a terminator during compression
    bool zeroTerminated;          // whether we are expecting zero-terminated strings
    u16 lenHisto[FSST_CODE_BITS]; // lenHisto[x] is the amount of symbols of byte-length (x+1)
 
    bool hashInsert(Symbol s) {
       size_t idx = s.hash() & (hashTabSize - 1);
       bool taken = (hashTab[idx].icl < FSST_ICL_FREE);
       if (taken) return false; // collision in hash table
       hashTab[idx].icl = s.icl;
       hashTab[idx].val.num = s.val.num & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
       return true;
    }
+
    bool add(Symbol s) {
       assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX);
       u32 len = s.length();
       s.set_code_len(FSST_CODE_BASE + nSymbols, len);
       if (len == 1) {
-         byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<<FSST_LEN_BITS); // len=1
+         byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1 << FSST_LEN_BITS); // len=1
       } else if (len == 2) {
-         shortCodes[s.first2()] = FSST_CODE_BASE + nSymbols + (2<<FSST_LEN_BITS); // len=2
+         shortCodes[s.first2()] = FSST_CODE_BASE + nSymbols + (2 << FSST_LEN_BITS); // len=2
       } else if (!hashInsert(s)) {
          return false;
       }
       symbols[FSST_CODE_BASE + nSymbols++] = s;
       lenHisto[len - 1]++;
       return true;
    }
+
    u16 findLongestSymbol(Symbol s) const {
-      size_t idx = s.hash()&(hashTabSize-1);
-      if (hashTab[idx].icl <= s.icl && hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) {
-         return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol
+      size_t idx = s.hash() & (hashTabSize - 1);
+      if (hashTab[idx].icl <= s.icl &&
+          hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) {
+         return (hashTab[idx].icl >> 16) & FSST_CODE_MASK; // matched a long symbol
       }
       if (s.length() >= 2) {
-         u16 code = shortCodes[s.first2()] & FSST_CODE_MASK;
-         if (code >= FSST_CODE_BASE) return code;
+         u16 code = shortCodes[s.first2()] & FSST_CODE_MASK;
+         if (code >= FSST_CODE_BASE) return code;
       }
       return byteCodes[s.first()] & FSST_CODE_MASK;
    }
-   u16 findLongestSymbol(const u8* cur, const u8* end) const {
-      return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol
+
+   u16 findLongestSymbol(const u8 *cur, const u8 *end) const {
+      return findLongestSymbol(Symbol(cur, end)); // represent the string as a temporary symbol
    }
 
    // rationale for finalize:
@@ -275,53 +303,53 @@ struct SymbolTable {
    // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore).
    //
    void finalize(u8 zeroTerminated) {
-      assert(nSymbols <= 255);
-      u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated);
-
-      // compute running sum of code lengths (starting offsets for each length)
-      rsum[0] = byteLim; // 1-byte codes are highest
-      rsum[1] = zeroTerminated;
-      for(u32 i=1; i<7; i++)
-         rsum[i+1] = rsum[i] + lenHisto[i];
-
-      // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim)
-      suffixLim = rsum[1];
-      symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only)
-
-      for(u32 i=zeroTerminated, j=rsum[2]; i<nSymbols; i++) {
-         Symbol s1 = symbols[FSST_CODE_BASE+i];
-         u32 len = s1.length(), opt = (len == 2)*nSymbols;
-         if (opt) {
-            u16 first2 = s1.first2();
-            for(u32 k=0; k<opt; k++) {
-               Symbol s2 = symbols[FSST_CODE_BASE+k];
-               if (k != i && s2.length() > 1 && first2 == s2.first2()) // test if symbol k is a suffix of s
-                  opt = 0;
-            }
-            newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim
-         } else
-            newCode[i] = rsum[len-1]++;
-         s1.set_code_len(newCode[i],len);
-         symbols[newCode[i]] = s1;
-      }
-      // renumber the codes in byteCodes[]
-      for(u32 i=0; i<256; i++)
-         if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
-            byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS);
-         else
-            byteCodes[i] = 511 + (1 << FSST_LEN_BITS);
-
-      // renumber the codes in shortCodes[]
-      for(u32 i=0; i<65536; i++)
-         if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
-            shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS));
-         else
-            shortCodes[i] = byteCodes[i&0xFF];
-
-      // replace the symbols in the hash table
-      for(u32 i=0; i<hashTabSize; i++)
-         if (hashTab[i].icl < FSST_ICL_FREE)
-            hashTab[i] = symbols[newCode[(u8) hashTab[i].code()]];
+      assert(nSymbols <= 255);
+      u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated);
+
+      // compute running sum of code lengths (starting offsets for each length)
+      rsum[0] = byteLim; // 1-byte codes are highest
+      rsum[1] = zeroTerminated;
+      for (u32 i = 1; i < 7; i++)
+         rsum[i + 1] = rsum[i] + lenHisto[i];
+
+      // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim)
+      suffixLim = rsum[1];
+      symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only)
+
+      for (u32 i = zeroTerminated, j = rsum[2]; i < nSymbols; i++) {
+         Symbol s1 = symbols[FSST_CODE_BASE + i];
+         u32 len = s1.length(), opt = (len == 2) * nSymbols;
+         if (opt) {
+            u16 first2 = s1.first2();
+            for (u32 k = 0; k < opt; k++) {
+               Symbol s2 = symbols[FSST_CODE_BASE + k];
+               if (k != i && s2.length() > 1 && first2 == s2.first2()) // test if symbol k is a suffix of s
+                  opt = 0;
+            }
+            newCode[i] = opt ? suffixLim++ : --j; // symbols without a larger suffix have a code < suffixLim
+         } else
+            newCode[i] = rsum[len - 1]++;
+         s1.set_code_len(newCode[i], len);
+         symbols[newCode[i]] = s1;
+      }
+      // renumber the codes in byteCodes[]
+      for (u32 i = 0; i < 256; i++)
+         if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
+            byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS);
+         else
+            byteCodes[i] = 511 + (1 << FSST_LEN_BITS);
+
+      // renumber the codes in shortCodes[]
+      for (u32 i = 0; i < 65536; i++)
+         if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
+            shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS));
+         else
+            shortCodes[i] = byteCodes[i & 0xFF];
+
+      // replace the symbols in the hash table
+      for (u32 i = 0; i < hashTabSize; i++)
+         if (hashTab[i].icl < FSST_ICL_FREE)
+            hashTab[i] = symbols[newCode[(u8) hashTab[i].code()]];
    }
 };
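The rsum[] computation in finalize() is the classic counting-sort step: the length histogram is turned into starting offsets, so codes come out grouped by symbol length. A reduced sketch with hypothetical histogram values (it leaves out the suffixLim split of the 2-byte class, which the real code layers on top):

   #include <cstdint>
   #include <cstdio>

   // Counting-sort flavor of finalize(): turn a length histogram into the
   // first new code of each length class, then hand out codes in order.
   int main() {
       uint8_t lenHisto[8] = {10, 20, 5, 3, 0, 0, 0, 2}; // hypothetical: 10 1-byte symbols, 20 2-byte, ...
       uint8_t rsum[8];
       uint8_t nSymbols = 40, zeroTerminated = 0;
       rsum[0] = nSymbols - (lenHisto[0] - zeroTerminated); // 1-byte codes are placed highest
       rsum[1] = zeroTerminated;                            // multi-byte codes start at the bottom
       for (int i = 1; i < 7; i++)
           rsum[i + 1] = rsum[i] + lenHisto[i];
       for (int i = 0; i < 8; i++)
           printf("len=%d codes start at %d\n", i + 1, rsum[i]);
       return 0;
   }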
@@ -353,6 +381,7 @@ struct Counters {
    }
 };
 #else
+
 // we keep two counters count1[pos] and count2[pos1][pos2] of resp 16 and 12-bits. Both are split into two columns for performance reasons
 // first reason is to make the column we update the most during symbolTable construction (the low bits) thinner, thus reducing CPU cache pressure.
 // second reason is that when scanning the array, after seeing a 64-bits 0 in the high bits column, we can quickly skip over many codes (15 or 7)
@@ -360,28 +389,33 @@ struct Counters {
 // high arrays come before low arrays, because our GetNext() methods may overrun their 64-bits reads a few bytes
    u8 count1High[FSST_CODE_MAX]; // array to count frequency of symbols as they occur in the sample (16-bits)
    u8 count1Low[FSST_CODE_MAX];  // it is split into a low and a high byte: cnt = count1High*256 + count1Low
-   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
+   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX /
+                                2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
    u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX]; // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2)
    // 385KB -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB)
-
-   void count1Set(u32 pos1, u16 val) {
-      count1Low[pos1] = val&255;
-      count1High[pos1] = val>>8;
+
+   void count1Set(u32 pos1, u16 val) {
+      count1Low[pos1] = val & 255;
+      count1High[pos1] = val >> 8;
    }
-   void count1Inc(u32 pos1) {
+
+   void count1Inc(u32 pos1) {
      if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
        count1High[pos1]++;  //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)...
    }
-   void count2Inc(u32 pos1, u32 pos2) {
-      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
-         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively
-         count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample)
+
+   void count2Inc(u32 pos1, u32 pos2) {
+      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
+         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, respectively
+         count2High[pos1][(pos2) >> 1] +=
+            1 << (((pos2) & 1) << 2); // we take our chances with overflow.. (4K maxval, on an 8K sample)
    }
+
    u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range
       // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
       u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7]
 
-      u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes
+      u32 zero = high ? (__builtin_ctzl(high) >> 3) : 7UL; // number of zero bytes
       high = (high >> (zero << 3)) & 255; // advance to nonzero counter
       if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos1
          return 0; // all zero
@@ -390,12 +424,14 @@ struct Counters {
       if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
       return (u32) ((high << 8) + low);
    }
+
    u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range
       // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros
-      u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15]
-      high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters
+      u64 high = fsst_unaligned_load(
+         &count2High[pos1][pos2 >> 1]); // note: this reads 16 subsequent counters [pos2..pos2+15]
+      high >>= ((pos2 & 1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters
 
-      u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters
+      u32 zero = high ? (__builtin_ctzl(high) >> 2) : (15UL - (pos2 & 1UL)); // number of zero 4-bits counters
       high = (high >> (zero << 2)) & 15; // advance to nonzero counter
       if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2
          return 0UL; // all zero
@@ -404,15 +440,18 @@ struct Counters {
       if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
       return (u32) ((high << 8) + low);
    }
+
    void backup1(u8 *buf) {
       memcpy(buf, count1High, FSST_CODE_MAX);
-      memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
+      memcpy(buf + FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
    }
+
    void restore1(u8 *buf) {
       memcpy(count1High, buf, FSST_CODE_MAX);
-      memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX);
+      memcpy(count1Low, buf + FSST_CODE_MAX, FSST_CODE_MAX);
    }
-};
+};
+
 #endif
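The "increment high early" trick is subtle: the high byte is bumped when the low byte wraps *from* 0, so a nonzero high byte is exactly equivalent to a nonzero counter, which is what makes the 8-counters-at-a-time zero scan valid. A standalone toy version of the same invariant (same increment and read-back arithmetic, simplified to a single counter):

   #include <cassert>
   #include <cstdint>

   uint8_t lo = 0, hi = 0;

   void inc() {          // mirrors count1Inc(): increment high early
       if (!lo++) hi++;  // (0,0)->(1,1)->...->(255,1)->(0,1)->(1,2)->...
   }

   uint32_t get() {      // mirrors the read-back in count1GetNext()
       uint32_t h = hi;
       if (lo) h--;      // high was incremented early; undo unless low is 0
       return (h << 8) + lo;
   }

   int main() {
       for (int i = 0; i < 1000; i++) { assert(get() == (uint32_t) i); inc(); }
       return 0;
   }

Since hi==0 if and only if the counter is 0, count1GetNext() can load eight high bytes at once and use __builtin_ctzl to jump straight to the first live code.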
@@ -429,22 +468,26 @@ struct Encoder {
 
 // job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call)
 struct SIMDjob {
-   u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB)
+   u64 out: 19, pos: 9, end: 18, cur: 18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB)
 };
 
-extern bool
+extern bool
 fsst_hasAVX512(); // runtime check for avx512 capability
 
-extern size_t
+extern size_t
 fsst_compressAVX512(
-   SymbolTable &symbolTable,
-   u8* codeBase,    // IN: base address for codes, i.e. compression output (points to simdbuf+256KB)
-   u8* symbolBase,  // IN: base address for string bytes, i.e. compression input (points to simdbuf)
-   SIMDjob* input,  // IN: input array (size n) with job information: what to encode, where to store it.
-   SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer.
-   size_t n,        // IN: size of arrays input and output (should be max 512)
-   size_t unroll);  // IN: degree of SIMD unrolling
+   SymbolTable &symbolTable,
+   u8 *codeBase,    // IN: base address for codes, i.e. compression output (points to simdbuf+256KB)
+   u8 *symbolBase,  // IN: base address for string bytes, i.e. compression input (points to simdbuf)
+   SIMDjob *input,  // IN: input array (size n) with job information: what to encode, where to store it.
+   SIMDjob *output, // OUT: output array (size n) with job information: how much got encoded, end output pointer.
+   size_t n,        // IN: size of arrays input and output (should be max 512)
+   size_t unroll);  // IN: degree of SIMD unrolling
 
 // C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree)
-size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
-size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd);
+size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut,
+                    u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
+
+size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut,
+                    u8 *strOut[], int simd);
+}
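The SIMDjob bit-field is worth a second look: 19 + 9 + 18 + 18 sums to exactly 64 bits, so one job descriptor fits one SIMD lane. A quick sanity sketch (not part of the patch) of the capacities those widths imply:

   #include <cstdint>
   #include <cstdio>

   struct SIMDjob {                        // same field widths as above
       uint64_t out: 19, pos: 9, end: 18, cur: 18;
   };

   int main() {
       static_assert(sizeof(SIMDjob) == 8, "one 64-bit SIMD lane");
       printf("max output offset : %u (512KB window)\n", (1u << 19) - 1);
       printf("max strings/call  : %u\n", 1u << 9);
       printf("max input offset  : %u (256KB window)\n", (1u << 18) - 1);
       return 0;
   }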
diff --git a/libfsst12.cpp b/libfsst12.cpp
index fa68451..3c86b37 100644
--- a/libfsst12.cpp
+++ b/libfsst12.cpp
@@ -19,23 +19,25 @@
 #include
 #include
 
+namespace libfsst {
 Symbol concat(Symbol a, Symbol b) {
    Symbol s;
-   u32 length = min(8, a.length()+b.length());
+   u32 length = min(8, a.length() + b.length());
    s.set_code_len(FSST_CODE_MASK, length);
-   *(u64*) s.symbol = ((*(u64*) b.symbol) << (8*a.length())) | *(u64*) a.symbol;
+   *(u64 *) s.symbol = ((*(u64 *) b.symbol) << (8 * a.length())) | *(u64 *) a.symbol;
    return s;
 }
+}
 
 namespace std {
-template <>
-class hash<Symbol> {
-  public:
-   size_t operator()(const Symbol& s) const {
-      uint64_t k = *(u64*) s.symbol;
+template<>
+class hash<libfsst::Symbol> {
+public:
+   size_t operator()(const libfsst::Symbol &s) const {
+      uint64_t k = *(u64 *) s.symbol;
       const uint64_t m = 0xc6a4a7935bd1e995;
       const int r = 47;
-      uint64_t h = 0x8445d61a4e774912 ^ (8*m);
+      uint64_t h = 0x8445d61a4e774912 ^ (8 * m);
       k *= m;
       k ^= k >> r;
       k *= m;
@@ -49,41 +51,46 @@ class hash {
 };
 }
 
-std::ostream& operator<<(std::ostream& out, const Symbol& s) {
-   for (u32 i=0; i<s.length(); i++)
-      out << s.symbol[i];
-   return out;
+std::ostream &operator<<(std::ostream &out, const Symbol &s) {
+   for (u32 i = 0; i < s.length(); i++)
+      out << s.symbol[i];
+   return out;
 }
 
-SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& sample, const ulong len[], const u8* line[]) {
-   ulong sampleSize = max(sampleParam, FSST_SAMPLEMAXSZ); // if sampleParam is negative, we need to ignore part of the last line
+SymbolMap *
+buildSymbolMap(Counters &counters, long sampleParam, vector<ulong> &sample, const ulong len[], const u8 *line[]) {
+   ulong sampleSize = max(sampleParam,
+                          FSST_SAMPLEMAXSZ); // if sampleParam is negative, we need to ignore part of the last line
    SymbolMap *st = new SymbolMap(), *bestMap = new SymbolMap();
    long bestGain = -sampleSize; // worst case (everything exception)
    ulong sampleFrac = 128;
 
-      for(ulong i=0; i<sample.size(); i++) {
-         const u8 *cur = line[sample[i]];
-         const u8 *end = cur + len[sample[i]];
-         if (sampleParam < 0 && i+1 == sample.size()) {
-            end += sampleParam; // sampleParam<0: ignore part of the last line
-            if ((end-cur) > 500) end = cur + ((end-cur)*sampleFrac)/128; // shorten long lines to the sample fraction
+      for (ulong i = 0; i < sample.size(); i++) {
+         const u8 *cur = line[sample[i]];
+         const u8 *end = cur + len[sample[i]];
+         if (sampleParam < 0 && i + 1 == sample.size()) {
+            end += sampleParam; // sampleParam<0: ignore part of the last line
+            if ((end - cur) > 500)
+               end = cur + ((end - cur) * sampleFrac) /
+                           128; // shorten long lines to the sample fraction
          } else if (sampleFrac < 128) {
            // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
            if (rnd128(i) > sampleFrac) continue;
@@ -94,32 +101,33 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
          cur += pos1 >> 12;
          pos1 &= FSST_CODE_MASK;
          while (true) {
-            const u8 *old = cur;
+            const u8 *old = cur;
             counters.count1Inc(pos1);
-            if (cur<end-7) {
+            if (cur < end - 7) {
                u64 word = fsst_unaligned_load(cur);
                ulong pos = (u32) word;
-               ulong idx = FSST_HASH(pos)&(st->hashTabSize-1);
+               ulong idx = FSST_HASH(pos) & (st->hashTabSize - 1);
                Symbol s = st->hashTab[idx];
                pos2 = st->shortCodes[word & 0xFFFF];
                word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
-               if ((s.gcl < FSST_GCL_FREE) && (*(u64*) s.symbol == word)) {
-                  pos2 = s.code(); cur += s.length();
+               if ((s.gcl < FSST_GCL_FREE) && (*(u64 *) s.symbol == word)) {
+                  pos2 = s.code();
+                  cur += s.length();
                } else {
                   cur += (pos2 >> 12);
                   pos2 &= FSST_CODE_MASK;
                }
-            } else if (cur==end) {
+            } else if (cur == end) {
                break;
             } else {
-               assert(cur<end);
+               assert(cur < end);
                pos2 = st->findExpansion(Symbol(cur, end));
                cur += pos2 >> 12;
                pos2 &= FSST_CODE_MASK;
             }
 
             // compute compressed output size (later divide by 2)
-            gain += 2*(cur-old)-3;
+            gain += 2 * (cur - old) - 3;
 
            // now count the subsequent two symbols we encode as an extension possibility
            if (sampleFrac < 128) { // no need to count pairs in final round
@@ -129,7 +137,7 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
            }
         }
      }
-     return gain;
+     return gain;
   };
 
   auto makeMap = [&](SymbolMap *st, Counters &counters) {
@@ -138,7 +146,7 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
      auto addOrInc = [&](unordered_set<Symbol> &cands, Symbol s, u32 count) {
         auto it = cands.find(s);
-        s.gain = s.length()*count;
+        s.gain = s.length() * count;
         if (it != cands.end()) {
            s.gain += (*it).gain;
            cands.erase(*it);
@@ -147,7 +155,7 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
      };
 
      // add candidate symbols based on counted frequency
-     for (u32 pos1=0; pos1<st->symbolCount; pos1++) {
+     for (u32 pos1 = 0; pos1 < st->symbolCount; pos1++) {
         u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
        if (!cnt1) continue;
@@ -160,7 +168,7 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
             s1.length() == Symbol::maxLength) { // symbol cannot be extended
           continue;
        }
-        for (u32 pos2=0; pos2<st->symbolCount; pos2++) {
+        for (u32 pos2 = 0; pos2 < st->symbolCount; pos2++) {
           u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
           if (!cnt2) continue;
@@ -172,9 +180,9 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
      }
 
      // insert candidates into priority queue (by gain)
-     auto cmpGn = [](const Symbol& q1, const Symbol& q2) { return q1.gain < q2.gain; };
-     priority_queue<Symbol,vector<Symbol>,decltype(cmpGn)> pq(cmpGn);
-     for (auto& q : cands)
+     auto cmpGn = [](const Symbol &q1, const Symbol &q2) { return q1.gain < q2.gain; };
+     priority_queue<Symbol, vector<Symbol>, decltype(cmpGn)> pq(cmpGn);
+     for (auto &q: cands)
        pq.push(q);
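The candidate-selection pattern above is an ordinary max-heap keyed on gain = length × count. A reduced sketch of the same pattern with hypothetical sample counts (Cand stands in for the Symbol candidates; not the library's actual types):

   #include <cstdint>
   #include <cstdio>
   #include <queue>
   #include <string>
   #include <vector>

   struct Cand { std::string s; uint32_t gain; };

   int main() {
       // hypothetical counts: gain = length * count, as in addOrInc()
       std::vector<Cand> cands = {{"http", 4 * 100}, {"e", 1 * 500}, {"tion", 4 * 80}};
       auto cmp = [](const Cand &a, const Cand &b) { return a.gain < b.gain; };
       std::priority_queue<Cand, std::vector<Cand>, decltype(cmp)> pq(cmp);
       for (auto &c : cands) pq.push(c);
       while (!pq.empty()) {               // pops "e", "http", "tion" -- highest gain first
           printf("%s (gain %u)\n", pq.top().s.c_str(), pq.top().gain);
           pq.pop();
       }
       return 0;
   }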
 
      // Create new symbol map using best candidates
@@ -190,13 +198,14 @@
    for(ulong frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) {
       sampleFrac = frac;
 #else
-   for(sampleFrac=14; true; sampleFrac = sampleFrac + 38) {
+   for (sampleFrac = 14; true; sampleFrac = sampleFrac + 38) {
 #endif
       memset(&counters, 0, sizeof(Counters));
       long gain = compressCount(st, counters);
       if (gain >= bestGain) { // a new best solution!
-         *bestMap = *st; bestGain = gain;
-      }
+         *bestMap = *st;
+         bestGain = gain;
+      }
       if (sampleFrac >= 128) break; // we do 4 rounds (sampleFrac=14,52,90,128)
       makeMap(st, counters);
    }
@@ -205,21 +214,23 @@ SymbolMap *buildSymbolMap(Counters& counters, long sampleParam, vector<ulong>& s
 }
 
 // optimized adaptive *scalar* compression method
-static inline ulong compressBulk(SymbolMap &symbolMap, ulong nlines, const ulong lenIn[], const u8* strIn[], ulong size, u8* out, ulong lenOut[], u8* strOut[]) {
+static inline ulong
+compressBulk(SymbolMap &symbolMap, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *out,
+             ulong lenOut[], u8 *strOut[]) {
    u8 *lim = out + size;
    ulong curLine;
 
-   for(curLine=0; curLine<nlines; curLine++) {
-      const u8 *cur = strIn[curLine];
-      const u8 *end = cur + lenIn[curLine];
-      strOut[curLine] = out;
-      while (cur+16 <= end && (lim-out) >= 8) {
+   for (curLine = 0; curLine < nlines; curLine++) {
+      const u8 *cur = strIn[curLine];
+      const u8 *end = cur + lenIn[curLine];
+      strOut[curLine] = out;
+
+      while (cur + 16 <= end && (lim - out) >= 8) {
        u64 word = fsst_unaligned_load(cur);
        ulong code = symbolMap.shortCodes[word & 0xFFFF];
        ulong pos = (u32) word; // key is first 4 bytes
-       ulong idx = FSST_HASH(pos)&(symbolMap.hashTabSize-1);
+       ulong idx = FSST_HASH(pos) & (symbolMap.hashTabSize - 1);
        Symbol s = symbolMap.hashTab[idx];
        word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
-       if ((s.gcl < FSST_GCL_FREE) && *(ulong*) s.symbol == word) {
+       if ((s.gcl < FSST_GCL_FREE) && *(ulong *) s.symbol == word) {
          code = s.gcl >> 16;
        }
        cur += (code >> 12);
@@ -227,37 +238,37 @@ static inline ulong compressBulk(SymbolMap &symbolMap, ulong nlines, const ulong
        word = fsst_unaligned_load(cur);
        code = symbolMap.shortCodes[word & 0xFFFF];
        pos = (u32) word; // key is first 4 bytes
-       idx = FSST_HASH(pos)&(symbolMap.hashTabSize-1);
+       idx = FSST_HASH(pos) & (symbolMap.hashTabSize - 1);
        s = symbolMap.hashTab[idx];
        word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
-       if ((s.gcl < FSST_GCL_FREE) && *(ulong*) s.symbol == word) {
-         code = s.gcl >> 16;
+       if ((s.gcl < FSST_GCL_FREE) && *(ulong *) s.symbol == word) {
+         code = s.gcl >> 16;
        }
        cur += (code >> 12);
-       res |= (code&FSST_CODE_MASK) << 12;
+       res |= (code & FSST_CODE_MASK) << 12;
        memcpy(out, &res, sizeof(u64));
-       out += 3;
+       out += 3;
      }
 
      while (cur < end) {
        ulong code = symbolMap.findExpansion(Symbol(cur, end));
-       u32 res = (code&FSST_CODE_MASK);
-       if (out+8 > lim) {
-         return curLine; // u32 write would be out of bounds (out of output memory)
+       u32 res = (code & FSST_CODE_MASK);
+       if (out + 8 > lim) {
+         return curLine; // u32 write would be out of bounds (out of output memory)
        }
        cur += code >> 12;
        if (cur >= end) {
          memcpy(out, &res, sizeof(u64));
-         out += 2;
+         out += 2;
          break;
        }
        code = symbolMap.findExpansion(Symbol(cur, end));
-       res |= (code&FSST_CODE_MASK) << 12;
+       res |= (code & FSST_CODE_MASK) << 12;
        cur += code >> 12;
        memcpy(out, &res, sizeof(u64));
-       out += 3;
-     }
+       out += 3;
+     }
      lenOut[curLine] = out - strOut[curLine];
-   }
+   }
    return curLine;
 }
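compressBulk() sidesteps the awkward 12-bit code width by always emitting codes in pairs: two codes land in the low 24 bits of one word, stored with an overshooting memcpy while the output pointer advances only 3 bytes. A standalone illustration of that packing (assumes a little-endian target, like the fast path above; sizes are shrunk to the minimum):

   #include <cassert>
   #include <cstdint>
   #include <cstring>

   int main() {
       uint32_t code1 = 0x123, code2 = 0xABC;  // two hypothetical 12-bit codes
       uint8_t buf[16] = {0};
       uint8_t *out = buf;

       uint32_t res = code1 | (code2 << 12);   // the pair fits in 24 bits
       memcpy(out, &res, sizeof(uint32_t));    // store wide, claim narrow:
       out += 3;                               // pointer advances only 3 bytes

       assert((buf[0] | (buf[1] << 8) | (buf[2] << 16)) == (int) (code1 | (code2 << 12)));
       return 0;
   }

This is why the loop guards check for 8 spare output bytes even though each step consumes only 2 or 3: the trailing bytes of the wide store are scratch that the next pair overwrites.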
@@ -265,56 +276,57 @@
 long makeSample(vector<ulong> &sample, ulong nlines, const ulong len[]) {
    ulong i, sampleRnd = 1, sampleProb = 256, sampleSize = 0, totSize = 0;
    ulong sampleTarget = FSST_SAMPLETARGET;
 
-   for(i=0; i<nlines; i++)
-      totSize += len[i];
-   if (totSize > FSST_SAMPLETARGET) {
-      // if the batch is larger than the sampletarget, sample this fraction
-      sampleProb = max(((ulong) 4),(256*sampleTarget) / totSize);
+   for (i = 0; i < nlines; i++)
+      totSize += len[i];
+
+   if (totSize > FSST_SAMPLETARGET) {
+      // if the batch is larger than the sampletarget, sample this fraction
+      sampleProb = max(((ulong) 4), (256 * sampleTarget) / totSize);
    } else {
       // too little data. But ok, do not include lines multiple times, just use everything once
-      sampleTarget = totSize; // sampleProb will be 256/256 (aka 100%)
-   }
+      sampleTarget = totSize; // sampleProb will be 256/256 (aka 100%)
+   }
    do { // if nlines is very large and strings are small (< 8, so we need 4K lines), we still expect 4K*256/4 iterations total worst case
-      for(i=0; i<nlines; i++) {
-         sampleRnd = FSST_HASH(sampleRnd);
-         if ((sampleRnd & 255) < sampleProb) {
-            sample.push_back(i);
-            sampleSize += len[i];
-            if (sampleSize >= sampleTarget) // enough?
-               i = nlines; // break out of both loops;
+      for (i = 0; i < nlines; i++) {
+         sampleRnd = FSST_HASH(sampleRnd);
+         if ((sampleRnd & 255) < sampleProb) {
+            sample.push_back(i);
+            sampleSize += len[i];
+            if (sampleSize >= sampleTarget) // enough?
+               i = nlines; // break out of both loops;
          }
       }
       sampleProb *= 4; //accelerate the selection process at expense of front-bias (4,16,64,256: 4 passes max)
-   } while(i <= nlines); // basically continue until we have enough
+   } while (i <= nlines); // basically continue until we have enough
 
    // if the last line (only line?) is excessively long, return a negative samplesize (the amount of front bytes to skip)
    long sampleLong = (long) sampleSize;
    assert(sampleLong > 0);
-   return (sampleLong < FSST_SAMPLEMAXSZ)?sampleLong:FSST_SAMPLEMAXSZ-sampleLong;
+   return (sampleLong < FSST_SAMPLEMAXSZ) ? sampleLong : FSST_SAMPLEMAXSZ - sampleLong;
 }
 
-extern "C" fsst_encoder_t* fsst_create(ulong n, const ulong lenIn[], const u8 *strIn[], int dummy) {
+extern "C" fsst_encoder_t *fsst_create(ulong n, const ulong lenIn[], const u8 *strIn[], int dummy) {
    vector<ulong> sample;
    (void) dummy;
-   long sampleSize = makeSample(sample, n?n:1, lenIn); // careful handling of input to get a right-size and representative sample
+   long sampleSize = makeSample(sample, n ? n : 1,
+                                lenIn); // careful handling of input to get a right-size and representative sample
    Encoder *encoder = new Encoder();
    encoder->symbolMap = shared_ptr<SymbolMap>(buildSymbolMap(encoder->counters, sampleSize, sample, lenIn, strIn));
-   return (fsst_encoder_t*) encoder;
+   return (fsst_encoder_t *) encoder;
 }
 
 /* create another encoder instance, necessary to do multi-threaded encoding using the same dictionary */
-extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) {
+extern "C" fsst_encoder_t *fsst_duplicate(fsst_encoder_t *encoder) {
    Encoder *e = new Encoder();
-   e->symbolMap = ((Encoder*)encoder)->symbolMap; // it is a shared_ptr
-   return (fsst_encoder_t*) e;
+   e->symbolMap = ((Encoder *) encoder)->symbolMap; // it is a shared_ptr
+   return (fsst_encoder_t *) e;
 }
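Since fsst_duplicate() only copies the shared_ptr to the symbol map, the intended multi-threaded pattern is cheap: build one encoder from a sample, then clone one handle per thread. A hedged usage sketch against the C API above (error handling and real data omitted; the shard function and its contents are illustrative, not library code):

   #include <thread>
   #include <vector>
   #include "fsst12.h" // the official FSST API header referenced in this file

   void compress_shard(fsst_encoder_t *enc) {
       // each thread owns its duplicate; all duplicates share the immutable symbol table
       fsst_encoder_t *mine = fsst_duplicate(enc);
       // ... call fsst_compress(mine, ...) on this thread's strings ...
       fsst_destroy(mine);
   }

   int main() {
       const unsigned char *str[2] = {(const unsigned char *) "hello", (const unsigned char *) "hellish"};
       unsigned long len[2] = {5, 7};
       fsst_encoder_t *enc = fsst_create(2, len, str, 0); // dictionary built once, from a sample
       std::vector<std::thread> pool;
       for (int t = 0; t < 4; t++) pool.emplace_back(compress_shard, enc);
       for (auto &th : pool) th.join();
       fsst_destroy(enc);
       return 0;
   }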
- // + // // The version field is now there just for future-proofness, but not used yet - + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction u64 version = (FSST_VERSION << 32) | FSST_ENDIAN_MARKER; // least significant byte is nonzero /* do not assume unaligned reads here */ memcpy(buf, &version, 8); - memcpy(buf+8, e->symbolMap->lenHisto, 16); // serialize the lenHisto + memcpy(buf + 8, e->symbolMap->lenHisto, 16); // serialize the lenHisto u32 pos = 24; - // emit only the used bytes of the symbols - for(u32 i = 0; i < e->symbolMap->symbolCount; i++) { + // emit only the used bytes of the symbols + for (u32 i = 0; i < e->symbolMap->symbolCount; i++) { buf[pos++] = e->symbolMap->symbols[i].length(); - for(u32 j = 0; j < e->symbolMap->symbols[i].length(); j++) { - buf[pos++] = ((u8*) &e->symbolMap->symbols[i].symbol)[j]; // serialize used symbol bytes + for (u32 j = 0; j < e->symbolMap->symbols[i].length(); j++) { + buf[pos++] = ((u8 *) &e->symbolMap->symbols[i].symbol)[j]; // serialize used symbol bytes } } return pos; // length of what was serialized @@ -355,57 +367,66 @@ extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) { // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) memcpy(&version, buf, 8); - if ((version>>32) != FSST_VERSION) return 0; - memcpy(lenHisto, buf+8, 16); + if ((version >> 32) != FSST_VERSION) return 0; + memcpy(lenHisto, buf + 8, 16); - for(u32 i=0; i<8; i++) - symbolCount += lenHisto[i]; + for (u32 i = 0; i < 8; i++) + symbolCount += lenHisto[i]; - for(u32 i = 0; i < symbolCount; i++) { + for (u32 i = 0; i < symbolCount; i++) { u32 len = decoder->len[i] = buf[pos++]; - for(u32 j = 0; j < len; j++) { - ((u8*) &decoder->symbol[i])[j] = buf[pos++]; + for (u32 j = 0; j < len; j++) { + ((u8 *) &decoder->symbol[i])[j] = buf[pos++]; } } // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). 
@@ -355,57 +367,66 @@ extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 *buf) {
    // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
    memcpy(&version, buf, 8);
-   if ((version>>32) != FSST_VERSION) return 0;
-   memcpy(lenHisto, buf+8, 16);
+   if ((version >> 32) != FSST_VERSION) return 0;
+   memcpy(lenHisto, buf + 8, 16);
 
-   for(u32 i=0; i<8; i++)
-      symbolCount += lenHisto[i];
+   for (u32 i = 0; i < 8; i++)
+      symbolCount += lenHisto[i];
 
-   for(u32 i = 0; i < symbolCount; i++) {
+   for (u32 i = 0; i < symbolCount; i++) {
       u32 len = decoder->len[i] = buf[pos++];
-      for(u32 j = 0; j < len; j++) {
-         ((u8*) &decoder->symbol[i])[j] = buf[pos++];
+      for (u32 j = 0; j < len; j++) {
+         ((u8 *) &decoder->symbol[i])[j] = buf[pos++];
       }
    }
    // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
-   while(symbolCount<4096) {
-      decoder->symbol[symbolCount] = FSST_CORRUPT;
-      decoder->len[symbolCount++] = 8;
+   while (symbolCount < 4096) {
+      decoder->symbol[symbolCount] = FSST_CORRUPT;
+      decoder->len[symbolCount++] = 8;
    }
    return pos;
 }
 
 // runtime check for simd
-inline ulong _compressImpl(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
+inline ulong _compressImpl(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+                           ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
    (void) noSuffixOpt;
    (void) avoidBranch;
    (void) simd;
    return compressBulk(*e->symbolMap, nlines, lenIn, strIn, size, output, lenOut, strOut);
 }
-ulong compressImpl(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
+
+ulong compressImpl(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+                   ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
    return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
 }
 
 // adaptive choosing of scalar compression method based on symbol length histogram
-inline ulong _compressAuto(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], int simd) {
+inline ulong _compressAuto(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+                           ulong *lenOut, u8 *strOut[], int simd) {
    (void) simd;
    return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, false, false, false);
 }
-ulong compressAuto(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[], int simd) {
+
+ulong compressAuto(Encoder *e, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+                   ulong *lenOut, u8 *strOut[], int simd) {
    return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
 }
+}
 
 // the main compression function (everything automatic)
-extern "C" ulong fsst_compress(fsst_encoder_t *encoder, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[]) {
+extern "C" ulong
+fsst_compress(fsst_encoder_t *encoder, ulong nlines, const ulong lenIn[], const u8 *strIn[], ulong size, u8 *output,
+              ulong *lenOut, u8 *strOut[]) {
    // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB)
-   ulong totLen = accumulate(lenIn, lenIn+nlines, 0);
-   int simd = totLen > nlines*12 && (nlines > 64 || totLen > (ulong) 1<<15);
-   return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd);
+   ulong totLen = accumulate(lenIn, lenIn + nlines, 0);
+   int simd = totLen > nlines * 12 && (nlines > 64 || totLen > (ulong) 1 << 15);
+   return _compressAuto((libfsst::Encoder *) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3 * simd);
 }
 
 /* deallocate encoder */
-extern "C" void fsst_destroy(fsst_encoder_t* encoder) {
-   Encoder *e = (Encoder*) encoder;
+extern "C" void fsst_destroy(fsst_encoder_t *encoder) {
   libfsst::Encoder *e = (libfsst::Encoder *) encoder;
    delete e;
 }
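For completeness, the scatter/gather calling convention of fsst_compress(): parallel input arrays, one output arena, and parallel output arrays that the call fills in. A hedged usage sketch (buffer sizing simplified; real callers must check the return value, which says how many lines actually fit):

   #include <vector>
   #include "fsst12.h" // assumed to declare fsst_encoder_t and fsst_compress

   unsigned long compress_two(fsst_encoder_t *enc) {
       const unsigned char *strIn[2] = {(const unsigned char *) "tumcwitumvldb",
                                        (const unsigned char *) "tumcwitumvldb"};
       unsigned long lenIn[2] = {13, 13};
       std::vector<unsigned char> out(1 << 16); // generous output arena
       unsigned long lenOut[2];                 // filled in: compressed length per line
       unsigned char *strOut[2];                // filled in: start of each compressed line in 'out'
       return fsst_compress(enc, 2, lenIn, strIn, out.size(), out.data(), lenOut, strOut);
   }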
@@ -415,6 +436,9 @@ extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) {
    u32 cnt1 = fsst_export(encoder, buf);
    fsst_decoder_t decoder;
    u32 cnt2 = fsst_import(&decoder, buf);
-   assert(cnt1 == cnt2); (void) cnt1; (void) cnt2;
+   assert(cnt1 == cnt2);
+   (void) cnt1;
+   (void) cnt2;
    return decoder;
 }
+
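A fsst_decoder_t is then just two flat arrays (len[] and symbol[]), so expanding a 12-bit code needs no hashing at all. A sketch of the core expand step under that layout (sketch only: it assumes the caller already split the 3-byte stream into codes and that the output buffer has 8 spare bytes, as the library's own fast paths do):

   #include <cstdint>
   #include <cstring>
   #include "fsst12.h" // assumed to define fsst_decoder_t with len[] and symbol[]

   // Expand one 12-bit code into the output buffer.
   inline unsigned char *expand(const fsst_decoder_t &d, uint32_t code, unsigned char *out) {
       memcpy(out, &d.symbol[code], 8); // unconditional 8-byte store
       return out + d.len[code];        // but only len[code] bytes are actually claimed
   }

Unused codes decode to the "corrupt" filler installed by fsst_import(), which is what makes damaged code sequences detectable.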
diff --git a/libfsst12.hpp b/libfsst12.hpp
index 6a88941..0ac5358 100644
--- a/libfsst12.hpp
+++ b/libfsst12.hpp
@@ -36,7 +36,7 @@ using namespace std;
 #include "fsst12.h" // the official FSST API -- also usable by C mortals
 
 /* workhorse type for string and buffer lengths: 64-bits on 64-bits platforms and 32-bits on 32-bits platforms */
-typedef unsigned long ulong;
+typedef unsigned long ulong;
 
 /* unsigned integers */
 typedef uint8_t u8;
@@ -51,12 +51,14 @@ typedef uint64_t u64;
 // "symbols" are character sequences (up to 8 bytes)
 // A symbol is compressed into a "code" of 1.5 bytes (12 bits)
 #define FSST_CODE_MAX 4096
-#define FSST_CODE_MASK ((u16) (FSST_CODE_MAX-1))
+#define FSST_CODE_MASK ((u16) (FSST_CODE_MAX-1))
 
-inline uint64_t fsst_unaligned_load(u8 const* V) {
-   uint64_t Ret;
-   memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible)
-   return Ret;
+namespace libfsst {
+
+inline uint64_t fsst_unaligned_load(u8 const *V) {
+   uint64_t Ret;
+   memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible)
+   return Ret;
 }
 
 struct Symbol {
@@ -67,39 +69,54 @@ struct Symbol {
    mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of Symbols
 
    // the byte sequence that this symbol stands for
-   u8 symbol[maxLength];
+   u8 symbol[maxLength];
 
    Symbol() : gcl(0) {}
 
-   explicit Symbol(u8 c, u16 code) : gcl((1<<28)|(code<<16)|7) { *(u64*) symbol = c; } // single-char symbol
-   explicit Symbol(const char* input, u32 len) {
+   explicit Symbol(u8 c, u16 code) : gcl((1 << 28) | (code << 16) | 7) { *(u64 *) symbol = c; } // single-char symbol
+   explicit Symbol(const char *input, u32 len) {
       if (len < 8) {
-         *(u64*) symbol = 0;
-         for(u32 i=0; i<len; i++) symbol[i] = input[i];
+         *(u64 *) symbol = 0;
+         for (u32 i = 0; i < len; i++) symbol[i] = input[i];
       } else {
-         len = 8;
-         *(u64*) symbol = *(u64*) input;
+         len = 8;
+         *(u64 *) symbol = *(u64 *) input;
       }
       set_code_len(FSST_CODE_MASK, len);
    }
 
-   void set_code_len(u32 code, u32 len) { gcl = (len<<28)|(code<<16)|((8-len)*8); }
+   void set_code_len(u32 code, u32 len) { gcl = (len << 28) | (code << 16) | ((8 - len) * 8); }
 
-   u32 length() const { return gcl >> 28; }
+   u32 length() const { return gcl >> 28; }
+
    u16 code() const { return (gcl >> 16) & FSST_CODE_MASK; }
+
    u8 garbageBits() const { return gcl; }
 
-   u8 first() const { return 0xFF & *(u64*) symbol; }
-   u16 first2() const { assert(length() > 1); return (0xFFFF & *(u64*) symbol); }
+   u8 first() const { return 0xFF & *(u64 *) symbol; }
+
+   u16 first2() const {
+      assert(length() > 1);
+      return (0xFFFF & *(u64 *) symbol);
+   }
 
 #define FSST_HASH_LOG2SIZE 14
-#define FSST_HASH_SHIFT 15
+#define FSST_HASH_SHIFT 15
 #define FSST_HASH_PRIME1 2971215073LL
 #define FSST_HASH(w) (((w)*FSST_HASH_PRIME1)^(((w)*FSST_HASH_PRIME1)>>13))
-   ulong hash() const { uint v0 = 0xFFFFFFFF & *(ulong*) symbol; return FSST_HASH(v0); }
-   bool operator==(const Symbol& other) const { return *(u64*) symbol == *(u64*) other.symbol && length() == other.length(); }
+
+   ulong hash() const {
+      uint v0 = 0xFFFFFFFF & *(ulong *) symbol;
+      return FSST_HASH(v0);
+   }
+
+   bool operator==(const Symbol &other) const {
+      return *(u64 *) symbol == *(u64 *) other.symbol && length() == other.length();
+   }
 };
 
 // during search for the best dictionary, we probe both (in this order, first wins):
@@ -117,13 +134,13 @@ struct Symbol {
 // the gain field is only used in the symbol queue that sorts symbols on gain
 struct SymbolMap {
-   static const u32 hashTabSize = 1<<FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
+   static const u32 hashTabSize = 1 << FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
 
    u16 shortCodes[65536];       // contains code for 2-byte symbol, otherwise code for pseudo byte
    u16 symbolCount;             // amount of symbols in the map (max 4096)
    Symbol symbols[4096];        // symbols[code] is the symbol for a given code
    Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes
    u16 lenHisto[8];             // lenHisto[x] is the amount of symbols of byte-length (x+1)
 
    bool hashInsert(Symbol s) {
       ulong idx = s.hash() & (hashTabSize - 1);
       bool taken = (hashTab[idx].gcl < FSST_GCL_FREE);
       if (taken) return false; // collision in hash table
       hashTab[idx].gcl = s.gcl;
-      *(u64*) hashTab[idx].symbol = (*(u64*) s.symbol) & (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
+      *(u64 *) hashTab[idx].symbol = (*(u64 *) s.symbol) & (0xFFFFFFFFFFFFFFFF >> (u8) s.gcl);
       return true;
    }
+
    bool add(Symbol s) {
       assert(symbolCount < 4096);
       u32 len = s.length();
@@ -185,24 +203,26 @@ struct SymbolMap {
          return false;
       }
       symbols[symbolCount++] = s;
-      lenHisto[len-1]++;
+      lenHisto[len - 1]++;
       return true;
    }
+
    /// Find symbol in hash table, return code
    u16 hashFind(Symbol s) const {
-      ulong idx = s.hash() & (hashTabSize-1);
-      if (hashTab[idx].gcl <= s.gcl &&
-         *(u64*) hashTab[idx].symbol == (*(u64*) s.symbol & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].gcl))))
-         return (hashTab[idx].gcl>>16); // matched a long symbol
+      ulong idx = s.hash() & (hashTabSize - 1);
+      if (hashTab[idx].gcl <= s.gcl &&
+          *(u64 *) hashTab[idx].symbol == (*(u64 *) s.symbol & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].gcl))))
+         return (hashTab[idx].gcl >> 16); // matched a long symbol
      return 0;
    }
+
    /// Find longest expansion, return code
    u16 findExpansion(Symbol s) const {
-      if (s.length() == 1) {
-         return 4096 + s.first();
+      if (s.length() == 1) {
+         return 4096 + s.first();
      }
      u16 ret = hashFind(s);
-      return ret?ret:shortCodes[s.first2()];
+      return ret ? ret : shortCodes[s.first2()];
    }
 };
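Note the convention findExpansion() relies on: the upper 4 bits of the returned value carry how many input bytes the match consumes, and a single-byte fallback comes back as pseudo code 4096 + byte, so callers can do code >> 12 and code & FSST_CODE_MASK uniformly. In sketch form (values hypothetical, FSST_CODE_MASK written as 4095):

   #include <cassert>
   #include <cstdint>

   int main() {
       // single-byte fallback: pseudo code 4096 + byte value
       uint16_t code = 4096 + 'x';      // as returned for a 1-byte match
       assert((code >> 12) == 1);       // caller advances cur by 1
       assert((code & 4095) == 'x');    // and emits the pseudo code for 'x'

       // a hypothetical 5-byte symbol with code 71 would come back as (5 << 12) | 71
       code = (5 << 12) | 71;
       assert((code >> 12) == 5 && (code & 4095) == 71);
       return 0;
   }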
@@ -235,6 +255,7 @@ struct Counters {
    }
 };
 #else
+
 // we keep two counters count1[pos] and count2[pos1][pos2] of resp 16 and 12-bits. Both are split into two columns for performance reasons
 // first reason is to make the column we update the most during symbolTable construction (the low bits) thinner, thus reducing CPU cache pressure.
 // second reason is that when scanning the array, after seeing a 64-bits 0 in the high bits column, we can quickly skip over many codes (15 or 7)
@@ -242,28 +263,33 @@ struct Counters {
 // high arrays come before low arrays, because our GetNext() methods may overrun their 64-bits reads a few bytes
    u8 count1High[FSST_CODE_MAX]; // array to count frequency of symbols as they occur in the sample (16-bits)
    u8 count1Low[FSST_CODE_MAX];  // it is split into a low and a high byte: cnt = count1High*256 + count1Low
-   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
+   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX /
+                                2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
    u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX]; // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2)
    // 385KB -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB)
-
-   void count1Set(u32 pos1, u16 val) {
-      count1Low[pos1] = val&255;
-      count1High[pos1] = val>>8;
+
+   void count1Set(u32 pos1, u16 val) {
+      count1Low[pos1] = val & 255;
+      count1High[pos1] = val >> 8;
    }
-   void count1Inc(u32 pos1) {
+
+   void count1Inc(u32 pos1) {
      if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
        count1High[pos1]++;  //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)...
    }
-   void count2Inc(u32 pos1, u32 pos2) {
-      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
-         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively
-         count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample)
+
+   void count2Inc(u32 pos1, u32 pos2) {
+      if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
+         // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, respectively
+         count2High[pos1][(pos2) >> 1] +=
+            1 << (((pos2) & 1) << 2); // we take our chances with overflow.. (4K maxval, on an 8K sample)
    }
+
    u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range
       // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
-      u64 high = *(u64*) &count1High[pos1]; // note: this reads 8 subsequent counters [pos1..pos1+7]
+      u64 high = *(u64 *) &count1High[pos1]; // note: this reads 8 subsequent counters [pos1..pos1+7]
 
-      u32 zero = high?(__builtin_ctzl(high)>>3):7; // number of zero bytes
+      u32 zero = high ? (__builtin_ctzl(high) >> 3) : 7; // number of zero bytes
       high = (high >> (zero << 3)) & 255; // advance to nonzero counter
       if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos1
          return 0; // all zero
@@ -272,12 +298,13 @@ struct Counters {
       if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
       return (high << 8) + low;
    }
+
    u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range
       // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros
-      u64 high = *(u64*) &count2High[pos1][pos2>>1]; // note: this reads 16 subsequent counters [pos2..pos2+15]
-      high >>= (pos2&1) << 2; // odd pos2: ignore the lowest 4 bits & we see only 15 counters
+      u64 high = *(u64 *) &count2High[pos1][pos2 >> 1]; // note: this reads 16 subsequent counters [pos2..pos2+15]
+      high >>= (pos2 & 1) << 2; // odd pos2: ignore the lowest 4 bits & we see only 15 counters
 
-      u32 zero = high?(__builtin_ctzl(high)>>2):(15-(pos2&1)); // number of zero 4-bits counters
+      u32 zero = high ? (__builtin_ctzl(high) >> 2) : (15 - (pos2 & 1)); // number of zero 4-bits counters
       high = (high >> (zero << 2)) & 15; // advance to nonzero counter
       if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2
          return 0; // all zero
@@ -286,15 +313,18 @@ struct Counters {
       if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
       return (high << 8) + low;
    }
+
    void backup1(u8 *buf) {
       memcpy(buf, count1High, FSST_CODE_MAX);
-      memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
+      memcpy(buf + FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
    }
+
    void restore1(u8 *buf) {
       memcpy(count1High, buf, FSST_CODE_MAX);
-      memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX);
+      memcpy(count1Low, buf + FSST_CODE_MAX, FSST_CODE_MAX);
    }
-};
+};
+
 #endif
 
 // an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression
@@ -306,5 +336,11 @@ struct Encoder {
 };
 
 // C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree)
-ulong compressImpl(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 * output, ulong *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
-ulong compressAuto(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 * output, ulong *lenOut, u8 *strOut[], int simd);
+ulong
+compressImpl(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[],
+             bool noSuffixOpt, bool avoidBranch, int simd);
+
+ulong
+compressAuto(Encoder *encoder, ulong n, ulong lenIn[], u8 *strIn[], ulong size, u8 *output, ulong *lenOut, u8 *strOut[],
+             int simd);
+}