tput improvements (#13)
Improvements in throughput and allocations:

* Eliminated the old `find_longest_symbol` code and rewrote
`compress_count` to just use `compress`, a 5x speedup for the train
benchmark.
* A couple of allocation tricks (sketched below), including replacing
`vec![Symbol::EMPTY; N]` with `vec![0u64; N]` and then transmuting,
saving `N` calls to `Symbol::clone`, and replacing a 2D vector with a 1D
vector (which allows us to use the `vec!` specialization for creating a
vector of all zeros).
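
A rough sketch of the transmute trick from the second bullet, not the actual code from this commit: the helper names and the `Symbol` definition below are illustrative stand-ins, assuming `Symbol` is an 8-byte `#[repr(transparent)]` wrapper over `u64` whose all-zero bit pattern is `Symbol::EMPTY`; the real `fsst::Symbol` may be laid out differently.

```rust
use std::mem;

/// Illustrative stand-in for `fsst::Symbol` (an assumption, not the real type):
/// an 8-byte transparent wrapper over `u64` whose zero bit pattern is EMPTY.
#[repr(transparent)]
#[derive(Clone, Copy, Debug, PartialEq)]
struct Symbol(u64);

impl Symbol {
    const EMPTY: Symbol = Symbol(0);
}

/// Clones `Symbol::EMPTY` n times; no zeroed-allocation fast path applies.
fn alloc_symbols_slow(n: usize) -> Vec<Symbol> {
    vec![Symbol::EMPTY; n]
}

/// `vec![0u64; n]` hits the `vec!` zero specialization, which requests
/// already-zeroed memory from the allocator instead of writing n values.
fn alloc_symbols_fast(n: usize) -> Vec<Symbol> {
    let zeros: Vec<u64> = vec![0u64; n];
    // SAFETY: relies on Symbol being #[repr(transparent)] over u64 (same size,
    // same alignment, and zero is a valid value). Transmuting a Vec is only
    // sound if the two Vec layouts match; `Vec::from_raw_parts` is the more
    // defensive way to express the same conversion.
    unsafe { mem::transmute::<Vec<u64>, Vec<Symbol>>(zeros) }
}

fn main() {
    let n = 1 << 20;
    assert_eq!(alloc_symbols_slow(n), alloc_symbols_fast(n));
    assert_eq!(alloc_symbols_fast(n)[0], Symbol::EMPTY);
}
```

The payoff is that `vec![0u64; n]` can hand back already-zeroed pages, whereas the `vec![Symbol::EMPTY; n]` path writes `n` cloned values.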
a10y authored Aug 20, 2024
1 parent 0941c0b commit 2d8db1a
Showing 10 changed files with 187 additions and 139 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/ci.yml
@@ -2,17 +2,17 @@ name: CI

on:
push:
branches: [ "develop" ]
pull_request: { }
workflow_dispatch: { }
branches: ["develop"]
pull_request: {}
workflow_dispatch: {}

permissions:
actions: read
contents: read

jobs:
build:
name: 'build'
name: "build"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -40,7 +40,6 @@ jobs:
uses: mozilla-actions/[email protected]
- name: Rust Compile Cache Config
shell: bash
# echo "CARGO_LOG=cargo::core::compiler::fingerprint=info" >> $GITHUB_ENV
run: |
echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV
echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV
1 change: 0 additions & 1 deletion .github/workflows/release-plz.yml
@@ -4,7 +4,6 @@ permissions:
pull-requests: write
contents: write

# TODO(aduffy): uncomment when we're ready to publish
on:
push:
branches:
24 changes: 15 additions & 9 deletions benches/compress.rs
@@ -1,12 +1,8 @@
//! Compression benchmark.
//!
//! Contains benchmarks for FSST compression, decompression, and symbol table training.
//!
//! Also contains LZ4 baseline.
//! Benchmarks for FSST compression, decompression, and symbol table training.
#![allow(missing_docs)]
use core::str;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};

use fsst::{Compressor, ESCAPE_CODE};

@@ -34,13 +30,23 @@ fn bench_fsst(c: &mut Criterion) {
let decompressor = compressor.decompressor();
let decompressed = decompressor.decompress(&compressed);
let decompressed = str::from_utf8(&decompressed).unwrap();
println!("DECODED: {}", decompressed);
assert_eq!(decompressed, TEST);

group.throughput(Throughput::Elements(1));
group.bench_function("compress-word", |b| {
let mut out = vec![0u8; 8];
let out_ptr = out.as_mut_ptr();
let front = &TEST.as_bytes()[0..8];
let word = u64::from_le_bytes(front.try_into().unwrap());

b.iter(|| black_box(unsafe { compressor.compress_word(word, out_ptr) }));
});

group.throughput(Throughput::Bytes(CORPUS.len() as u64));
group.bench_function("compress-single", |b| {
b.iter(|| black_box(compressor.compress(black_box(plaintext))));
b.iter(|| black_box(compressor.compress(black_box(CORPUS.as_bytes()))));
});

group.throughput(Throughput::Bytes(decompressed.len() as u64));
group.bench_function("decompress-single", |b| {
b.iter(|| black_box(decompressor.decompress(black_box(&compressed))));
});
2 changes: 1 addition & 1 deletion fuzz/Cargo.lock

Some generated files are not rendered by default.

144 changes: 102 additions & 42 deletions src/builder.rs
@@ -7,23 +7,37 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;

use crate::find_longest::FindLongestSymbol;
use crate::{Compressor, Symbol, MAX_CODE};
use crate::{Compressor, Symbol, ESCAPE_CODE, MAX_CODE};

#[derive(Debug, Clone)]
struct Counter {
/// Frequency count for each code.
counts1: Vec<usize>,

/// Frequency count for each code-pair.
counts2: Vec<Vec<usize>>,
counts2: Vec<usize>,
}

const COUNTS1_SIZE: usize = MAX_CODE as usize;
// NOTE: in Rust, creating a 1D vector of length N^2 is ~4x faster than creating a 2-D vector,
// because `vec!` has a specialization for zero.
const COUNTS2_SIZE: usize = COUNTS1_SIZE * COUNTS1_SIZE;

impl Counter {
fn new() -> Self {
Self {
counts1: vec![0; MAX_CODE as usize],
counts2: vec![vec![0; MAX_CODE as usize]; MAX_CODE as usize],
counts1: vec![0; COUNTS1_SIZE],
counts2: vec![0; COUNTS2_SIZE],
}
}

/// Reset all counts to zero.
pub fn reset(&mut self) {
for idx in 0..COUNTS1_SIZE {
self.counts1[idx] = 0;
}
for idx in 0..COUNTS2_SIZE {
self.counts2[idx] = 0;
}
}

@@ -34,7 +48,8 @@ impl Counter {

#[inline]
fn record_count2(&mut self, code1: u16, code2: u16) {
self.counts2[code1 as usize][code2 as usize] += 1;
let idx = (code1 as usize) * 511 + (code2 as usize);
self.counts2[idx] += 1;
}

#[inline]
@@ -44,14 +59,15 @@

#[inline]
fn count2(&self, code1: u16, code2: u16) -> usize {
self.counts2[code1 as usize][code2 as usize]
let idx = (code1 as usize) * 511 + (code2 as usize);
self.counts2[idx]
}
}

/// The number of generations used for training. This is taken from the [FSST paper].
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub const MAX_GENERATIONS: usize = 5;
const MAX_GENERATIONS: usize = 5;

impl Compressor {
/// Build and train a `Compressor` from a sample corpus of text.
@@ -70,69 +86,92 @@ impl Compressor {
if sample.is_empty() {
return compressor;
}
for _generation in 0..MAX_GENERATIONS {
let counter = compressor.compress_count(sample);
compressor = compressor.optimize(counter);

let mut counter = Counter::new();

for _generation in 0..(MAX_GENERATIONS - 1) {
compressor.compress_count(sample, &mut counter);
compressor = compressor.optimize(&counter, true);
counter.reset();
}

compressor
compressor.compress_count(sample, &mut counter);
compressor.optimize(&counter, true)
}
}

impl Compressor {
/// Compress the text using the current symbol table. Count the code occurrences
/// and code-pair occurrences to allow us to calculate apparent gain.
fn compress_count(&self, sample: &[u8]) -> Counter {
let mut counter = Counter::new();
let len = sample.len();
let mut prev_code = self.find_longest_symbol(sample);
counter.record_count1(prev_code);
let mut pos = self.symbols[prev_code as usize].len();
fn compress_count(&self, sample: &[u8], counter: &mut Counter) {
let compressed = self.compress(sample);
let len = compressed.len();

if len == 0 {
return;
}

fn next_code(pos: usize, compressed: &[u8]) -> (u16, usize) {
if compressed[pos] == ESCAPE_CODE {
(compressed[pos + 1] as u16, 2)
} else {
(256 + compressed[pos] as u16, 1)
}
}

// Get first code, record count
let (code, pos) = next_code(0, &compressed);
counter.record_count1(code);

let mut pos = pos;
let mut prev_code = code;

while pos < len {
let code = self.find_longest_symbol(&sample[pos..len]);
let (code, advance) = next_code(pos, &compressed);
pos += advance;

counter.record_count1(code);
counter.record_count2(prev_code, code);
pos += self.symbols[code as usize].len();

prev_code = code;
}

counter
}

/// Using a set of counters and the existing set of symbols, build a new
/// set of symbols/codes that optimizes the gain over the distribution in `counter`.
fn optimize(&self, counters: Counter) -> Self {
fn optimize(&self, counters: &Counter, include_ascii: bool) -> Self {
let mut res = Compressor::default();
let mut pqueue = BinaryHeap::new();
let mut pqueue = BinaryHeap::with_capacity(65_536);
for code1 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol1 = self.symbols[code1 as usize];
let gain = counters.count1(code1) * symbol1.len();
pqueue.push(Candidate {
symbol: symbol1,
gain,
});
let mut gain = counters.count1(code1) * symbol1.len();
// NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols.
// This helps to reduce exception counts.
if code1 < 256 {
gain *= 8;
}
if gain > 0 {
pqueue.push(Candidate {
symbol: symbol1,
gain,
});
}

for code2 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol2 = &self.symbols[code2 as usize];
// If either symbol is zero-length, or if merging would yield a symbol of
// length greater than 8, skip.
if symbol1.len() + symbol2.len() >= 8 {
if symbol1.len() + symbol2.len() > 8 {
continue;
}
let new_symbol = symbol1.concat(symbol2);
// assert the symbol is not empty
assert!(
!new_symbol.is_empty(),
"symbol made by merging {:?} and {:?} is empty",
symbol1,
symbol2,
);
let gain = counters.count2(code1, code2);
pqueue.push(Candidate {
symbol: new_symbol,
gain,
})
let gain = counters.count2(code1, code2) * new_symbol.len();
if gain > 0 {
pqueue.push(Candidate {
symbol: new_symbol,
gain,
})
}
}
}

@@ -145,13 +184,33 @@ impl Compressor {
}
}

// If there are leftover slots, fill them with ASCII chars.
// This helps reduce the number of escapes.
//
// Note that because of the lossy hash table, we won't accidentally
// save the same ASCII character twice into the table.
if include_ascii {
for character in
" abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes()
{
if n_symbols == 255 {
break;
}

if res.insert(Symbol::from_u8(character)) {
n_symbols += 1
}
}
}

res
}
}

/// A candidate for inclusion in a symbol table.
///
/// This is really only useful for the `optimize` step of training.
#[derive(Copy, Clone, Debug)]
struct Candidate {
gain: usize,
symbol: Symbol,
@@ -188,6 +247,7 @@ impl Ord for Candidate {

#[cfg(test)]
mod test {

use crate::{Compressor, ESCAPE_CODE};

#[test]
5 changes: 0 additions & 5 deletions src/find_longest/mod.rs

This file was deleted.

28 changes: 0 additions & 28 deletions src/find_longest/naive.rs

This file was deleted.

(Diffs for the remaining 3 changed files are not shown.)
