tput improvements (#13)
Improvements in throughput and allocations:

* Eliminated the old `find_longest_symbol` code and rewrote
`compress_count` to just use `compress`, a 5x speedup for the train
benchmark.
* A couple of allocation tricks (sketched below), including replacing
`vec![Symbol::EMPTY; N]` with `vec![0u64; N]` and then transmuting,
saving `N` calls to `Symbol::clone`, and replacing a 2D vector with a 1D
vector (which allows us to use the `vec!` specialization for creating a
vector of all zeros).
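
A rough sketch of the transmute trick from the second bullet, not the actual code from this commit: the helper names and the `Symbol` definition below are illustrative stand-ins, assuming `Symbol` is an 8-byte `#[repr(transparent)]` wrapper over `u64` whose all-zero bit pattern is `Symbol::EMPTY`; the real `fsst::Symbol` may be laid out differently.

```rust
use std::mem;

/// Illustrative stand-in for `fsst::Symbol` (an assumption, not the real type):
/// an 8-byte transparent wrapper over `u64` whose zero bit pattern is EMPTY.
#[repr(transparent)]
#[derive(Clone, Copy, Debug, PartialEq)]
struct Symbol(u64);

impl Symbol {
    const EMPTY: Symbol = Symbol(0);
}

/// Clones `Symbol::EMPTY` n times; no zeroed-allocation fast path applies.
fn alloc_symbols_slow(n: usize) -> Vec<Symbol> {
    vec![Symbol::EMPTY; n]
}

/// `vec![0u64; n]` hits the `vec!` zero specialization, which requests
/// already-zeroed memory from the allocator instead of writing n values.
fn alloc_symbols_fast(n: usize) -> Vec<Symbol> {
    let zeros: Vec<u64> = vec![0u64; n];
    // SAFETY: relies on Symbol being #[repr(transparent)] over u64 (same size,
    // same alignment, and zero is a valid value). Transmuting a Vec is only
    // sound if the two Vec layouts match; `Vec::from_raw_parts` is the more
    // defensive way to express the same conversion.
    unsafe { mem::transmute::<Vec<u64>, Vec<Symbol>>(zeros) }
}

fn main() {
    let n = 1 << 20;
    assert_eq!(alloc_symbols_slow(n), alloc_symbols_fast(n));
    assert_eq!(alloc_symbols_fast(n)[0], Symbol::EMPTY);
}
```

The payoff is that `vec![0u64; n]` can hand back already-zeroed pages, whereas the `vec![Symbol::EMPTY; n]` path writes `n` cloned values.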
a10y authored Aug 20, 2024
1 parent 0941c0b commit 2d8db1a
Showing 10 changed files with 187 additions and 139 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/ci.yml
@@ -2,17 +2,17 @@ name: CI

on:
push:
branches: [ "develop" ]
pull_request: { }
workflow_dispatch: { }
branches: ["develop"]
pull_request: {}
workflow_dispatch: {}

permissions:
actions: read
contents: read

jobs:
build:
name: 'build'
name: "build"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -40,7 +40,6 @@ jobs:
uses: mozilla-actions/[email protected]
- name: Rust Compile Cache Config
shell: bash
# echo "CARGO_LOG=cargo::core::compiler::fingerprint=info" >> $GITHUB_ENV
run: |
echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV
echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV
1 change: 0 additions & 1 deletion .github/workflows/release-plz.yml
@@ -4,7 +4,6 @@ permissions:
pull-requests: write
contents: write

# TODO(aduffy): uncomment when we're ready to publish
on:
push:
branches:
24 changes: 15 additions & 9 deletions benches/compress.rs
@@ -1,12 +1,8 @@
//! Compression benchmark.
//!
//! Contains benchmarks for FSST compression, decompression, and symbol table training.
//!
//! Also contains LZ4 baseline.
//! Benchmarks for FSST compression, decompression, and symbol table training.
#![allow(missing_docs)]
use core::str;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};

use fsst::{Compressor, ESCAPE_CODE};

@@ -34,13 +30,23 @@ fn bench_fsst(c: &mut Criterion) {
let decompressor = compressor.decompressor();
let decompressed = decompressor.decompress(&compressed);
let decompressed = str::from_utf8(&decompressed).unwrap();
println!("DECODED: {}", decompressed);
assert_eq!(decompressed, TEST);

group.throughput(Throughput::Elements(1));
group.bench_function("compress-word", |b| {
let mut out = vec![0u8; 8];
let out_ptr = out.as_mut_ptr();
let front = &TEST.as_bytes()[0..8];
let word = u64::from_le_bytes(front.try_into().unwrap());

b.iter(|| black_box(unsafe { compressor.compress_word(word, out_ptr) }));
});

group.throughput(Throughput::Bytes(CORPUS.len() as u64));
group.bench_function("compress-single", |b| {
b.iter(|| black_box(compressor.compress(black_box(plaintext))));
b.iter(|| black_box(compressor.compress(black_box(CORPUS.as_bytes()))));
});

group.throughput(Throughput::Bytes(decompressed.len() as u64));
group.bench_function("decompress-single", |b| {
b.iter(|| black_box(decompressor.decompress(black_box(&compressed))));
});
2 changes: 1 addition & 1 deletion fuzz/Cargo.lock

Some generated files are not rendered by default.

144 changes: 102 additions & 42 deletions src/builder.rs
@@ -7,23 +7,37 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;

use crate::find_longest::FindLongestSymbol;
use crate::{Compressor, Symbol, MAX_CODE};
use crate::{Compressor, Symbol, ESCAPE_CODE, MAX_CODE};

#[derive(Debug, Clone)]
struct Counter {
/// Frequency count for each code.
counts1: Vec<usize>,

/// Frequency count for each code-pair.
counts2: Vec<Vec<usize>>,
counts2: Vec<usize>,
}

const COUNTS1_SIZE: usize = MAX_CODE as usize;
// NOTE: in Rust, creating a 1D vector of length N^2 is ~4x faster than creating a 2-D vector,
// because `vec!` has a specialization for zero.
const COUNTS2_SIZE: usize = COUNTS1_SIZE * COUNTS1_SIZE;

impl Counter {
fn new() -> Self {
Self {
counts1: vec![0; MAX_CODE as usize],
counts2: vec![vec![0; MAX_CODE as usize]; MAX_CODE as usize],
counts1: vec![0; COUNTS1_SIZE],
counts2: vec![0; COUNTS2_SIZE],
}
}

/// Reset all counts to zero.
pub fn reset(&mut self) {
for idx in 0..COUNTS1_SIZE {
self.counts1[idx] = 0;
}
for idx in 0..COUNTS2_SIZE {
self.counts2[idx] = 0;
}
}

@@ -34,7 +48,8 @@ impl Counter {

#[inline]
fn record_count2(&mut self, code1: u16, code2: u16) {
self.counts2[code1 as usize][code2 as usize] += 1;
let idx = (code1 as usize) * 511 + (code2 as usize);
self.counts2[idx] += 1;
}

#[inline]
@@ -44,14 +59,15 @@

#[inline]
fn count2(&self, code1: u16, code2: u16) -> usize {
self.counts2[code1 as usize][code2 as usize]
let idx = (code1 as usize) * 511 + (code2 as usize);
self.counts2[idx]
}
}

/// The number of generations used for training. This is taken from the [FSST paper].
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub const MAX_GENERATIONS: usize = 5;
const MAX_GENERATIONS: usize = 5;

impl Compressor {
/// Build and train a `Compressor` from a sample corpus of text.
@@ -70,69 +86,92 @@ impl Compressor {
if sample.is_empty() {
return compressor;
}
for _generation in 0..MAX_GENERATIONS {
let counter = compressor.compress_count(sample);
compressor = compressor.optimize(counter);

let mut counter = Counter::new();

for _generation in 0..(MAX_GENERATIONS - 1) {
compressor.compress_count(sample, &mut counter);
compressor = compressor.optimize(&counter, true);
counter.reset();
}

compressor
compressor.compress_count(sample, &mut counter);
compressor.optimize(&counter, true)
}
}

impl Compressor {
/// Compress the text using the current symbol table. Count the code occurrences
/// and code-pair occurrences to allow us to calculate apparent gain.
fn compress_count(&self, sample: &[u8]) -> Counter {
let mut counter = Counter::new();
let len = sample.len();
let mut prev_code = self.find_longest_symbol(sample);
counter.record_count1(prev_code);
let mut pos = self.symbols[prev_code as usize].len();
fn compress_count(&self, sample: &[u8], counter: &mut Counter) {
let compressed = self.compress(sample);
let len = compressed.len();

if len == 0 {
return;
}

fn next_code(pos: usize, compressed: &[u8]) -> (u16, usize) {
if compressed[pos] == ESCAPE_CODE {
(compressed[pos + 1] as u16, 2)
} else {
(256 + compressed[pos] as u16, 1)
}
}

// Get first code, record count
let (code, pos) = next_code(0, &compressed);
counter.record_count1(code);

let mut pos = pos;
let mut prev_code = code;

while pos < len {
let code = self.find_longest_symbol(&sample[pos..len]);
let (code, advance) = next_code(pos, &compressed);
pos += advance;

counter.record_count1(code);
counter.record_count2(prev_code, code);
pos += self.symbols[code as usize].len();

prev_code = code;
}

counter
}

/// Using a set of counters and the existing set of symbols, build a new
/// set of symbols/codes that optimizes the gain over the distribution in `counter`.
fn optimize(&self, counters: Counter) -> Self {
fn optimize(&self, counters: &Counter, include_ascii: bool) -> Self {
let mut res = Compressor::default();
let mut pqueue = BinaryHeap::new();
let mut pqueue = BinaryHeap::with_capacity(65_536);
for code1 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol1 = self.symbols[code1 as usize];
let gain = counters.count1(code1) * symbol1.len();
pqueue.push(Candidate {
symbol: symbol1,
gain,
});
let mut gain = counters.count1(code1) * symbol1.len();
// NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols.
// This helps to reduce exception counts.
if code1 < 256 {
gain *= 8;
}
if gain > 0 {
pqueue.push(Candidate {
symbol: symbol1,
gain,
});
}

for code2 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol2 = &self.symbols[code2 as usize];
// If either symbol is zero-length, or if merging would yield a symbol of
// length greater than 8, skip.
if symbol1.len() + symbol2.len() >= 8 {
if symbol1.len() + symbol2.len() > 8 {
continue;
}
let new_symbol = symbol1.concat(symbol2);
// assert the symbol is not empty
assert!(
!new_symbol.is_empty(),
"symbol made by merging {:?} and {:?} is empty",
symbol1,
symbol2,
);
let gain = counters.count2(code1, code2);
pqueue.push(Candidate {
symbol: new_symbol,
gain,
})
let gain = counters.count2(code1, code2) * new_symbol.len();
if gain > 0 {
pqueue.push(Candidate {
symbol: new_symbol,
gain,
})
}
}
}

@@ -145,13 +184,33 @@ impl Compressor {
}
}

// If there are leftover slots, fill them with ASCII chars.
// This helps reduce the number of escapes.
//
// Note that because of the lossy hash table, we won't accidentally
// save the same ASCII character twice into the table.
if include_ascii {
for character in
" abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes()
{
if n_symbols == 255 {
break;
}

if res.insert(Symbol::from_u8(character)) {
n_symbols += 1
}
}
}

res
}
}

/// A candidate for inclusion in a symbol table.
///
/// This is really only useful for the `optimize` step of training.
#[derive(Copy, Clone, Debug)]
struct Candidate {
gain: usize,
symbol: Symbol,
@@ -188,6 +247,7 @@ impl Ord for Candidate {

#[cfg(test)]
mod test {

use crate::{Compressor, ESCAPE_CODE};

#[test]
5 changes: 0 additions & 5 deletions src/find_longest/mod.rs

This file was deleted.

28 changes: 0 additions & 28 deletions src/find_longest/naive.rs

This file was deleted.

(Diffs for the remaining 3 changed files are not shown.)
