diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4543885..1e8ad36 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [ "develop" ] - pull_request: { } - workflow_dispatch: { } + branches: ["develop"] + pull_request: {} + workflow_dispatch: {} permissions: actions: read @@ -12,7 +12,7 @@ permissions: jobs: build: - name: 'build' + name: "build" runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -40,7 +40,6 @@ jobs: uses: mozilla-actions/sccache-action@v0.0.5 - name: Rust Compile Cache Config shell: bash - # echo "CARGO_LOG=cargo::core::compiler::fingerprint=info" >> $GITHUB_ENV run: | echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml index e767d0d..5d30bcf 100644 --- a/.github/workflows/release-plz.yml +++ b/.github/workflows/release-plz.yml @@ -4,7 +4,6 @@ permissions: pull-requests: write contents: write -# TODO(aduffy): uncomment when we're ready to publish on: push: branches: diff --git a/benches/compress.rs b/benches/compress.rs index 97f8c76..c9ff5af 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -1,12 +1,8 @@ -//! Compression benchmark. -//! -//! Contains benchmarks for FSST compression, decompression, and symbol table training. -//! -//! Also contains LZ4 baseline. +//! Benchmarks for FSST compression, decompression, and symbol table training. #![allow(missing_docs)] use core::str; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; use fsst::{Compressor, ESCAPE_CODE}; @@ -34,13 +30,23 @@ fn bench_fsst(c: &mut Criterion) { let decompressor = compressor.decompressor(); let decompressed = decompressor.decompress(&compressed); let decompressed = str::from_utf8(&decompressed).unwrap(); - println!("DECODED: {}", decompressed); - assert_eq!(decompressed, TEST); + group.throughput(Throughput::Elements(1)); + group.bench_function("compress-word", |b| { + let mut out = vec![0u8; 8]; + let out_ptr = out.as_mut_ptr(); + let front = &TEST.as_bytes()[0..8]; + let word = u64::from_le_bytes(front.try_into().unwrap()); + + b.iter(|| black_box(unsafe { compressor.compress_word(word, out_ptr) })); + }); + + group.throughput(Throughput::Bytes(CORPUS.len() as u64)); group.bench_function("compress-single", |b| { - b.iter(|| black_box(compressor.compress(black_box(plaintext)))); + b.iter(|| black_box(compressor.compress(black_box(CORPUS.as_bytes())))); }); + group.throughput(Throughput::Bytes(decompressed.len() as u64)); group.bench_function("decompress-single", |b| { b.iter(|| black_box(decompressor.decompress(black_box(&compressed)))); }); diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 354130a..8c0cc9f 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -21,7 +21,7 @@ dependencies = [ [[package]] name = "fsst-rs" -version = "0.0.1" +version = "0.1.0" [[package]] name = "fsst-rs-fuzz" diff --git a/src/builder.rs b/src/builder.rs index 84ed370..c3272ae 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -7,8 +7,7 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; -use crate::find_longest::FindLongestSymbol; -use crate::{Compressor, Symbol, MAX_CODE}; +use crate::{Compressor, Symbol, ESCAPE_CODE, MAX_CODE}; #[derive(Debug, Clone)] struct Counter { @@ -16,14 +15,29 @@ struct Counter { counts1: Vec, /// Frequency count for each code-pair. - counts2: Vec>, + counts2: Vec, } +const COUNTS1_SIZE: usize = MAX_CODE as usize; +// NOTE: in Rust, creating a 1D vector of length N^2 is ~4x faster than creating a 2-D vector, +// because `vec!` has a specialization for zero. +const COUNTS2_SIZE: usize = COUNTS1_SIZE * COUNTS1_SIZE; + impl Counter { fn new() -> Self { Self { - counts1: vec![0; MAX_CODE as usize], - counts2: vec![vec![0; MAX_CODE as usize]; MAX_CODE as usize], + counts1: vec![0; COUNTS1_SIZE], + counts2: vec![0; COUNTS2_SIZE], + } + } + + /// reset + pub fn reset(&mut self) { + for idx in 0..COUNTS1_SIZE { + self.counts1[idx] = 0; + } + for idx in 0..COUNTS2_SIZE { + self.counts2[idx] = 0; } } @@ -34,7 +48,8 @@ impl Counter { #[inline] fn record_count2(&mut self, code1: u16, code2: u16) { - self.counts2[code1 as usize][code2 as usize] += 1; + let idx = (code1 as usize) * 511 + (code2 as usize); + self.counts2[idx] += 1; } #[inline] @@ -44,14 +59,15 @@ impl Counter { #[inline] fn count2(&self, code1: u16, code2: u16) -> usize { - self.counts2[code1 as usize][code2 as usize] + let idx = (code1 as usize) * 511 + (code2 as usize); + self.counts2[idx] } } /// The number of generations used for training. This is taken from the [FSST paper]. /// /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf -pub const MAX_GENERATIONS: usize = 5; +const MAX_GENERATIONS: usize = 5; impl Compressor { /// Build and train a `Compressor` from a sample corpus of text. @@ -70,69 +86,92 @@ impl Compressor { if sample.is_empty() { return compressor; } - for _generation in 0..MAX_GENERATIONS { - let counter = compressor.compress_count(sample); - compressor = compressor.optimize(counter); + + let mut counter = Counter::new(); + + for _generation in 0..(MAX_GENERATIONS - 1) { + compressor.compress_count(sample, &mut counter); + compressor = compressor.optimize(&counter, true); + counter.reset(); } - compressor + compressor.compress_count(sample, &mut counter); + compressor.optimize(&counter, true) } } impl Compressor { /// Compress the text using the current symbol table. Count the code occurrences /// and code-pair occurrences to allow us to calculate apparent gain. - fn compress_count(&self, sample: &[u8]) -> Counter { - let mut counter = Counter::new(); - let len = sample.len(); - let mut prev_code = self.find_longest_symbol(sample); - counter.record_count1(prev_code); - let mut pos = self.symbols[prev_code as usize].len(); + fn compress_count(&self, sample: &[u8], counter: &mut Counter) { + let compressed = self.compress(sample); + let len = compressed.len(); + + if len == 0 { + return; + } + + fn next_code(pos: usize, compressed: &[u8]) -> (u16, usize) { + if compressed[pos] == ESCAPE_CODE { + (compressed[pos + 1] as u16, 2) + } else { + (256 + compressed[pos] as u16, 1) + } + } + + // Get first code, record count + let (code, pos) = next_code(0, &compressed); + counter.record_count1(code); + + let mut pos = pos; + let mut prev_code = code; while pos < len { - let code = self.find_longest_symbol(&sample[pos..len]); + let (code, advance) = next_code(pos, &compressed); + pos += advance; + counter.record_count1(code); counter.record_count2(prev_code, code); - pos += self.symbols[code as usize].len(); + prev_code = code; } - - counter } /// Using a set of counters and the existing set of symbols, build a new /// set of symbols/codes that optimizes the gain over the distribution in `counter`. - fn optimize(&self, counters: Counter) -> Self { + fn optimize(&self, counters: &Counter, include_ascii: bool) -> Self { let mut res = Compressor::default(); - let mut pqueue = BinaryHeap::new(); + let mut pqueue = BinaryHeap::with_capacity(65_536); for code1 in 0u16..(256u16 + self.n_symbols as u16) { let symbol1 = self.symbols[code1 as usize]; - let gain = counters.count1(code1) * symbol1.len(); - pqueue.push(Candidate { - symbol: symbol1, - gain, - }); + let mut gain = counters.count1(code1) * symbol1.len(); + // NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols. + // This helps to reduce exception counts. + if code1 < 256 { + gain *= 8; + } + if gain > 0 { + pqueue.push(Candidate { + symbol: symbol1, + gain, + }); + } for code2 in 0u16..(256u16 + self.n_symbols as u16) { let symbol2 = &self.symbols[code2 as usize]; // If either symbol is zero-length, or if merging would yield a symbol of // length greater than 8, skip. - if symbol1.len() + symbol2.len() >= 8 { + if symbol1.len() + symbol2.len() > 8 { continue; } let new_symbol = symbol1.concat(symbol2); - // as`sert the symbol is not empty - assert!( - !new_symbol.is_empty(), - "symbol made by merging {:?} and {:?} is empty", - symbol1, - symbol2, - ); - let gain = counters.count2(code1, code2); - pqueue.push(Candidate { - symbol: new_symbol, - gain, - }) + let gain = counters.count2(code1, code2) * new_symbol.len(); + if gain > 0 { + pqueue.push(Candidate { + symbol: new_symbol, + gain, + }) + } } } @@ -145,6 +184,25 @@ impl Compressor { } } + // If there are leftover slots, fill them with ASCII chars. + // This helps reduce the number of escapes. + // + // Note that because of the lossy hash table, we won't accidentally + // save the same ASCII character twice into the table. + if include_ascii { + for character in + " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes() + { + if n_symbols == 255 { + break; + } + + if res.insert(Symbol::from_u8(character)) { + n_symbols += 1 + } + } + } + res } } @@ -152,6 +210,7 @@ impl Compressor { /// A candidate for inclusion in a symbol table. /// /// This is really only useful for the `optimize` step of training. +#[derive(Copy, Clone, Debug)] struct Candidate { gain: usize, symbol: Symbol, @@ -188,6 +247,7 @@ impl Ord for Candidate { #[cfg(test)] mod test { + use crate::{Compressor, ESCAPE_CODE}; #[test] diff --git a/src/find_longest/mod.rs b/src/find_longest/mod.rs deleted file mode 100644 index 00eb7b2..0000000 --- a/src/find_longest/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -mod naive; - -pub trait FindLongestSymbol { - fn find_longest_symbol(&self, text: &[u8]) -> u16; -} diff --git a/src/find_longest/naive.rs b/src/find_longest/naive.rs deleted file mode 100644 index e6519c1..0000000 --- a/src/find_longest/naive.rs +++ /dev/null @@ -1,28 +0,0 @@ -use crate::find_longest::FindLongestSymbol; -use crate::Compressor; - -// Find the code that maps to a symbol with longest-match to a piece of text. -// -// This is the naive algorithm that just scans the whole table and is very slow. - -impl FindLongestSymbol for Compressor { - // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles. - #[inline(never)] - fn find_longest_symbol(&self, text: &[u8]) -> u16 { - debug_assert!(!text.is_empty(), "text must not be empty"); - - // Find the code that best maps to the provided text table here. - // Start with the code corresponding to the escape of the first character in the text - let mut best_code = text[0] as u16; - let mut best_overlap = 1; - for code in 256..(256 + self.n_symbols as u16) { - let symbol = &self.symbols[code as usize]; - if symbol.is_prefix(text) && symbol.len() > best_overlap { - best_code = code; - best_overlap = symbol.len(); - } - } - - best_code - } -} diff --git a/src/lib.rs b/src/lib.rs index 86619ed..82ed0d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,11 +10,9 @@ macro_rules! assert_sizeof { use std::fmt::{Debug, Formatter}; -pub use builder::*; use lossy_pht::LossyPHT; mod builder; -mod find_longest; mod lossy_pht; /// `Symbol`s are small (up to 8-byte) segments of strings, stored in a [`Compressor`][`crate::Compressor`] and @@ -43,9 +41,7 @@ impl Symbol { /// Create a new single-byte symbol pub fn from_u8(value: u8) -> Self { - Self { - bytes: [value, 0, 0, 0, 0, 0, 0, 0], - } + Self { num: value as u64 } } } @@ -56,6 +52,7 @@ impl Symbol { /// can contain fewer bytes, padded with 0x00. There is a special case of a symbol /// that holds the byte 0x00. In that case, the symbol contains `0x0000000000000000` /// but we want to interpret that as a one-byte symbol containing `0x00`. + #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { let numeric = unsafe { self.num }; // For little-endian platforms, this counts the number of *trailing* zeros @@ -71,13 +68,6 @@ impl Symbol { } } - /// Returns true if the symbol does not encode any bytes. - /// - /// Note that this should only be true for the zero code. - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - #[inline] fn as_u64(&self) -> u64 { // SAFETY: the bytes can always be viewed as a u64 @@ -120,18 +110,43 @@ impl Symbol { let new_len = self_len + other.len(); assert!(new_len <= 8, "cannot build symbol with length > 8"); - let mut result = *self; + // SAFETY: we assert the combined length <= 8 + unsafe { + Self { + num: (other.num << (8 * self_len)) | self.num, + } + } + } +} - // SAFETY: self_len and new_len are checked to be <= 8 - unsafe { result.bytes[self_len..new_len].copy_from_slice(other.as_slice()) }; +#[cfg(test)] +mod test { + use crate::Symbol; - result + #[test] + fn test_concat() { + let symbola = Symbol::from_u8(b'a'); + let symbolb = Symbol::from_u8(b'b'); + let symbolab = symbola.concat(&symbolb); + assert_eq!(symbolab.as_slice(), b"ab"); } } impl Debug for Symbol { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", unsafe { self.bytes }) + let debug = self + .as_slice() + .iter() + .map(|c| *c as char) + .map(|c| { + if c.is_ascii() { + format!("{c}") + } else { + format!("{c:X?}") + } + }) + .collect::>(); + write!(f, "{:?}", debug) } } @@ -299,7 +314,7 @@ impl<'a> Decompressor<'a> { #[derive(Clone)] pub struct Compressor { /// Table mapping codes to symbols. - pub(crate) symbols: [Symbol; 511], + pub(crate) symbols: Vec, /// The number of entries in the symbol table that have been populated, not counting /// the escape values. @@ -317,8 +332,14 @@ pub struct Compressor { impl Default for Compressor { fn default() -> Self { + // NOTE: `vec!` has a specialization for building a new vector of `0u64`. Because Symbol and u64 + // have the same bit pattern, we can allocate as u64 and transmute. If we do `vec![Symbol::EMPTY; N]`, + // that will create a new Vec and call `Symbol::EMPTY.clone()` `N` times which is considerably slower. + let symbols = vec![0u64; 511]; + // SAFETY: transmute safety assured by the compiler. + let symbols: Vec = unsafe { std::mem::transmute(symbols) }; let mut table = Self { - symbols: [Symbol::ZERO; 511], + symbols, n_symbols: 0, codes_twobyte: vec![CodeMeta::EMPTY; 65_536], lossy_pht: LossyPHT::new(), @@ -379,9 +400,8 @@ impl Compressor { /// # Safety /// /// `out_ptr` must never be NULL or otherwise point to invalid memory. - // NOTE(aduffy): uncomment this line to make the function appear in profiles - #[inline(never)] - pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { + #[inline] + pub unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and // if it isn't, it will be overwritten anyway. // @@ -436,11 +456,11 @@ impl Compressor { // SAFETY: `end` will point just after the end of the `plaintext` slice. let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; - let in_end_sub8 = unsafe { in_end.byte_sub(8) }; + let in_end_sub8 = in_end as usize - 8; // SAFETY: `end` will point just after the end of the `values` allocation. let out_end = unsafe { out_ptr.byte_add(values.capacity()) }; - while in_ptr < in_end_sub8 && out_ptr < out_end { + while (in_ptr as usize) < in_end_sub8 && out_ptr < out_end { // SAFETY: pointer ranges are checked in the loop condition unsafe { // Load a full 8-byte word of data from in_ptr. @@ -521,6 +541,7 @@ impl Compressor { } } +#[inline] fn advance_8byte_word(word: u64, bytes: usize) -> u64 { // shift the word off the right-end, because little endian means the first // char is stored in the LSB. @@ -534,6 +555,7 @@ fn advance_8byte_word(word: u64, bytes: usize) -> u64 { } } +#[inline] fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { let mask = if ignored_bits == 64 { 0 @@ -547,6 +569,7 @@ fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { /// This is a function that will get monomorphized based on the value of `N` to do /// a load of `N` values from the pointer in a minimum number of instructions into /// an output `u64`. +#[inline] unsafe fn extract_u64(ptr: *const u8) -> u64 { match N { 1 => ptr.read() as u64, diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index db4bcf5..460631e 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -60,15 +60,12 @@ pub(crate) struct LossyPHT { impl LossyPHT { /// Construct a new empty lossy perfect hash table pub(crate) fn new() -> Self { - let mut slots = Vec::with_capacity(HASH_TABLE_SIZE); - // Initialize all slots to empty entries - for _ in 0..HASH_TABLE_SIZE { - slots.push(TableEntry { - symbol: Symbol::ZERO, - code: CodeMeta::EMPTY, - ignored_bits: 64, - }); - } + let slots = [TableEntry { + symbol: Symbol::ZERO, + code: CodeMeta::EMPTY, + ignored_bits: 64, + }] + .repeat(HASH_TABLE_SIZE); Self { slots } } @@ -95,11 +92,13 @@ impl LossyPHT { } } + #[inline] pub(crate) fn lookup(&self, word: u64) -> TableEntry { let prefix_3bytes = word & 0xFF_FF_FF; let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); - self.slots[slot] + // SAFETY: the slot is guaranteed to between 0...(HASH_TABLE_SIZE - 1). + unsafe { *self.slots.get_unchecked(slot) } } /// Hash a value to find the bucket it belongs in. diff --git a/tests/correctness.rs b/tests/correctness.rs index d557f06..5a68cb1 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -59,22 +59,17 @@ fn test_zeros() { #[test] fn test_large() { - let mut corpus = String::new(); - // TODO(aduffy): make this larger once table build performance is better. - while corpus.len() < 10 * 1_024 { - corpus.push_str(DECLARATION); - } + let corpus: Vec = DECLARATION.bytes().cycle().take(10_240).collect(); let trained = Compressor::train(&corpus); - let mut massive = String::new(); - while massive.len() < 16 * 1_024 * 1_024 { - massive.push_str(DECLARATION); - } - let compressed = trained.compress(massive.as_bytes()); - assert_eq!( - trained.decompressor().decompress(&compressed), - massive.as_bytes() - ); + let massive: Vec = DECLARATION + .bytes() + .cycle() + .take(16 * 1_024 * 1_024) + .collect(); + + let compressed = trained.compress(&massive); + assert_eq!(trained.decompressor().decompress(&compressed), massive); } #[test]