Skip to content

Commit

Permalink
a lot
Browse files Browse the repository at this point in the history
  • Loading branch information
a10y committed Aug 19, 2024
1 parent bd95a16 commit 4e150d0
Show file tree
Hide file tree
Showing 7 changed files with 226 additions and 101 deletions.
9 changes: 9 additions & 0 deletions benches/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ fn bench_fsst(c: &mut Criterion) {
println!("DECODED: {}", decompressed);
assert_eq!(decompressed, TEST);

let mut out = vec![0u8; 8];
let out_ptr = out.as_mut_ptr();
let chars = &plaintext[0..8];
let word = u64::from_le_bytes(chars.try_into().unwrap());

group.bench_function("compress-word", |b| {
b.iter(|| black_box(unsafe { compressor.compress_word(word, out_ptr) }));
});

group.bench_function("compress-single", |b| {
b.iter(|| black_box(compressor.compress(black_box(plaintext))));
});
Expand Down
49 changes: 40 additions & 9 deletions examples/throughput_fast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
use fsst::Compressor;

const ALPHA: &str = "abcdefghijklmnopqrstuvwxyz";
const DRACULA: &str = include_str!("../benches/dracula.txt");

fn main() {
let bytes = std::env::args()
Expand All @@ -15,29 +15,60 @@ fn main() {
let parsed_bytes = usize::from_str_radix(&bytes, 10).unwrap();

println!("building a simple symbol table");
let compressor = Compressor::train(&ALPHA);
let compressor = Compressor::train(&DRACULA);
for idx in 256..compressor.symbol_table().len() {
let symbol = &compressor.symbol_table()[idx];
println!(
"symbol[{idx}] => '{:?}'",
symbol
.as_slice()
.iter()
.map(|c| *c as char)
.map(|c| if c.is_ascii() {
format!("{c}")
} else {
format!("{c:X?}")
})
.collect::<Vec<String>>()
);
}

println!("building new text array of {parsed_bytes} bytes");
let mut data = vec![0u8; parsed_bytes];
for i in 0..parsed_bytes {
// Return each byte individually as a char style thing.
data[i] = ALPHA.chars().nth(i % ALPHA.len()).unwrap() as u8;
}

let to_compress: Vec<u8> = DRACULA.bytes().cycle().take(parsed_bytes).collect();

println!("beginning compression benchmark...");
let start_time = std::time::Instant::now();
let compressed = compressor.compress(&data);
let compressed = compressor.compress(&to_compress);
let end_time = std::time::Instant::now();

let duration = end_time.duration_since(start_time);

println!("test completed");

let ratio = (parsed_bytes as f64) / (compressed.len() as f64);
let ratio = (to_compress.len() as f64) / (compressed.len() as f64);

println!("compression ratio: {ratio}");
println!("wall time = {duration:?}");

let bytes_per_sec = (parsed_bytes as f64) / duration.as_secs_f64();
println!("tput: {bytes_per_sec} bytes/sec");

// Measure decompression speed.
println!("beginning decompression benchmark...");
let start_time = std::time::Instant::now();
let decompressed = compressor.decompressor().decompress(&compressed);
let end_time = std::time::Instant::now();

let duration = end_time.duration_since(start_time);

println!("test completed");

let ratio = (decompressed.len() as f64) / (compressed.len() as f64);

println!("inflation ratio ratio: {ratio}");
println!("wall time = {duration:?}");

let bytes_per_sec = (compressed.len() as f64) / duration.as_secs_f64();
println!("tput: {bytes_per_sec} bytes/sec");
}
164 changes: 127 additions & 37 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;

use crate::find_longest::FindLongestSymbol;
use crate::{Compressor, Symbol, MAX_CODE};
use crate::{Compressor, Symbol, ESCAPE_CODE, MAX_CODE};

#[derive(Debug, Clone)]
struct Counter {
Expand All @@ -27,6 +26,16 @@ impl Counter {
}
}

/// Zero every entry of `counts1` and `counts2` for codes below `MAX_CODE`,
/// so the same `Counter` can be reused for another counting pass without
/// building a new one.
fn reset(&mut self) {
    let limit = MAX_CODE as usize;

    // Single-code counts.
    self.counts1[..limit].fill(0);

    // Code-pair counts, one row per leading code.
    for row in &mut self.counts2[..limit] {
        row[..limit].fill(0);
    }
}

#[inline]
fn record_count1(&mut self, code1: u16) {
self.counts1[code1 as usize] += 1;
Expand All @@ -51,7 +60,11 @@ impl Counter {
/// The number of generations used for training. This is taken from the [FSST paper].
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub const MAX_GENERATIONS: usize = 5;
#[cfg(not(miri))]
const MAX_GENERATIONS: usize = 5;

#[cfg(miri)]
const MAX_GENERATIONS: usize = 1;

impl Compressor {
/// Build and train a `Compressor` from a sample corpus of text.
Expand All @@ -70,69 +83,121 @@ impl Compressor {
if sample.is_empty() {
return compressor;
}
for _generation in 0..MAX_GENERATIONS {
let counter = compressor.compress_count(sample);
compressor = compressor.optimize(counter);

let mut counter = Counter::new();

for _generation in 0..(MAX_GENERATIONS - 1) {
compressor.compress_count(sample, &mut counter);
compressor = compressor.optimize(&counter, true);
counter.reset();
}

compressor
compressor.compress_count(sample, &mut counter);
compressor.optimize(&counter, true)
}

/// Build and train a `Compressor` from a sample corpus, running a
/// caller-specified number of training generations.
///
/// Each generation compresses the sample with the current symbol table,
/// counts the codes that were emitted, and builds a new table from those
/// counts. Only the final generation asks `optimize` to pad leftover slots
/// with common ASCII characters.
///
/// Returns the default (untrained) `Compressor` when `corpus` is empty or
/// `generations` is zero.
pub fn train_n(corpus: impl AsRef<[u8]>, generations: usize) -> Self {
    let mut compressor = Self::default();
    // TODO(aduffy): handle truncating/sampling if corpus > requires sample size.
    let sample = corpus.as_ref();
    // Zero generations means "do not train". Checking it here also keeps the
    // `generations - 1` below from underflowing `usize`.
    if sample.is_empty() || generations == 0 {
        return compressor;
    }

    let mut counter = Counter::new();

    for _generation in 0..(generations - 1) {
        compressor.compress_count(sample, &mut counter);
        compressor = compressor.optimize(&counter, false);
        // The counts describe codes of the table that was just replaced;
        // clear them so the next pass does not mix in stale statistics.
        counter.reset();
    }

    compressor.compress_count(sample, &mut counter);
    compressor.optimize(&counter, true)
}
}

impl Compressor {
/// Compress the text using the current symbol table. Count the code occurrences
/// and code-pair occurrences to allow us to calculate apparent gain.
fn compress_count(&self, sample: &[u8]) -> Counter {
let mut counter = Counter::new();
let len = sample.len();
let mut prev_code = self.find_longest_symbol(sample);
counter.record_count1(prev_code);
let mut pos = self.symbols[prev_code as usize].len();
fn compress_count(&self, sample: &[u8], counter: &mut Counter) {
let compressed = self.compress(sample);
let len = compressed.len();

if len == 0 {
return;
}

#[inline(never)]
fn next_code(pos: usize, compressed: &[u8]) -> (u16, usize) {
if compressed[pos] == ESCAPE_CODE {
(compressed[pos + 1] as u16, 2)
} else {
(256 + compressed[pos] as u16, 1)
}
}

// Get first code, record count
let (code, pos) = next_code(0, &compressed);
counter.record_count1(code);

let mut pos = pos;
let mut prev_code = code;

while pos < len {
let code = self.find_longest_symbol(&sample[pos..len]);
let (code, advance) = next_code(pos, &compressed);
pos += advance;

counter.record_count1(code);
counter.record_count2(prev_code, code);
pos += self.symbols[code as usize].len();

prev_code = code;
}

counter
}

/// Using a set of counters and the existing set of symbols, build a new
/// set of symbols/codes that optimizes the gain over the distribution in `counter`.
fn optimize(&self, counters: Counter) -> Self {
fn optimize(&self, counters: &Counter, include_ascii: bool) -> Self {
let mut res = Compressor::default();
let mut pqueue = BinaryHeap::new();
let mut pqueue = BinaryHeap::with_capacity(65_536);
for code1 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol1 = self.symbols[code1 as usize];
let gain = counters.count1(code1) * symbol1.len();
pqueue.push(Candidate {
symbol: symbol1,
gain,
});
let mut gain = counters.count1(code1) * symbol1.len();
// NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols.
// This helps to reduce exception counts.
if code1 < 256 {
gain = 8 * gain;
}
if gain > 0 {
// println!("pushing single: symbol = {symbol1:?} gain = {gain}");
pqueue.push(Candidate {
symbol: symbol1,
gain,
});
}

for code2 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol2 = &self.symbols[code2 as usize];
// If either symbol is zero-length, or if merging would yield a symbol of
// length greater than 8, skip.
if symbol1.len() + symbol2.len() >= 8 {
if symbol1.len() + symbol2.len() > 8 {
continue;
}
let new_symbol = symbol1.concat(symbol2);
// assert the symbol is not empty
assert!(
!new_symbol.is_empty(),
"symbol made by merging {:?} and {:?} is empty",
symbol1,
symbol2,
);
let gain = counters.count2(code1, code2);
pqueue.push(Candidate {
symbol: new_symbol,
gain,
})
let gain = counters.count2(code1, code2) * new_symbol.len();
if gain > 0 {
// println!("pushing double: symbol = {new_symbol:?} gain = {gain}");
// println!(
// "\tfirst-half gain = {} second-half gain = {}",
// counters.count1(code1),
// counters.count1(code2)
// );

pqueue.push(Candidate {
symbol: new_symbol,
gain,
})
}
}
}

Expand All @@ -145,6 +210,25 @@ impl Compressor {
}
}

// If there are leftover slots, fill them with ASCII chars.
// This helps reduce the number of escapes.
//
// Note that because of the lossy hash table, we won't accidentally
// save the same ASCII character twice into the table.
if include_ascii {
for character in
" abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes()
{
if n_symbols == 255 {
break;
}

if res.insert(Symbol::from_u8(character)) {
n_symbols += 1
}
}
}

res
}
}
Expand Down Expand Up @@ -190,6 +274,12 @@ impl Ord for Candidate {
mod test {
use crate::{Compressor, ESCAPE_CODE};

#[test]
fn test_sadness() {
    // Smoke test: train for a single generation on a tiny corpus, then
    // compress a byte (0x01) that never occurs in that corpus. There is no
    // assertion on the output; the test only checks that nothing panics.
    let trained = Compressor::train_n("hello world", 1);
    let _ = trained.compress(&[1]);
}

#[test]
fn test_builder() {
// Train a Compressor on the toy string
Expand Down
5 changes: 0 additions & 5 deletions src/find_longest/mod.rs

This file was deleted.

28 changes: 0 additions & 28 deletions src/find_longest/naive.rs

This file was deleted.

Loading

0 comments on commit 4e150d0

Please sign in to comment.