-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Get compress performance to match paper algorithm 4 (#3)
This gets us close to the 2-3 cycles per byte that the paper reports for predicated scalar compression. ![image](https://github.com/user-attachments/assets/5e0c6c24-cb71-435d-ae5c-51f291018f94) ^ The benchmark compresses a string of length 50, so compression takes roughly 1-2 ns per byte (roughly 3-5 cycles on my M2).
- Loading branch information
Showing
15 changed files
with
735 additions
and
177 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,2 @@ | ||
/target | ||
.idea/ | ||
|
||
|
||
# Added by cargo | ||
# | ||
# already existing elements were commented out | ||
|
||
#/target |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,10 @@ | ||
[package] | ||
name = "fsst-rs" | ||
version = "0.0.1" | ||
description = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression" | ||
authors = ["SpiralDB Developers <[email protected]>"] | ||
license = "Apache-2.0" | ||
repository = "https://github.com/spiraldb/fsst" | ||
edition = "2021" | ||
|
||
[lints.rust] | ||
|
@@ -22,7 +26,16 @@ use_debug = { level = "deny" } | |
criterion = "0.5" | ||
lz4 = "1" | ||
|
||
[[example]] | ||
name = "round_trip" | ||
bench = false | ||
test = false | ||
|
||
[[bench]] | ||
name = "compress" | ||
harness = false | ||
bench = true | ||
|
||
[[test]] | ||
name = "correctness" | ||
test = true | ||
bench = false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#![allow(missing_docs, clippy::use_debug)] | ||
|
||
//! This is a command line program that expects two input files as arguments. | ||
//! | ||
//! The first is the file to train a symbol table on. | ||
//! | ||
//! The second is the file to compress. The compressor will run and compress | ||
//! in chunks of 16MB, logging the compression ratio for each chunk. | ||
//! | ||
//! Example: | ||
//! | ||
//! ``` | ||
//! cargo run --release --example file_compressor -- file1.csv file2.csv | ||
//! ``` | ||
use std::{ | ||
fs::File, | ||
io::Read, | ||
os::unix::fs::{FileExt, MetadataExt}, | ||
path::Path, | ||
}; | ||
|
||
fn main() { | ||
let args: Vec<_> = std::env::args().skip(1).collect(); | ||
assert!(args.len() >= 2, "args TRAINING and FILE must be provided"); | ||
|
||
let train_path = Path::new(&args[0]); | ||
let input_path = Path::new(&args[1]); | ||
|
||
let mut train_bytes = Vec::new(); | ||
{ | ||
let mut f = File::open(train_path).unwrap(); | ||
f.read_to_end(&mut train_bytes).unwrap(); | ||
} | ||
|
||
println!("building the compressor from {train_path:?}..."); | ||
let compressor = fsst_rs::train(&train_bytes); | ||
|
||
println!("compressing blocks of {input_path:?} with compressor..."); | ||
|
||
let f = File::open(input_path).unwrap(); | ||
let size_bytes = f.metadata().unwrap().size() as usize; | ||
|
||
const CHUNK_SIZE: usize = 16 * 1024 * 1024; | ||
|
||
let mut chunk_idx = 1; | ||
let mut pos = 0; | ||
let mut chunk = vec![0u8; CHUNK_SIZE]; | ||
while pos + CHUNK_SIZE < size_bytes { | ||
f.read_exact_at(&mut chunk, pos as u64).unwrap(); | ||
// Compress the chunk, don't write it anywhere. | ||
let compact = compressor.compress(&chunk); | ||
let compression_ratio = (CHUNK_SIZE as f64) / (compact.len() as f64); | ||
println!("compressed chunk {chunk_idx} with ratio {compression_ratio}"); | ||
|
||
pos += CHUNK_SIZE; | ||
chunk_idx += 1; | ||
} | ||
|
||
// Read last chunk with a new custom-sized buffer. | ||
if pos < size_bytes { | ||
let amount = size_bytes - pos; | ||
chunk = vec![0u8; size_bytes - pos]; | ||
f.read_exact_at(&mut chunk, pos as u64).unwrap(); | ||
// Compress the chunk, don't write it anywhere. | ||
let compact = compressor.compress(&chunk[0..amount]); | ||
let compression_ratio = (amount as f64) / (compact.len() as f64); | ||
println!("compressed chunk {chunk_idx} with ratio {compression_ratio}"); | ||
} | ||
println!("done"); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
//! Simple example where we show round-tripping a string through the static symbol table. | ||
use core::str; | ||
|
||
fn main() { | ||
// Train on a sample. | ||
let sample = "the quick brown fox jumped over the lazy dog"; | ||
let trained = fsst_rs::train(sample.as_bytes()); | ||
let compressed = trained.compress(sample.as_bytes()); | ||
println!("compressed: {} => {}", sample.len(), compressed.len()); | ||
// decompress now | ||
let decode = trained.decompress(&compressed); | ||
let output = str::from_utf8(&decode).unwrap(); | ||
println!( | ||
"decoded to the original: len={} text='{}'", | ||
decode.len(), | ||
output | ||
); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,4 @@ | ||
[toolchain] | ||
channel = "nightly-2024-06-19" | ||
channel = "nightly-2024-08-14" | ||
components = ["rust-src", "rustfmt", "clippy"] | ||
profile = "minimal" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
mod naive; | ||
|
||
/// Interface for looking up a symbol for a byte string.
///
/// NOTE(review): no implementor is visible here; presumably the sibling `naive`
/// module provides one — confirm.
pub trait FindLongestSymbol { | ||
/// Inspects the bytes in `text` and returns a `u16`.
///
/// NOTE(review): judging by the name, the result is presumably the code of the
/// longest symbol that prefixes `text` — confirm against the implementors,
/// including what is returned when nothing matches or `text` is empty.
fn find_longest_symbol(&self, text: &[u8]) -> u16; | ||
} |
Oops, something went wrong.