From a6ade02a860d0aa074d1f225cfa6b83a1de33aa4 Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Wed, 14 Aug 2024 22:48:58 -0400
Subject: [PATCH] add file compressor example

---
 examples/file_compressor.rs | 66 +++++++++++++++++++++++++++++++++++++
 src/lib.rs                  |  3 +-
 2 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 examples/file_compressor.rs

diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs
new file mode 100644
index 0000000..f820971
--- /dev/null
+++ b/examples/file_compressor.rs
@@ -0,0 +1,66 @@
+#![allow(missing_docs)]
+
+//! This is a command line program that expects two input files as arguments.
+//!
+//! The first is the file to train a symbol table on.
+//!
+//! The second is the file to compress. It is read in 16 MiB chunks, each chunk is
+//! compressed and its compression ratio is printed. The compressed bytes are
+//! discarded; nothing is written to disk.
+
+use std::{
+    fs::{self, File},
+    io::{self, Read},
+    path::Path,
+};
+
+/// Compresses a file chunk by chunk and prints the compression ratio of each chunk.
+///
+/// # Errors
+///
+/// Returns any I/O error hit while reading the training file or the input file.
+///
+/// # Panics
+///
+/// Panics if fewer than two command line arguments are provided.
+fn main() -> io::Result<()> {
+    let args: Vec<_> = std::env::args().skip(1).collect();
+    assert!(args.len() >= 2, "args TRAINING and FILE must be provided");
+
+    let train_path = Path::new(&args[0]);
+    let input_path = Path::new(&args[1]);
+
+    let train_text = fs::read_to_string(train_path)?;
+
+    println!("building the compressor from {train_path:?}...");
+    let compressor = fsst_rs::train(&train_text);
+
+    println!("compressing blocks of {input_path:?} with compressor...");
+
+    let mut f = File::open(input_path)?;
+    // Bytes of the input that have not been read yet.
+    let mut remaining = f.metadata()?.len();
+
+    const CHUNK_SIZE: usize = 16 * 1024 * 1024;
+
+    // A single zero-initialized buffer is reused for every chunk, including the
+    // shorter final one, so no uninitialized memory is ever handed to `read_exact`.
+    let mut chunk = vec![0u8; CHUNK_SIZE];
+    let mut chunk_idx = 1;
+    while remaining > 0 {
+        // Lossless cast: the value is capped at CHUNK_SIZE, which is a usize.
+        let amount = remaining.min(CHUNK_SIZE as u64) as usize;
+        f.read_exact(&mut chunk[..amount])?;
+
+        // Compress the chunk, don't write it anywhere.
+        let compact = compressor.compress(&chunk[..amount]);
+        let compression_ratio = (amount as f64) / (compact.len() as f64);
+        println!("compressed chunk {chunk_idx} with ratio {compression_ratio}");
+
+        remaining -= amount as u64;
+        chunk_idx += 1;
+    }
+
+    println!("done");
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index 5350fb3..da195a2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -305,7 +305,8 @@ impl SymbolTable {
     /// # Safety
     ///
     /// `in_ptr` and `out_ptr` must never be NULL or otherwise point to invalid memory.
-    #[inline(never)]
+    // NOTE(aduffy): uncomment this line to make the function appear in profiles
+    // #[inline(never)]
     pub(crate) unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) {
         // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and
         // if it isn't, it will be overwritten anyway.