diff --git a/.gitignore b/.gitignore index a5ff07f..8b196e9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /target +.idea/ # Added by cargo diff --git a/Cargo.lock b/Cargo.lock index 414f6c6..ef3912f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,630 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + [[package]] name = "fsst-rs" version = "0.1.0" +dependencies = [ + "criterion", + "lz4", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "is-terminal" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi", 
+ "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "lz4" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958b4caa893816eea05507c20cfe47574a43d9a697138a7872990bba8a0ece68" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109de74d5d2353660401699a4174a4ff23fcc649caf553df71933c7fb45ad868" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + 
+[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + +[[package]] +name = "plotters" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" + +[[package]] +name = "plotters-svg" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.206" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.206" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.124" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = 
"wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "web-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml index c90cd98..301d560 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,4 +3,11 @@ name = "fsst-rs" version = "0.1.0" edition = "2021" -[dependencies] +[dev-dependencies] +criterion = "0.5" +lz4 = "1" + +[[bench]] +name = "compress" +harness = false +bench = true diff --git a/benches/compress.rs b/benches/compress.rs new file mode 100644 index 
0000000..10cc7ce --- /dev/null +++ b/benches/compress.rs @@ -0,0 +1,110 @@ +use std::io::{Cursor, Read, Write}; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use lz4::liblz4::BlockChecksum; +use lz4::{BlockSize, ContentChecksum}; + +use fsst_rs::{train, SymbolTable}; + +const CORPUS: &str = include_str!("dracula.txt"); +const TEST: &str = "I found my smattering of German very useful here"; + +/// Benchmarks FSST training on `CORPUS`, then compressing and decompressing `TEST`. +fn bench_fsst(c: &mut Criterion) { + let mut group = c.benchmark_group("fsst"); + group.bench_function("train", |b| { + let corpus = CORPUS.as_bytes(); + b.iter(|| black_box(train(black_box(corpus)))); + }); + + let table = train(CORPUS); + let plaintext = TEST.as_bytes(); + + let compressed = table.compress(plaintext); + let escape_count = compressed + .iter() + .filter(|b| **b == SymbolTable::ESCAPE) + .count(); + let ratio = (plaintext.len() as f64) / (compressed.len() as f64); + println!( + "Escapes = {escape_count}/{}, compression_ratio = {ratio}", + compressed.len() + ); + + assert_eq!(table.decompress(&compressed), TEST.as_bytes()); + + group.bench_function("compress-single", |b| { + b.iter(|| black_box(table.compress(black_box(plaintext)))); + }); + + group.bench_function("decompress-single", |b| { + b.iter(|| black_box(table.decompress(black_box(&compressed)))); + }); +} + +/// Benchmarks LZ4 frame compression and decompression of `TEST` for comparison. +fn bench_lz4(c: &mut Criterion) { + let mut group = c.benchmark_group("lz4"); + + // { + // let compressed = Vec::with_capacity(10_000); + // let mut encoder = lz4::EncoderBuilder::new() + // .block_size(BlockSize::Max64KB) + // .build(compressed) + // .unwrap(); + // + // encoder.write_all(TEST.as_bytes()).unwrap(); + // let (compressed, result) = encoder.finish(); + // result.unwrap(); + // + // let ratio = (TEST.as_bytes().len() as f64) / (compressed.len() as f64); + // println!("LZ4 compress_ratio = {ratio}"); + // + // // ensure decodes cleanly + // let cursor = Cursor::new(compressed); + // let mut decoder = lz4::Decoder::new(cursor).unwrap(); + // let mut output
= String::new(); + // + // decoder.read_to_string(&mut output).unwrap(); + // assert_eq!(output.as_str(), TEST); + // } + + group.bench_function("compress-single", |b| { + let mut compressed = Vec::with_capacity(100_000_000); + let mut encoder = lz4::EncoderBuilder::new() + .block_size(BlockSize::Max64KB) + .checksum(ContentChecksum::NoChecksum) + .block_checksum(BlockChecksum::NoBlockChecksum) + .build(&mut compressed) + .unwrap(); + + // NOTE(review): one encoder is shared by every iteration and never finished, so this + // times appending to a single growing frame rather than compressing one string. + b.iter(|| encoder.write_all(TEST.as_bytes()).unwrap()); + }); + + group.bench_function("decompress-single", |b| { + let compressed = Vec::new(); + let mut encoder = lz4::EncoderBuilder::new() + .block_size(BlockSize::Max64KB) + .checksum(ContentChecksum::NoChecksum) + .block_checksum(BlockChecksum::NoBlockChecksum) + .build(compressed) + .unwrap(); + encoder.write_all(TEST.as_bytes()).unwrap(); + let (compressed, result) = encoder.finish(); + result.unwrap(); + + // Decode from the start on every iteration: a decoder that was already read to the + // end yields no further bytes, so reusing it would time empty reads. + b.iter(|| { + let mut decoder = lz4::Decoder::new(Cursor::new(compressed.as_slice())).unwrap(); + let mut output = Vec::new(); + decoder.read_to_end(&mut output).unwrap(); + black_box(output) + }); + }); +} + +criterion_group!(compress_bench, bench_fsst, bench_lz4); +criterion_main!(compress_bench); diff --git a/benches/dracula.txt b/benches/dracula.txt new file mode 100644 index 0000000..88adb22 --- /dev/null +++ b/benches/dracula.txt @@ -0,0 +1 @@ +How these papers have been placed in sequence will be made manifest in the reading of them. All needless matters have been eliminated, so that a history almost at variance with the possibilities of later-day belief may stand forth as simple fact. There is throughout no statement of past things wherein memory may err, for all the records chosen are exactly contemporary, given from the standpoints and within the range of knowledge of those who made them. We left in pretty good time, and came after nightfall to Klausenburgh. Here I stopped for the night at the Hotel Royale.
I had for dinner, or rather supper, a chicken done up some way with red pepper, which was very good but thirsty. (Mem., get recipe for Mina.) I asked the waiter, and he said it was called “paprika hendl,” and that, as it was a national dish, I should be able to get it anywhere along the Carpathians. I found my smattering of German very useful here; indeed, I don’t know how I should be able to get on without it. diff --git a/src/builder.rs b/src/builder.rs new file mode 100644 index 0000000..c7ae814 --- /dev/null +++ b/src/builder.rs @@ -0,0 +1,207 @@ +//! Functions and types used for building a [`SymbolTable`] from a corpus of text. +//! +//! This module implements the logic from Algorithm 3 of the [FSST Paper]. +//! +//! [FSST Paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +use crate::{Code, Symbol, SymbolTable}; + +/// Occurrence tallies gathered while compressing a sample with the current table. +#[derive(Debug, Clone)] +struct Counter { + /// Frequency count for each code. + counts1: Vec<usize>, + + /// Frequency count for each code-pair. + counts2: Vec<Vec<usize>>, +} + +impl Counter { + fn new() -> Self { + Self { + counts1: vec![0; 512], + counts2: vec![vec![0; 512]; 512], + } + } + + #[inline] + fn record_count1(&mut self, code1: Code) { + self.counts1[code1.0 as usize] += 1; + } + + #[inline] + fn record_count2(&mut self, code1: Code, code2: Code) { + self.counts2[code1.0 as usize][code2.0 as usize] += 1; + } + + #[inline] + fn count1(&self, code: Code) -> usize { + self.counts1[code.0 as usize] + } + + #[inline] + fn count2(&self, code1: Code, code2: Code) -> usize { + self.counts2[code1.0 as usize][code2.0 as usize] + } +} + +/// Number of count-then-optimize rounds that [`train`] runs. +pub const MAX_GENERATIONS: usize = 5; + +/// Build a [`SymbolTable`] for `corpus`, starting from the default table and refining it +/// through [`MAX_GENERATIONS`] rounds of counting followed by optimization. +pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable { + let mut table = SymbolTable::default(); + // TODO(aduffy): handle truncating/sampling if the corpus exceeds the required sample size.
+ let sample = corpus.as_ref(); + for _generation in 0..MAX_GENERATIONS { + let counter = table.compress_count(sample); + table = table.optimize(counter); + } + + table +} + +impl SymbolTable { + /// Compress the text using the current symbol table. Count the code occurrences + /// and code-pair occurrences to allow us to calculate apparent gain. + fn compress_count(&self, sample: &[u8]) -> Counter { + let mut counter = Counter::new(); + let len = sample.len(); + let mut prev_code = self.find_longest_symbol(sample); + counter.record_count1(prev_code); + let mut pos = self.symbols[prev_code.0 as usize].len(); + + while pos < len { + let code = self.find_longest_symbol(&sample[pos..len]); + counter.record_count1(code); + counter.record_count2(prev_code, code); + pos += self.symbols[code.0 as usize].len(); + prev_code = code; + } + + counter + } + + /// Build a new table from the current symbols and their pairwise merges, keeping the + /// candidates with the highest apparent gain (occurrences in `counters` × symbol length). + fn optimize(&self, counters: Counter) -> Self { + let mut res = SymbolTable::default(); + let mut pqueue = BinaryHeap::new(); + for code1 in 0..512 { + let code1 = Code::from_u16(code1); + let symbol1 = self.symbols[code1.0 as usize]; + let gain = counters.count1(code1) * symbol1.len(); + pqueue.push(Candidate { + symbol: symbol1, + gain, + }); + + for code2 in 0..512 { + let code2 = Code::from_u16(code2); + let symbol2 = &self.symbols[code2.0 as usize]; + // Skip if either symbol is zero-length or the merged symbol would reach 8 bytes. + // NOTE(review): an 8-byte merge fits in a `Symbol`; confirm `>=` is not meant to be `>`.
+ if symbol1.len() + symbol2.len() >= 8 || symbol1.is_empty() || symbol2.is_empty() { + continue; + } + let new_symbol = symbol1.concat(symbol2); + // The merged symbol must never be empty. + assert!( + !new_symbol.is_empty(), + "symbol made by merging {:?} and {:?} is empty", + symbol1, + symbol2, + ); + let gain = counters.count2(code1, code2) * new_symbol.len(); + pqueue.push(Candidate { + symbol: new_symbol, + gain, + }) + } + } + + // Pop best-first (`BinaryHeap::iter` is unordered); stop at 255 symbols or zero gain. + for _ in 0..255 { + let Some(best) = pqueue.pop().filter(|c| c.gain > 0) else { break }; + res.insert(best.symbol); + } + + res + } +} + +struct Candidate { + gain: usize, + symbol: Symbol, +} + +impl Candidate { + fn comparable_form(&self) -> (usize, usize) { + (self.gain, self.symbol.len()) + } +} + +impl Eq for Candidate {} + +impl PartialEq for Candidate { + fn eq(&self, other: &Self) -> bool { + self.comparable_form().eq(&other.comparable_form()) + } +} + +impl PartialOrd for Candidate { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for Candidate { + fn cmp(&self, other: &Self) -> Ordering { + let self_ord = (self.gain, self.symbol.len()); + let other_ord = (other.gain, other.symbol.len()); + + self_ord.cmp(&other_ord) + } +} + +#[cfg(test)] +mod test { + use crate::{train, SymbolTable}; + + #[test] + fn test_builder() { + // Train a SymbolTable on the toy string + let text = "hello world"; + let table = train(text.as_bytes()); + + // Use the table to compress a string, see the values + let compressed = table.compress(text.as_bytes()); + + // Ensure that the compressed string has no escape bytes + assert!(compressed.iter().all(|b| *b != SymbolTable::ESCAPE)); + + // Ensure that we can compress a string with no values seen at training time.
+ let compressed = table.compress("xyz123".as_bytes()); + assert_eq!( + compressed, + vec![ + SymbolTable::ESCAPE, + b'x', + SymbolTable::ESCAPE, + b'y', + SymbolTable::ESCAPE, + b'z', + SymbolTable::ESCAPE, + b'1', + SymbolTable::ESCAPE, + b'2', + SymbolTable::ESCAPE, + b'3', + ] + ) + } +} diff --git a/src/fsst.rs b/src/fsst.rs deleted file mode 100644 index 960db9d..0000000 --- a/src/fsst.rs +++ /dev/null @@ -1,325 +0,0 @@ -use std::cmp::min; - -const FSST_CODE_MAX: u16 = 256; -const FSST_CODE_MASK: u16 = FSST_CODE_MAX - 1; -const FSST_LEN_BITS: u32 = 12; -const FSST_CODE_BITS: u32 = 9; -const FSST_CODE_BASE: u16 = 256; -const FSST_HASH_LOG2SIZE: usize = 10; -const FSST_HASH_PRIME: u64 = 2971215073; -const FSST_SHIFT: u32 = 15; -const FSST_ICL_FREE: u64 = (15 << 28) | ((FSST_CODE_MASK as u64) << 16); -const FSST_MAXHEADER: usize = 8 + 1 + 8 + 2048 + 1; -const FSST_ESC: u8 = 255; - -#[inline(always)] -fn fsst_unaligned_load(v: &[u8]) -> u64 { - let mut ret: u64 = 0; - unsafe { - std::ptr::copy_nonoverlapping(v.as_ptr(), &mut ret as *mut u64 as *mut u8, 8); - } - ret -} - -#[inline(always)] -fn fsst_hash(w: u64) -> u64 { - ((w * FSST_HASH_PRIME) ^ ((w * FSST_HASH_PRIME) >> FSST_SHIFT)) -} - -#[derive(Clone, Copy)] -struct Symbol { - val: [u8; 8], - icl: u64, -} - -impl Symbol { - const MAX_LENGTH: usize = 8; - - fn new() -> Self { - Symbol { val: [0; 8], icl: 0 } - } - - fn from_byte(c: u8, code: u16) -> Self { - let mut s = Symbol::new(); - s.val[0] = c; - s.set_code_len(code, 1); - s - } - - fn from_slice(input: &[u8]) -> Self { - let mut s = Symbol::new(); - let len = min(input.len(), Self::MAX_LENGTH); - s.val[..len].copy_from_slice(&input[..len]); - s.set_code_len(FSST_CODE_MASK, len as u32); - s - } - - fn set_code_len(&mut self, code: u16, len: u32) { - self.icl = (len << 28) as u64 | (code as u64) << 16 | ((8 - len) * 8) as u64; - } - - fn length(&self) -> u32 { - (self.icl >> 28) as u32 - } - - fn code(&self) -> u16 { - ((self.icl >> 16) & FSST_CODE_MASK 
as u64) as u16 - } - - fn ignored_bits(&self) -> u32 { - self.icl as u32 - } - - fn first(&self) -> u8 { - self.val[0] - } - - fn first2(&self) -> u16 { - u16::from_le_bytes([self.val[0], self.val[1]]) - } - - fn hash(&self) -> usize { - let v = u32::from_le_bytes([self.val[0], self.val[1], self.val[2], self.val[3]]); - fsst_hash(v as u64) as usize - } -} - -struct SymbolTable { - short_codes: [u16; 65536], - byte_codes: [u16; 256], - symbols: Vec, - hash_tab: Vec, - n_symbols: u16, - suffix_lim: u16, - terminator: u16, - zero_terminated: bool, - len_histo: [u16; FSST_CODE_BITS as usize], -} - -impl SymbolTable { - fn new() -> Self { - let mut st = SymbolTable { - short_codes: [0; 65536], - byte_codes: [0; 256], - symbols: vec![Symbol::new(); FSST_CODE_MAX as usize], - hash_tab: vec![Symbol::new(); 1 << FSST_HASH_LOG2SIZE], - n_symbols: 0, - suffix_lim: FSST_CODE_MAX, - terminator: 0, - zero_terminated: false, - len_histo: [0; FSST_CODE_BITS as usize], - }; - - for i in 0..256 { - st.symbols[i] = Symbol::from_byte(i as u8, i as u16 | (1 << FSST_LEN_BITS)); - } - - for i in 256..FSST_CODE_MAX as usize { - st.symbols[i] = Symbol::from_byte(0, FSST_CODE_MASK); - } - - for i in 0..256 { - st.byte_codes[i] = (1 << FSST_LEN_BITS) | i as u16; - } - - for i in 0..65536 { - st.short_codes[i] = (1 << FSST_LEN_BITS) | (i & 255) as u16; - } - - st - } - - fn clear(&mut self) { - self.len_histo = [0; FSST_CODE_BITS as usize]; - for i in FSST_CODE_BASE as usize..FSST_CODE_BASE as usize + self.n_symbols as usize { - let symbol = &self.symbols[i]; - if symbol.length() == 1 { - let val = symbol.first(); - self.byte_codes[val as usize] = (1 << FSST_LEN_BITS) | val as u16; - } else if symbol.length() == 2 { - let val = symbol.first2(); - self.short_codes[val as usize] = (1 << FSST_LEN_BITS) | (val & 255); - } else { - let idx = symbol.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - self.hash_tab[idx] = Symbol::new(); - self.hash_tab[idx].icl = FSST_ICL_FREE; - } - } - self.n_symbols = 0; 
- } - - fn hash_insert(&mut self, s: Symbol) -> bool { - let idx = s.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - let taken = self.hash_tab[idx].icl < FSST_ICL_FREE; - if taken { - return false; - } - self.hash_tab[idx] = s; - true - } - - fn add(&mut self, mut s: Symbol) -> bool { - assert!(FSST_CODE_BASE + self.n_symbols < FSST_CODE_MAX); - let len = s.length(); - s.set_code_len(FSST_CODE_BASE + self.n_symbols, len); - if len == 1 { - self.byte_codes[s.first() as usize] = FSST_CODE_BASE + self.n_symbols + (1 << FSST_LEN_BITS); - } else if len == 2 { - self.short_codes[s.first2() as usize] = FSST_CODE_BASE + self.n_symbols + (2 << FSST_LEN_BITS); - } else if !self.hash_insert(s) { - return false; - } - self.symbols[FSST_CODE_BASE as usize + self.n_symbols as usize] = s; - self.len_histo[len as usize - 1] += 1; - self.n_symbols += 1; - true - } - - fn find_longest_symbol(&self, s: Symbol) -> u16 { - let idx = s.hash() & ((1 << FSST_HASH_LOG2SIZE) - 1); - if self.hash_tab[idx].icl <= s.icl && self.hash_tab[idx].val == s.val { - return (self.hash_tab[idx].icl >> 16) & FSST_CODE_MASK as u64; - } - if s.length() >= 2 { - let code = self.short_codes[s.first2() as usize] & FSST_CODE_MASK; - if code >= FSST_CODE_BASE { - return code; - } - } - self.byte_codes[s.first() as usize] & FSST_CODE_MASK - } - - fn find_longest_symbol_slice(&self, cur: &[u8], end: &[u8]) -> u16 { - self.find_longest_symbol(Symbol::from_slice(&cur[..min(cur.len(), end.len())])) - } -} - -struct Counters { - count1: Vec, - count2: Vec>, -} - -impl Counters { - fn new() -> Self { - Counters { - count1: vec![0; FSST_CODE_MAX as usize], - count2: vec![vec![0; FSST_CODE_MAX as usize]; FSST_CODE_MAX as usize], - } - } - - fn count1_set(&mut self, pos1: usize, val: u16) { - self.count1[pos1] = val; - } - - fn count1_inc(&mut self, pos1: usize) { - self.count1[pos1] += 1; - } - - fn count2_inc(&mut self, pos1: usize, pos2: usize) { - self.count2[pos1][pos2] += 1; - } - - fn count1_get_next(&self, pos1: &mut 
usize) -> u32 { - self.count1[*pos1] as u32 - } - - fn count2_get_next(&self, pos1: usize, pos2: &mut usize) -> u32 { - self.count2[pos1][*pos2] as u32 - } - - fn backup1(&self, buf: &mut [u8]) { - unsafe { - std::ptr::copy_nonoverlapping( - self.count1.as_ptr() as *const u8, - buf.as_mut_ptr(), - FSST_CODE_MAX as usize * std::mem::size_of::(), - ); - } - } - - fn restore1(&mut self, buf: &[u8]) { - unsafe { - std::ptr::copy_nonoverlapping( - buf.as_ptr(), - self.count1.as_mut_ptr() as *mut u8, - FSST_CODE_MAX as usize * std::mem::size_of::(), - ); - } - } -} - -struct Encoder { - symbol_table: SymbolTable, - counters: Counters, -} - -impl Encoder { - fn new() -> Self { - Encoder { - symbol_table: SymbolTable::new(), - counters: Counters::new(), - } - } - - pub fn compress(&self, input: &[u8], output: &mut [u8]) -> (usize, usize) { - let mut in_pos = 0; - let mut out_pos = 0; - - while in_pos < input.len() && out_pos < output.len() { - let symbol = self.symbol_table.find_longest_symbol_slice(&input[in_pos..], &input[input.len()..]); - let code = symbol & FSST_CODE_MASK; - let len = (symbol >> FSST_LEN_BITS) as usize; - - if code < FSST_CODE_BASE { - // Escape byte - if out_pos + 2 > output.len() { - break; - } - output[out_pos] = FSST_ESC; - output[out_pos + 1] = input[in_pos]; - out_pos += 2; - in_pos += 1; - } else { - if out_pos + 1 > output.len() { - break; - } - output[out_pos] = code as u8; - out_pos += 1; - in_pos += len; - } - } - - (in_pos, out_pos) - } -} - -impl SymbolTable { - pub fn decompress(&self, input: &[u8], output: &mut [u8]) -> usize { - let mut in_pos = 0; - let mut out_pos = 0; - - while in_pos < input.len() && out_pos < output.len() { - let code = input[in_pos] as u16; - in_pos += 1; - - if code == FSST_ESC as u16 { - if in_pos >= input.len() { - break; - } - output[out_pos] = input[in_pos]; - in_pos += 1; - out_pos += 1; - } else { - let symbol = &self.symbols[code as usize]; - let len = symbol.length() as usize; - if out_pos + len > 
output.len() { - break; - } - output[out_pos..out_pos + len].copy_from_slice(&symbol.val[..len]); - out_pos += len; - } - } - - out_pos - } -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 44a4684..7fff00d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,254 @@ -mod fsst; +use std::fmt::{Debug, Formatter}; -#[cfg(test)] -mod tests { - use super::*; +pub use builder::*; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); +mod builder; +mod longest; + +pub const ESCAPE: u8 = 0xFF; + +/// A Symbol wraps a set of values of +#[derive(Copy, Clone)] +pub union Symbol { + bytes: [u8; 8], + num: u64, +} + +impl Debug for Symbol { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", unsafe { self.num }) + } +} + +impl Symbol { + pub const ZERO: Self = Self::zero(); + + pub fn from_slice(slice: &[u8; 8]) -> Self { + Self { bytes: *slice } + } + + /// Return a zero symbol + const fn zero() -> Self { + Self { num: 0 } + } + + /// Create a new single-byte symbol + pub fn from_u8(value: u8) -> Self { + Self { + bytes: [value, 0, 0, 0, 0, 0, 0, 0], + } + } +} + +impl Symbol { + /// Calculate the length of the symbol in bytes. + /// + /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols + /// can contain fewer bytes, padded with 0x00. + pub fn len(&self) -> usize { + let numeric = unsafe { self.num }; + // For little-endian platforms, this counts the number of *trailing* zeros + let null_bytes = (numeric.leading_zeros() >> 3) as usize; + + size_of::() - null_bytes + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn as_slice(&self) -> &[u8] { + let len = self.len(); + // Safety: the length from `len()` can never be more than 8. 
+ unsafe { &self.bytes[0..len] } + } + + pub fn append_to(&self, vec: &mut Vec) { + match self.len() { + 0 => self.append_inner::<0>(vec), + 1 => self.append_inner::<1>(vec), + 2 => self.append_inner::<2>(vec), + 3 => self.append_inner::<3>(vec), + 4 => self.append_inner::<4>(vec), + 5 => self.append_inner::<5>(vec), + 6 => self.append_inner::<6>(vec), + 7 => self.append_inner::<7>(vec), + 8 => self.append_inner::<8>(vec), + _ => unreachable!("Symbol::len() always ≤ 8"), + } + } + + fn append_inner(&self, vec: &mut Vec) { + for i in 0..N { + let byte: u8 = unsafe { self.num >> i } as u8; + vec.push(byte); + } + } + + /// Returns true if the symbol is a prefix of the provided text. + pub fn is_prefix(&self, text: &[u8]) -> bool { + text.starts_with(self.as_slice()) + } + + pub fn concat(&self, other: &Self) -> Self { + let new_len = self.len() + other.len(); + assert!(new_len <= 8, "cannot build symbol with length > 8"); + + let self_len = self.len(); + let mut result = *self; + unsafe { result.bytes[self_len..new_len].copy_from_slice(other.as_slice()) }; + + result + } +} + +/// Codes correspond to bytes. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct Code(u16); + +impl Code { + pub const CODE_MAX: u16 = 512; + + /// Create a new code representing an escape byte. + pub fn new_escaped(byte: u8) -> Self { + Self(byte as u16) + } + + /// Create a new code representing a symbol. + pub fn new_symbol(code: u8) -> Self { + Self((code as u16) + 256) + } + + /// Create a `Code` directly from a `u16` value. + /// + /// # Panics + /// Panic if the value is ≥ the defined `CODE_MAX`. + pub fn from_u16(code: u16) -> Self { + assert!(code < Self::CODE_MAX, "code value higher than CODE_MAX"); + + Self(code) + } + + /// Returns true if the code is for an escape byte. + #[inline] + pub fn is_escape(&self) -> bool { + self.0 <= 255 + } +} + +#[derive(Clone, Debug)] +pub struct SymbolTable { + /// Table mapping codes to symbols. 
+ pub(crate) symbols: [Symbol; 512], + + /// Indicates the number of entries in the symbol table that have been populated. + /// + /// This value is always at least 256, as the first 256 entries in the `table` are the escape + /// bytes. + pub(crate) n_symbols: usize, +} + +impl Default for SymbolTable { + fn default() -> Self { + let mut table = Self { + symbols: [Symbol::ZERO; 512], + n_symbols: 0, + }; + + // Populate the escape byte entries. + for byte in 0..=255 { + table.symbols[byte as usize] = Symbol::from_u8(byte); + } + table.n_symbols = 256; + + table + } +} + +/// The core structure of the FSST codec, holding a mapping between `Symbol`s and `Code`s. +/// +/// The symbol table is trained on a corpus of data in the form of a single byte array, building up +/// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytse, or "symbols". +impl SymbolTable { + pub const ESCAPE: u8 = 255; + + /// Insert a new symbol at the end of the table. + /// + /// # Panics + /// Panics if the table is already full. + pub fn insert(&mut self, symbol: Symbol) { + assert!(self.n_symbols < 512, "cannot insert into full symbol table"); + self.symbols[self.n_symbols] = symbol; + self.n_symbols += 1; + } + + /// Return a new encoded sequence of data bytes instead. + pub fn compress(&self, plaintext: &[u8]) -> Vec { + let mut values = Vec::with_capacity(2 * plaintext.len()); + let len = plaintext.len(); + let mut pos = 0; + while pos < len { + // println!("COMPRESS pos={pos} len={len} in_progress_size={}", values.len()); + let next_code = self.find_longest_symbol(&plaintext[pos..len]); + if next_code.is_escape() { + // Case 1 -escape: push an ESCAPE followed by the next byte. 
+ // println!("ESCAPE"); + values.push(Self::ESCAPE); + values.push(next_code.0 as u8); + pos += 1; + } else { + // Case 2 - code: push the code, increment position by symbol length + let symbol = self.symbols[next_code.0 as usize]; + // println!("APPEND symbol={:?} len={}", symbol.as_slice(), symbol.len()); + values.push(next_code.0 as u8); + pos += symbol.len(); + } + } + + values + } + + /// Decompress the provided byte slice into a [`String`] using the symbol table. + pub fn decompress(&self, compressed: &[u8]) -> Vec { + let mut decoded: Vec = Vec::with_capacity(size_of::() * compressed.len()); + let ptr = decoded.as_mut_ptr(); + + let mut in_pos = 0; + let mut out_pos = 0; + + while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::()) { + let code = compressed[in_pos]; + if code == SymbolTable::ESCAPE { + // Advance by one, do raw write. + in_pos += 1; + // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer + unsafe { + let write_addr = ptr.byte_offset(out_pos as isize); + write_addr.write(compressed[in_pos]); + } + out_pos += 1; + in_pos += 1; + } else { + let symbol = self.symbols[256 + code as usize]; + // SAFETY: out_pos is always 8 bytes or more from the end of decoded buffer + unsafe { + let write_addr = ptr.byte_offset(out_pos as isize) as *mut u64; + // Perform 8 byte unaligned write. + write_addr.write_unaligned(symbol.num); + } + in_pos += 1; + out_pos += symbol.len(); + } + } + + assert!( + in_pos >= compressed.len(), + "decompression should exhaust input before output" + ); + + // SAFETY: we enforce in the loop condition that out_pos <= decoded.capacity() + unsafe { decoded.set_len(out_pos) }; + + decoded } } diff --git a/src/longest.rs b/src/longest.rs new file mode 100644 index 0000000..50f0ff7 --- /dev/null +++ b/src/longest.rs @@ -0,0 +1,25 @@ +use crate::{Code, SymbolTable}; + +/// Find the longest substring. 
+
+impl SymbolTable {
+    // Kept out-of-line on purpose (NOTE(aduffy)): once inlined, this function won't show up in profiles.
+    #[inline(never)]
+    pub(crate) fn find_longest_symbol(&self, text: &[u8]) -> Code {
+        debug_assert!(!text.is_empty(), "text must not be empty");
+
+        // Fallback is an escape of the first byte; a symbol must cover more than one byte to replace it.
+        let mut winner = Code::new_escaped(text[0]);
+        let mut longest = 1;
+        for (slot, candidate) in self.symbols.iter().enumerate() {
+            let candidate_len = candidate.len();
+            // Strict comparison: among equally long matches the earliest table entry is kept.
+            if candidate_len > longest && candidate.is_prefix(text) {
+                winner = Code::from_u16(slot as u16);
+                longest = candidate_len;
+            }
+        }
+
+        winner
+    }
+}