From ca0c7d8cdfe36fb74784058bcb112ac4d56c1738 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 11:42:05 -0400 Subject: [PATCH 1/3] bugfix, comment fix, force compile fails for big-endian --- Cargo.lock | 36 ------------------------------------ Cargo.toml | 1 - README.md | 1 + benches/compress.rs | 40 +--------------------------------------- src/builder.rs | 2 +- src/lib.rs | 15 +++++++++------ tests/correctness.rs | 14 ++++++++++++++ 7 files changed, 26 insertions(+), 83 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5e4226..43d9b08 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,15 +41,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" -[[package]] -name = "cc" -version = "1.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fb8dd288a69fc53a1996d7ecfbf4a20d59065bff137ce7e56bbd620de191189" -dependencies = [ - "shlex", -] - [[package]] name = "cfg-if" version = "1.0.0" @@ -186,7 +177,6 @@ name = "fsst-rs" version = "0.0.1" dependencies = [ "criterion", - "lz4", ] [[package]] @@ -252,26 +242,6 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" -[[package]] -name = "lz4" -version = "1.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "958b4caa893816eea05507c20cfe47574a43d9a697138a7872990bba8a0ece68" -dependencies = [ - "libc", - "lz4-sys", -] - -[[package]] -name = "lz4-sys" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109de74d5d2353660401699a4174a4ff23fcc649caf553df71933c7fb45ad868" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "memchr" version = "2.7.4" @@ -441,12 +411,6 @@ dependencies = [ "serde", ] -[[package]] -name = "shlex" -version = "1.3.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" - [[package]] name = "syn" version = "2.0.74" diff --git a/Cargo.toml b/Cargo.toml index 31f9e7f..d49faf3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,6 @@ use_debug = { level = "deny" } [dev-dependencies] criterion = "0.5" -lz4 = "1" [[example]] name = "round_trip" diff --git a/README.md b/README.md index d6957db..dc7430c 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ but it is mostly written from a careful reading of the paper. **NOTE: This current implementation is still in-progress and is not production ready, please use at your own risk.** +**NOTE: This crate only works on little-endian architectures currently. There are no current plans to support big-endian targets.** [whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf [MIT-licensed implementation]: https://github.com/cwida/fsst diff --git a/benches/compress.rs b/benches/compress.rs index 603eca1..92725d9 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -5,11 +5,8 @@ //! Also contains LZ4 baseline. 
#![allow(missing_docs)] use core::str; -use std::io::{Cursor, Read, Write}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use lz4::liblz4::BlockChecksum; -use lz4::{BlockSize, ContentChecksum}; use fsst_rs::{train, ESCAPE_CODE}; @@ -48,40 +45,5 @@ fn bench_fsst(c: &mut Criterion) { }); } -fn bench_lz4(c: &mut Criterion) { - let mut group = c.benchmark_group("lz4"); - - group.bench_function("compress-single", |b| { - let mut compressed = Vec::with_capacity(100_000_000); - let mut encoder = lz4::EncoderBuilder::new() - .block_size(BlockSize::Max64KB) - .checksum(ContentChecksum::NoChecksum) - .block_checksum(BlockChecksum::NoBlockChecksum) - .build(&mut compressed) - .unwrap(); - - b.iter(|| encoder.write_all(TEST.as_bytes()).unwrap()); - }); - - group.bench_function("decompress-single", |b| { - let compressed = Vec::new(); - let mut encoder = lz4::EncoderBuilder::new() - .block_size(BlockSize::Max64KB) - .checksum(ContentChecksum::NoChecksum) - .block_checksum(BlockChecksum::NoBlockChecksum) - .build(compressed) - .unwrap(); - encoder.write_all(TEST.as_bytes()).unwrap(); - let (compressed, result) = encoder.finish(); - result.unwrap(); - - let cursor = Cursor::new(compressed); - let mut decoder = lz4::Decoder::new(cursor).unwrap(); - let mut output = Vec::new(); - - b.iter(|| decoder.read_to_end(&mut output).unwrap()); - }); -} - -criterion_group!(compress_bench, bench_fsst, bench_lz4); +criterion_group!(compress_bench, bench_fsst); criterion_main!(compress_bench); diff --git a/src/builder.rs b/src/builder.rs index 558a3b4..7e865a8 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -115,7 +115,7 @@ impl SymbolTable { let symbol2 = &self.symbols[code2 as usize]; // If either symbol is zero-length, or if merging would yield a symbol of // length greater than 8, skip. 
- if symbol1.len() + symbol2.len() >= 8 || symbol1.is_empty() || symbol2.is_empty() { + if symbol1.len() + symbol2.len() >= 8 { continue; } let new_symbol = symbol1.concat(symbol2); diff --git a/src/lib.rs b/src/lib.rs index 7191b00..4c521e4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ #![doc = include_str!("../README.md")] +#![cfg(target_endian = "little")] /// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. macro_rules! assert_sizeof { @@ -49,10 +50,12 @@ impl Symbol { } impl Symbol { - /// Calculate the length of the symbol in bytes. + /// Calculate the length of the symbol in bytes. Always a value between 1 and 8. /// /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols - /// can contain fewer bytes, padded with 0x00. + /// can contain fewer bytes, padded with 0x00. There is a special case of a symbol + /// that holds the byte 0x00. In that case, the symbol contains `0x0000000000000000` + /// but we want to interpret that as a one-byte symbol containing `0x00`. pub fn len(&self) -> usize { let numeric = unsafe { self.num }; // For little-endian platforms, this counts the number of *trailing* zeros @@ -113,10 +116,10 @@ impl Symbol { /// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`. pub fn concat(&self, other: &Self) -> Self { - let new_len = self.len() + other.len(); + let self_len = self.len(); + let new_len = self_len + other.len(); assert!(new_len <= 8, "cannot build symbol with length > 8"); - let self_len = self.len(); let mut result = *self; // SAFETY: self_len and new_len are checked to be <= 8 @@ -421,13 +424,13 @@ impl SymbolTable { /// Decompress a byte slice that was previously returned by [compression][Self::compress]. 
pub fn decompress(&self, compressed: &[u8]) -> Vec<u8> { - let mut decoded: Vec<u8> = Vec::with_capacity(size_of::<Symbol>() * compressed.len()); + let mut decoded: Vec<u8> = Vec::with_capacity(size_of::<Symbol>() * (compressed.len() + 1)); let ptr = decoded.as_mut_ptr(); let mut in_pos = 0; let mut out_pos = 0; - while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::<Symbol>()) { + while in_pos < compressed.len() && out_pos < (decoded.capacity() - size_of::<Symbol>()) { let code = compressed[in_pos]; if code == ESCAPE_CODE { // Advance by one, do raw write. diff --git a/tests/correctness.rs b/tests/correctness.rs index 8773bc7..5385630 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -1,5 +1,7 @@ #![cfg(test)] +use fsst_rs::Symbol; + static PREAMBLE: &str = r#" When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the @@ -29,6 +31,18 @@ fn test_train_on_empty() { ); } +#[test] +fn test_one_byte() { + let mut empty = fsst_rs::SymbolTable::default(); + // Assign code 0 to map to the symbol containing byte 0x01 + empty.insert(Symbol::from_u8(0x01)); + + let compressed = empty.compress(&[0x01]); + assert_eq!(compressed, vec![0u8]); + + assert_eq!(empty.decompress(&compressed), vec![0x01]); +} + #[test] fn test_zeros() { println!("training zeros"); From 121e8cff060dd0d0469f481da6c91bb5dc6292ee Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 12:03:14 -0400 Subject: [PATCH 2/3] add chinese characters test, fix doc comment --- src/lib.rs | 11 ++++----- tests/correctness.rs | 11 +++++++++ tests/fixtures/art_of_war.txt | 45 +++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 tests/fixtures/art_of_war.txt diff --git a/src/lib.rs b/src/lib.rs index 4c521e4..fecb4ad 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -293,16 +293,15 @@ impl SymbolTable { true } - /// Using the symbol table, runs a single cycle of 
compression from the front of `in_ptr`, writing - /// the output into `out_ptr`. Attempts to process an entire 64-bit word of prefix from `in_ptr`. + /// Using the symbol table, runs a single cycle of compression on an input word, writing + /// the output into `out_ptr`. /// /// # Returns /// - /// This function returns a tuple of (code, advance_in, advance_out). + /// This function returns a tuple of (advance_in, advance_out) with the number of bytes + /// for the caller to advance the input and output pointers. /// - /// `code` is the code that was emitted into the output buffer. - /// - /// `advance_in` is the number of bytes to advance `in_ptr` before the next call. + /// `advance_in` is the number of bytes to advance the input pointer before the next call. /// /// `advance_out` is the number of bytes to advance `out_ptr` before the next call. /// diff --git a/tests/correctness.rs b/tests/correctness.rs index 5385630..e168dd6 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -11,6 +11,8 @@ that they should declare the causes which impel them to the separation."#; static DECLARATION: &str = include_str!("./fixtures/declaration.txt"); +static ART_OF_WAR: &str = include_str!("./fixtures/art_of_war.txt"); + #[test] fn test_basic() { // Roundtrip the declaration @@ -71,3 +73,12 @@ fn test_large() { let compressed = trained.compress(massive.as_bytes()); assert_eq!(trained.decompress(&compressed), massive.as_bytes()); } + +#[test] +fn test_chinese() { + let trained = fsst_rs::train(ART_OF_WAR.as_bytes()); + assert_eq!( + ART_OF_WAR.as_bytes(), + trained.decompress(&trained.compress(ART_OF_WAR.as_bytes())) + ); +} diff --git a/tests/fixtures/art_of_war.txt b/tests/fixtures/art_of_war.txt new file mode 100644 index 0000000..7a6dba9 --- /dev/null +++ b/tests/fixtures/art_of_war.txt @@ -0,0 +1,45 @@ +孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。 +故經之以五事,校之以計,而索其情,一曰道,二曰天,三曰地,四曰將,五曰法。 
+道者,令民與上同意也,可與之死,可與之生,而不畏危。天者,陰陽,寒暑,時制也。地者,遠近,險易,廣狹,死生也。將者,智,信,仁,勇,嚴也。法者,曲制,官道,主用也。凡此五者,將莫不聞,知之者勝,不知者不勝。 +故校之以計,而索其情。曰:主孰有道,將孰有能,天地孰得,法令孰行,兵眾孰強,士卒孰練,賞罰孰明,吾以此知勝負矣。將聽吾計,用之必勝,留之;將不聽吾計,用之必敗,去之。 +計利以聽,乃為之勢,以佐其外;勢者,因利而制權也。 +兵者,詭道也。故能而示之不能,用而示之不用,近而示之遠,遠而示之近。利而誘之,亂而取之,實而備之,強而避之,怒而撓之,卑而驕之,佚而勞之,親而離之。攻其無備,出其不意,此兵家之勝,不可先傳也。 +夫未戰而廟算勝者,得算多也;未戰而廟算不勝者,得算少也;多算勝,少算不勝,而況於無算乎?吾以此觀之,勝負見矣。 + +孫子曰:凡用兵之法,馳車千駟,革車千乘,帶甲十萬;千里饋糧,則內外之費賓客之用,膠漆之材,車甲之奉,日費千金,然後十萬之師舉矣。 +其用戰也,勝久則鈍兵挫銳,攻城則力屈,久暴師則國用不足。夫鈍兵,挫銳,屈力,殫貨,則諸侯乘其弊而起,雖有智者,不能善其後矣!故兵聞拙速,未睹巧之久也;夫兵久而國利者,未之有也。 +故不盡知用兵之害者,則不能盡知用兵之利也。善用兵者,役不再籍,糧不三載,取用于國,因糧于敵,故軍食可足也。國之貧于師者遠輸,遠輸則百姓貧,近于師者貴賣,貴賣則百姓財竭,財竭則急于丘役,力屈財殫,中原內虛于家,百姓之費,十去其七,公家之費,破車罷馬,甲冑矢弩,戟楯蔽櫓,丘牛大車,十去其六。 +故智將務食於敵,食敵一鍾,當吾二十鍾,𦮼秆一石,當我二十石。故殺敵者怒也,取敵之利者貨也。故車戰,得車十乘以上,賞其先得者,而更其旌旗,車雜而乘之,卒善而養之,是謂勝敵而益強。 +故兵貴勝,不貴久;故知兵之將,民之司命,國家安危之主也。 +孫子曰:凡用兵之法,全國為上,破國次之;全旅為上,破旅次之;全卒為上,破卒次之;全伍為上,破伍次之。是故百戰百勝,非善之善者也;不戰而屈人之兵,善之善者也。 +故上兵伐謀,其次伐交,其次伐兵,其下攻城。攻城之法,為不得已;修櫓轒轀,具器械,三月而後成;距闉,又三月而後已;將不勝其忿,而蟻附之,殺士卒三分之一,而城不拔者,此攻之災也。 +故善用兵者,屈人之兵,而非戰也;拔人之城,而非攻也;毀人之國,而非久也。必以全爭于天下,故兵不頓,利可全,此謀攻之法也。故用兵之法,十則圍之,五則攻之,倍則分之,敵則能戰之,少則能守之,不若則能避之。故小敵之堅,大敵之擒也。 +夫將者,國之輔也,輔周則國必強,輔隙則國必弱。故軍之所以患于君者三:不知三軍之不可以進,而謂之進;不知三軍之不可以退,而謂之退;是謂縻軍。不知三軍之事,而同三軍之政,則軍士惑矣。不知三軍之權,而同三軍之任,則軍士疑矣。三軍既惑且疑,則諸侯之難至矣,是謂亂軍引勝。 +故知勝者有五:知可以戰與不可以戰者勝,識眾寡之用者勝,上下同欲者勝,以虞待不虞者勝,將能而君不御者勝;此五者,知勝之道也。 +故曰:知彼知己,百戰不殆;不知彼而知己,一勝一負;不知彼,不知己,每戰必敗。 + +孫子曰:昔之善戰者,先為不可勝,以待敵之可勝,不可勝在己,可勝在敵。故善戰者,能為不可勝,不能使敵必可勝。故曰:勝可知,而不可為。 +不可勝者,守也;可勝者,攻也。守則不足,攻則有餘。善守者,藏于九地之下;善攻者,動于九天之上,故能自保而全勝也。 +見勝,不過眾人之所知,非善之善者也。戰勝,而天下曰善,非善之善者也。故舉秋毫,不為多力;見日月,不為明目;聞雷霆,不為聰耳。古之善戰者,勝于易勝者;故善戰者之勝也,無智名,無勇功。故其戰勝不忒,不忒者,其措必勝,勝已敗者也。故善戰者,立于不敗之地,而不失敵之敗也。是故勝兵先勝,而後求戰;敗兵先戰,而後求勝。 +善用兵者,修道而保法,故能為勝敗之政。兵法:「一曰度,二曰量,三曰數,四曰稱,五曰勝;地生度,度生量,量生數,數生稱,稱生勝。」故勝兵若以鎰稱銖,敗兵若以銖稱鎰。勝者之戰民也,若決積水于千仞之谿,形也。 + +孫子曰:凡治眾如治寡,分數是也。鬥眾如鬥寡,形名是也。三軍之眾,可使必受敵而無敗者,奇正是也。兵之所加,如以碬投卵者,虛實是也。 +凡戰者,以正合,以奇勝。故善出奇者,無窮如天地,不竭如江河,終而復始,日月是也;死而復生,四時是也。聲不過五,五聲之變,不可勝聽也。色不過五,五色之變,不可勝觀也。味不過五,五味之變,不可勝嘗也。戰勢不過奇正,奇正之變,不可勝窮也。奇正相生,如循環之無端,孰能窮之哉! 
+激水之疾,至于漂石者,勢也。鷙鳥之擊,至于毀折者,節也。是故善戰者,其勢險,其節短,勢如張弩,節如機發。 +紛紛紜紜,鬥亂,而不可亂也。渾渾沌沌,形圓,而不可敗也。亂生于治,怯生于勇,弱生于強。治亂,數也。勇怯,勢也。強弱,形也。故善動敵者,形之,敵必從之;予之,敵必取之;以利動之,以實待之。 +故善戰者,求之于勢,不責于人,故能擇人任勢;任勢者,其戰人也,如轉木石,木石之性,安則靜,危則動,方則止,圓則行。故善戰人之勢,如轉圓石于千仞之山者,勢也。 + +孫子曰:凡先處戰地而待敵者佚,後處戰地而趨戰者勞。故善戰者,致人而不致于人。能使敵人自至者,利之也;能使敵不得至者,害之也。故敵佚能勞之,飽能飢之,安能動之。 +出其所不趨,趨其所不意;行千里而不勞者,行于無人之地也;攻而必取者,攻其所不守也;守而必固者,守其所不攻也。故善攻者,敵不知其所守;善守者,敵不知其所攻。微乎微乎!至于無形;神乎神乎!至于無聲,故能為敵之司命。進而不可禦者,衝其虛也;退而不可追者,速而不可及也。故我欲戰,敵雖高壘深溝,不得不與我戰者,攻其所必救也;我不欲戰,雖劃地而守之,敵不得與我戰者,乖其所之也。 +故形人而我無形,則我專而敵分,我專為一,敵分為十,是以十攻其一也。則我眾而敵寡,能以眾擊寡,則我之所與戰者,約矣。 +吾所與戰之地不可知,不可知,則敵所備者多,敵所備者多,則我所與戰者寡矣。故備前則後寡,備後則前寡,備左則右寡,備右則左寡,無所不備,則無所不寡。寡者,備人者也;眾者,使人備己者也。 +故知戰之地,知戰之日,則可千里而會戰。不知戰地,不知戰日,則左不能救右,右不能救左,前不能救後,後不能救前,而況遠者數十里,近者數里乎?以吾度之,越人之兵雖多,亦奚益于勝哉?故曰:勝可為也,敵雖眾,可使無鬥。 +故策之而知得失之計,作之而知動靜之理,形之而知死生之地,角之而知有餘不足之處。故形兵之極,至于無形;無形,則深間不能窺,智者不能謀。因形而措勝于眾,眾不能知,人皆知我所以勝之形,而莫知吾所以制勝之形;故其戰勝不復,而應形於無窮。 +夫兵形象水,水之形,避高而趨下:兵之形,避實而擊虛;水因地而制流,兵因敵而制勝。故兵無常勢,水無常形;能因敵變化而取勝,謂之神。故五行無常勝,四時無常位,日有短長,月有死生。 + +孫子曰:凡用兵之法,將受命於君,合軍聚眾,交和而舍,莫難於軍爭。軍爭之難者,以迂為直,以患為利。故迂其途,而誘之以利,後人發,先人至,此知迂直之計者也。故軍爭為利,軍爭為危。 +舉軍而爭利,則不及;委軍而爭利,則輜重捐。是故卷甲而趨,日夜不處,倍道兼行,百里而爭利,則擒三將軍,勁者先,疲者後,其法十一而至;五十里而爭利,則蹶上將軍,其法半至;卅里而爭利,則三分之二至。是故軍無輜重則亡,無糧食則亡,無委積則亡。故不知諸侯之謀者,不能豫交;不知山林、險阻、沮澤之形者,不能行軍,不能鄉導者,不能得地利。 +故兵以詐立,以利動,以分合為變者也,故其疾如風,其徐如林,侵掠如火,不動如山,難知如陰,動如雷霆。掠鄉分眾,廓地分利,懸權而動,先知迂直之計者勝,此軍爭之法也。 +軍政曰:「言不相聞,故為金鼓;視不相見,故為旌旗。」夫金鼓旌旗者,所以一人之耳目也;人既專一,則勇者不得獨進,怯者不得獨退,此用眾之法也。故夜戰多火鼓,晝戰多旌旗,所以變人之耳目也。 +故三軍可奪氣,將軍可奪心。是故朝氣銳,晝氣惰,暮氣歸;故善用兵者,避其銳氣,擊其惰歸,此治氣者也。以治待亂,以靜待譁,此治心者也。以近待遠,以佚待勞,以飽待飢,此治力者也。 +無邀正正之旗,勿擊堂堂之陣,此治變者也;故用兵之法,高陵勿向,背邱勿逆,佯北勿從,銳卒勿攻,餌兵勿食,歸師勿遏,圍師必闕,窮寇勿迫,此用兵之法也。 From 27c4a0f0416859254098bc9573326f8dc999b7ed Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 15 Aug 2024 12:24:47 -0400 Subject: [PATCH 3/3] each generation of table building should only consider up to self.n_symbols --- src/builder.rs | 4 ++-- src/find_longest/naive.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/builder.rs b/src/builder.rs index 7e865a8..43a6fd4 100644 --- a/src/builder.rs +++ 
b/src/builder.rs @@ -103,7 +103,7 @@ impl SymbolTable { fn optimize(&self, counters: Counter) -> Self { let mut res = SymbolTable::default(); let mut pqueue = BinaryHeap::new(); - for code1 in 0..511 { + for code1 in 0u16..(256u16 + self.n_symbols as u16) { let symbol1 = self.symbols[code1 as usize]; let gain = counters.count1(code1) * symbol1.len(); pqueue.push(Candidate { @@ -111,7 +111,7 @@ impl SymbolTable { gain, }); - for code2 in 0..511 { + for code2 in 0u16..(256u16 + self.n_symbols as u16) { let symbol2 = &self.symbols[code2 as usize]; // If either symbol is zero-length, or if merging would yield a symbol of // length greater than 8, skip. diff --git a/src/find_longest/naive.rs b/src/find_longest/naive.rs index c75ecad..c9add2d 100644 --- a/src/find_longest/naive.rs +++ b/src/find_longest/naive.rs @@ -15,7 +15,7 @@ impl FindLongestSymbol for SymbolTable { // Start with the code corresponding to the escape of the first character in the text let mut best_code = text[0] as u16; let mut best_overlap = 1; - for code in 256..511 { + for code in 256..(256 + self.n_symbols as u16) { let symbol = &self.symbols[code as usize]; if symbol.is_prefix(text) && symbol.len() > best_overlap { best_code = code;