From f047dd7d22f7eebf44218a25c5ff5a59be839c22 Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 31 Aug 2023 09:34:26 +0900 Subject: [PATCH 1/8] algebraic simplification --- src/bloom_filter/bloom_test.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/bloom_filter/bloom_test.rs b/src/bloom_filter/bloom_test.rs index 15f375d8..e5cb69a4 100644 --- a/src/bloom_filter/bloom_test.rs +++ b/src/bloom_filter/bloom_test.rs @@ -40,16 +40,15 @@ mod tests { #[test] fn bloom_suggest_size() { + use std::f64::consts::LN_2; // it's hard to derive this exactly since the algorithm is doing closest power of 2 // instead of exact theoretical optimum let expected_elements = 1_000_000; - let desired_false_positive_rate = 0.0001 as f64; - let theoretical_optimum = ((expected_elements as f64 * desired_false_positive_rate.ln()) - / f64::ln(1.0 / 2.0f64.powf(2.0f64.ln()))) - .ceil() - .div_euclid(8f64) as usize; - let suggested_size = - BloomFilter::suggest_size_in_bytes(expected_elements, desired_false_positive_rate); + let target_fp_rate = 0.0001 as f64; + let theoretical_optimum = (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) + .ceil() + .div_euclid(8f64) as usize; + let suggested_size = BloomFilter::suggest_size_in_bytes(expected_elements, target_fp_rate); assert_eq!(suggested_size, 4_194_304); assert_eq!(suggested_size, theoretical_optimum.next_power_of_two()) } From 922b53ea2c970408f21e074c3878ac84c7d2f61f Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 31 Aug 2023 09:37:31 +0900 Subject: [PATCH 2/8] function encapsulation --- src/bloom_filter/bloom_test.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/bloom_filter/bloom_test.rs b/src/bloom_filter/bloom_test.rs index e5cb69a4..f450627c 100644 --- a/src/bloom_filter/bloom_test.rs +++ b/src/bloom_filter/bloom_test.rs @@ -40,14 +40,18 @@ mod tests { #[test] fn bloom_suggest_size() { - use std::f64::consts::LN_2; + // it's hard to derive this 
exactly since the algorithm is doing closest power of 2 // instead of exact theoretical optimum let expected_elements = 1_000_000; let target_fp_rate = 0.0001 as f64; - let theoretical_optimum = (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) + fn simplified_suggest_size (expected_elements: usize, target_fp_rate: f64) { + use std::f64::consts::LN_2; + (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) .ceil() - .div_euclid(8f64) as usize; + .div_euclid(8f64) as usize + } + let theoretical_optimum = simplified_suggest_size(expected_elements, target_fp_rate); let suggested_size = BloomFilter::suggest_size_in_bytes(expected_elements, target_fp_rate); assert_eq!(suggested_size, 4_194_304); assert_eq!(suggested_size, theoretical_optimum.next_power_of_two()) From c409219a11c8573d283728a93067d796674b7dc1 Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 31 Aug 2023 09:56:02 +0900 Subject: [PATCH 3/8] further isolation --- src/bloom_filter/bloom_test.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/bloom_filter/bloom_test.rs b/src/bloom_filter/bloom_test.rs index f450627c..e0b5a8f5 100644 --- a/src/bloom_filter/bloom_test.rs +++ b/src/bloom_filter/bloom_test.rs @@ -9,6 +9,14 @@ mod tests { // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); // k = round((m / n) * log(2)); + fn simplified_suggest_size(expected_elements: usize, target_fp_rate: f64) -> usize { + // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); + use std::f64::consts::LN_2; + (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) + .ceil() + .div_euclid(8.0) as usize + } + #[test] fn bloom_optimal_hasher_number() { let size_in_bytes = 1_000_000_000; @@ -40,17 +48,11 @@ mod tests { #[test] fn bloom_suggest_size() { - // it's hard to derive this exactly since the algorithm is doing closest power of 2 // instead of exact theoretical optimum let expected_elements = 1_000_000; let target_fp_rate = 0.0001 as f64; - fn 
simplified_suggest_size (expected_elements: usize, target_fp_rate: f64) { - use std::f64::consts::LN_2; - (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) - .ceil() - .div_euclid(8f64) as usize - } + let theoretical_optimum = simplified_suggest_size(expected_elements, target_fp_rate); let suggested_size = BloomFilter::suggest_size_in_bytes(expected_elements, target_fp_rate); assert_eq!(suggested_size, 4_194_304); From cb903969d9e3734b8e89006e1b0e933b51106cfa Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 31 Aug 2023 10:03:01 +0900 Subject: [PATCH 4/8] simplify test we are rarely interested in the theoretical optimum value, just the returned value from the suggesting function --- src/bloom_filter/bloom_test.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/bloom_filter/bloom_test.rs b/src/bloom_filter/bloom_test.rs index e0b5a8f5..54d68ba8 100644 --- a/src/bloom_filter/bloom_test.rs +++ b/src/bloom_filter/bloom_test.rs @@ -12,9 +12,10 @@ mod tests { fn simplified_suggest_size(expected_elements: usize, target_fp_rate: f64) -> usize { // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); use std::f64::consts::LN_2; - (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) + let theoretical_optimum = (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) .ceil() - .div_euclid(8.0) as usize + .div_euclid(8.0) as usize; + theoretical_optimum.next_power_of_two() } #[test] @@ -52,10 +53,10 @@ mod tests { // instead of exact theoretical optimum let expected_elements = 1_000_000; let target_fp_rate = 0.0001 as f64; - - let theoretical_optimum = simplified_suggest_size(expected_elements, target_fp_rate); + + let simplified_suggest_size = simplified_suggest_size(expected_elements, target_fp_rate); let suggested_size = BloomFilter::suggest_size_in_bytes(expected_elements, target_fp_rate); assert_eq!(suggested_size, 4_194_304); - assert_eq!(suggested_size, theoretical_optimum.next_power_of_two()) + 
assert_eq!(suggested_size, simplified_suggest_size) } } From d538b8badc917f4b1c9169d7de2a4afccd53d11b Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 31 Aug 2023 10:23:23 +0900 Subject: [PATCH 5/8] refactor bloom_math this doesn't have to be part of the main for the code and they contain quite a bit of complexity and interdependency so I factored them out. --- src/bloom_filter.rs | 37 +---------- src/bloom_filter/bloom_math.rs | 45 ++++++++++++++ src/bloom_filter/bloom_test.rs | 108 ++++++++++++++++----------------- 3 files changed, 100 insertions(+), 90 deletions(-) create mode 100644 src/bloom_filter/bloom_math.rs diff --git a/src/bloom_filter.rs b/src/bloom_filter.rs index f7805e4c..3fe588a9 100644 --- a/src/bloom_filter.rs +++ b/src/bloom_filter.rs @@ -11,7 +11,9 @@ use std::mem::size_of; use std::path::PathBuf; use std::sync::atomic::{AtomicU32, Ordering}; +mod bloom_math; mod bloom_test; + // A thread-safe bloom filter. pub struct BloomFilter { bits: Vec, @@ -25,41 +27,6 @@ impl BloomFilter { const MAGIC: u32 = 0x81F0F117; const VERSION: u32 = 1; - pub fn optimal_number_of_hashers(size_in_bytes: usize, expected_elements: usize) -> usize { - let expected_elements = expected_elements as f64; - let size_in_bits = (size_in_bytes * 8) as f64; - let k = (size_in_bits / expected_elements) * (2.0f64.ln()); - k.ceil() as usize - } - - pub fn prob_of_false_positive( - size_in_bytes: usize, - expected_elements: usize, - num_hashers: usize, - ) -> f64 { - let k = num_hashers as f64; - let m = (size_in_bytes * 8) as f64; - let n = expected_elements as f64; - (1.0 - (1.0 - (1.0 / m)).powf(k * n)).powf(k) - } - - pub fn suggest_size_in_bytes( - expected_elements: usize, - desired_false_positive_rate: f64, - ) -> usize { - let mut size_in_bytes = 1024 * 1024; - while size_in_bytes < usize::MAX / 2 - && Self::prob_of_false_positive( - size_in_bytes, - expected_elements, - Self::optimal_number_of_hashers(size_in_bytes, expected_elements), - ) > desired_false_positive_rate - 
{ - size_in_bytes *= 2; - } - size_in_bytes - } - #[allow(dead_code)] pub fn my_prob_of_false_positive(&self, expected_elements: usize) -> f64 { Self::prob_of_false_positive( diff --git a/src/bloom_filter/bloom_math.rs b/src/bloom_filter/bloom_math.rs new file mode 100644 index 00000000..01b320b4 --- /dev/null +++ b/src/bloom_filter/bloom_math.rs @@ -0,0 +1,45 @@ +use super::BloomFilter; + +impl BloomFilter { + // n: number of items in filter. p: false positive rate + // m: number of bits in filter. k: number of hashers + // n = ceil(m / (-k / log(1 - exp(log(p) / k)))) + // p = pow(1 - exp(-k / (m / n)), k) + // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); + // k = round((m / n) * log(2)); + + pub fn optimal_number_of_hashers(size_in_bytes: usize, expected_elements: usize) -> usize { + let expected_elements = expected_elements as f64; + let size_in_bits = (size_in_bytes * 8) as f64; + let k = (size_in_bits / expected_elements) * (2.0f64.ln()); + k.ceil() as usize + } + + pub fn prob_of_false_positive( + size_in_bytes: usize, + expected_elements: usize, + num_hashers: usize, + ) -> f64 { + let k = num_hashers as f64; + let m = (size_in_bytes * 8) as f64; + let n = expected_elements as f64; + (1.0 - (1.0 - (1.0 / m)).powf(k * n)).powf(k) + } + + pub fn suggest_size_in_bytes( + expected_elements: usize, + desired_false_positive_rate: f64, + ) -> usize { + let mut size_in_bytes = 1024 * 1024; + while size_in_bytes < usize::MAX / 2 + && Self::prob_of_false_positive( + size_in_bytes, + expected_elements, + Self::optimal_number_of_hashers(size_in_bytes, expected_elements), + ) > desired_false_positive_rate + { + size_in_bytes *= 2; + } + size_in_bytes + } +} diff --git a/src/bloom_filter/bloom_test.rs b/src/bloom_filter/bloom_test.rs index 54d68ba8..0c7e0426 100644 --- a/src/bloom_filter/bloom_test.rs +++ b/src/bloom_filter/bloom_test.rs @@ -1,62 +1,60 @@ #[cfg(test)] -mod tests { - use super::super::BloomFilter; +use super::BloomFilter; +// n: number of items in 
filter. p: false positive rate +// m: number of bits in filter. k: number of hashers +// n = ceil(m / (-k / log(1 - exp(log(p) / k)))) +// p = pow(1 - exp(-k / (m / n)), k) +// m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); +// k = round((m / n) * log(2)); - // n: number of items in filter. p: false positive rate - // m: number of bits in filter. k: number of hashers - // n = ceil(m / (-k / log(1 - exp(log(p) / k)))) - // p = pow(1 - exp(-k / (m / n)), k) +#[cfg(test)] +pub fn simplified_suggest_size(expected_elements: usize, target_fp_rate: f64) -> usize { // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); - // k = round((m / n) * log(2)); - - fn simplified_suggest_size(expected_elements: usize, target_fp_rate: f64) -> usize { - // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); - use std::f64::consts::LN_2; - let theoretical_optimum = (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) - .ceil() - .div_euclid(8.0) as usize; - theoretical_optimum.next_power_of_two() - } + use std::f64::consts::LN_2; + let theoretical_optimum = (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) + .ceil() + .div_euclid(8.0) as usize; + theoretical_optimum.next_power_of_two() +} - #[test] - fn bloom_optimal_hasher_number() { - let size_in_bytes = 1_000_000_000; - let expected_elements = 1_000_000_000; - assert_eq!( - BloomFilter::optimal_number_of_hashers(size_in_bytes, expected_elements), - 6 - ); - assert_eq!( - BloomFilter::optimal_number_of_hashers(1_000_000, 500_000), - 12 - ) - } - #[test] - fn bloom_test_prob_of_false_positive() { - // calculated from https://hur.st/bloomfilter/ - let size_in_bytes = 1_000_000_000; - let expected_elements = 1_000_000_000; - let num_hashers = 8; - assert_eq!( - BloomFilter::prob_of_false_positive(size_in_bytes, expected_elements, num_hashers), - 0.025_491_740_593_406_025 as f64 - ); - assert_eq!( - BloomFilter::prob_of_false_positive(1_048_576, 524288, 2), - 0.013_806_979_447_406_826 as f64 - ) - } +#[test] 
+fn bloom_optimal_hasher_number() { + let size_in_bytes = 1_000_000_000; + let expected_elements = 1_000_000_000; + assert_eq!( + BloomFilter::optimal_number_of_hashers(size_in_bytes, expected_elements), + 6 + ); + assert_eq!( + BloomFilter::optimal_number_of_hashers(1_000_000, 500_000), + 12 + ) +} +#[test] +fn bloom_test_prob_of_false_positive() { + // calculated from https://hur.st/bloomfilter/ + let size_in_bytes = 1_000_000_000; + let expected_elements = 1_000_000_000; + let num_hashers = 8; + assert_eq!( + BloomFilter::prob_of_false_positive(size_in_bytes, expected_elements, num_hashers), + 0.025_491_740_593_406_025 as f64 + ); + assert_eq!( + BloomFilter::prob_of_false_positive(1_048_576, 524288, 2), + 0.013_806_979_447_406_826 as f64 + ) +} - #[test] - fn bloom_suggest_size() { - // it's hard to derive this exactly since the algorithm is doing closest power of 2 - // instead of exact theoretical optimum - let expected_elements = 1_000_000; - let target_fp_rate = 0.0001 as f64; +#[test] +fn bloom_suggest_size() { + // it's hard to derive this exactly since the algorithm is doing closest power of 2 + // instead of exact theoretical optimum + let expected_elements = 1_000_000; + let target_fp_rate = 0.0001 as f64; - let simplified_suggest_size = simplified_suggest_size(expected_elements, target_fp_rate); - let suggested_size = BloomFilter::suggest_size_in_bytes(expected_elements, target_fp_rate); - assert_eq!(suggested_size, 4_194_304); - assert_eq!(suggested_size, simplified_suggest_size) - } + let simplified_suggest_size = simplified_suggest_size(expected_elements, target_fp_rate); + let suggested_size = BloomFilter::suggest_size_in_bytes(expected_elements, target_fp_rate); + assert_eq!(suggested_size, 4_194_304); + assert_eq!(suggested_size, simplified_suggest_size) } From 3ad79904595243c57d748d1fa5f0d797f210b9b8 Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 31 Aug 2023 12:25:47 +0900 Subject: [PATCH 6/8] `bloom_math` documentation --- 
src/bloom_filter/bloom_math.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/bloom_filter/bloom_math.rs b/src/bloom_filter/bloom_math.rs index 01b320b4..372eadff 100644 --- a/src/bloom_filter/bloom_math.rs +++ b/src/bloom_filter/bloom_math.rs @@ -1,17 +1,23 @@ use super::BloomFilter; impl BloomFilter { - // n: number of items in filter. p: false positive rate - // m: number of bits in filter. k: number of hashers + // Technically, we need 3 out of 4 to calculate the other. + // But we often want many of these to be either minimum itself or to allow other values to be minima. + // So we often only need 2 out of 4 to calculate the other two values. + // n: number of items (expected) in filter. + // p: (target) false positive rate + // m: number of bits in filter. + // k: number of hashers // n = ceil(m / (-k / log(1 - exp(log(p) / k)))) // p = pow(1 - exp(-k / (m / n)), k) // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); // k = round((m / n) * log(2)); pub fn optimal_number_of_hashers(size_in_bytes: usize, expected_elements: usize) -> usize { - let expected_elements = expected_elements as f64; - let size_in_bits = (size_in_bytes * 8) as f64; - let k = (size_in_bits / expected_elements) * (2.0f64.ln()); + use std::f64::consts::LN_2; + let n = expected_elements as f64; + let m = (size_in_bytes * 8) as f64; + let k = (m / n) * (LN_2); k.ceil() as usize } @@ -20,9 +26,9 @@ impl BloomFilter { expected_elements: usize, num_hashers: usize, ) -> f64 { - let k = num_hashers as f64; - let m = (size_in_bytes * 8) as f64; let n = expected_elements as f64; + let m = (size_in_bytes * 8) as f64; + let k = num_hashers as f64; (1.0 - (1.0 - (1.0 / m)).powf(k * n)).powf(k) } @@ -30,8 +36,10 @@ impl BloomFilter { expected_elements: usize, desired_false_positive_rate: f64, ) -> usize { - let mut size_in_bytes = 1024 * 1024; - while size_in_bytes < usize::MAX / 2 + let min_size: usize = 1 << 20; // 1MiB + let max_size: usize = 
usize::MAX / 2; // 9E18 bytes 8exbi-bytes + let mut size_in_bytes = min_size; + while size_in_bytes < max_size && Self::prob_of_false_positive( size_in_bytes, expected_elements, From fafa7f5382710049e3671a07962f5885f183bb8a Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 31 Aug 2023 12:26:16 +0900 Subject: [PATCH 7/8] implement missing behavior --- src/bloom_filter/bloom_test.rs | 52 +++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/src/bloom_filter/bloom_test.rs b/src/bloom_filter/bloom_test.rs index 0c7e0426..0fa202a1 100644 --- a/src/bloom_filter/bloom_test.rs +++ b/src/bloom_filter/bloom_test.rs @@ -10,11 +10,16 @@ use super::BloomFilter; #[cfg(test)] pub fn simplified_suggest_size(expected_elements: usize, target_fp_rate: f64) -> usize { // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); + use std::cmp::{max, min}; use std::f64::consts::LN_2; let theoretical_optimum = (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) .ceil() .div_euclid(8.0) as usize; - theoretical_optimum.next_power_of_two() + let suggested_size = theoretical_optimum.next_power_of_two(); + + let min_size: usize = 1 << 20; //1 MiB + let max_size: usize = usize::MAX / 2; // 9E18 bytes 8exbi-bytes + min(max(suggested_size, min_size), max_size) } #[test] @@ -50,11 +55,44 @@ fn bloom_test_prob_of_false_positive() { fn bloom_suggest_size() { // it's hard to derive this exactly since the algorithm is doing closest power of 2 // instead of exact theoretical optimum - let expected_elements = 1_000_000; - let target_fp_rate = 0.0001 as f64; - let simplified_suggest_size = simplified_suggest_size(expected_elements, target_fp_rate); - let suggested_size = BloomFilter::suggest_size_in_bytes(expected_elements, target_fp_rate); - assert_eq!(suggested_size, 4_194_304); - assert_eq!(suggested_size, simplified_suggest_size) + // Define a struct to hold test case data + struct TestCase { + elements: usize, + fp_rate: f64, + expected_size: 
usize, + } + + // Create a vector of test cases + let test_cases = vec![ + // test for minimum size + TestCase { + elements: 4_000, + fp_rate: 1E-7, + expected_size: 1024 * 1024, + }, + // test for average size + TestCase { + elements: 1_000_000, + fp_rate: 1E-4, + expected_size: 4_194_304, + }, + // Add more test cases here as needed + ]; + + for test_case in test_cases { + let tested_size = BloomFilter::suggest_size_in_bytes(test_case.elements, test_case.fp_rate); + let simplified_size = simplified_suggest_size(test_case.elements, test_case.fp_rate); + + assert_eq!( + tested_size, test_case.expected_size, + "Failed for elements: {}, fp_rate: {}", + test_case.elements, test_case.fp_rate + ); + assert_eq!( + tested_size, simplified_size, + "Failed for elements: {}, fp_rate: {}", + test_case.elements, test_case.fp_rate + ); + } } From ea447ee308b33e5a3ee05998a2336a2576b26c48 Mon Sep 17 00:00:00 2001 From: Chris Ha Date: Thu, 31 Aug 2023 12:58:28 +0900 Subject: [PATCH 8/8] simplify with trait function `v1.clamp(min,max)` --- src/bloom_filter/bloom_test.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/bloom_filter/bloom_test.rs b/src/bloom_filter/bloom_test.rs index 0fa202a1..8208c765 100644 --- a/src/bloom_filter/bloom_test.rs +++ b/src/bloom_filter/bloom_test.rs @@ -10,7 +10,6 @@ use super::BloomFilter; #[cfg(test)] pub fn simplified_suggest_size(expected_elements: usize, target_fp_rate: f64) -> usize { // m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); - use std::cmp::{max, min}; use std::f64::consts::LN_2; let theoretical_optimum = (expected_elements as f64 * target_fp_rate.ln() / (-LN_2 * LN_2)) .ceil() @@ -19,7 +18,7 @@ pub fn simplified_suggest_size(expected_elements: usize, target_fp_rate: f64) -> let min_size: usize = 1 << 20; //1 MiB let max_size: usize = usize::MAX / 2; // 9E18 bytes 8exbi-bytes - min(max(suggested_size, min_size), max_size) + suggested_size.clamp(min_size, max_size) } #[test]