Skip to content

Commit

Permalink
refactor(store): replace Rabin chunker with GearCDC implementation (#109)
Browse files Browse the repository at this point in the history

- Removed the unfinished RabinChunker implementation
- Added GearCDC chunker with rolling hash implementation
- Added comprehensive test suite for GearCDC
- Added constants for min/max/desired chunk sizes
- Added gear table for rolling hash computation
- Updated all references to use new constant name
  • Loading branch information
appcypher authored Jan 17, 2025
1 parent 51114d0 commit a6d3128
Show file tree
Hide file tree
Showing 10 changed files with 579 additions and 31 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions monofs/lib/store/flatfsstore.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ where

#[cfg(test)]
mod tests {
use monoutils_store::DEFAULT_CHUNK_MAX_SIZE;
use monoutils_store::DEFAULT_MAX_CHUNK_SIZE;
use std::fs;
use tokio::io::AsyncReadExt;

Expand Down Expand Up @@ -556,9 +556,9 @@ mod tests {
// Verify size limits from chunker
assert_eq!(
store.get_node_block_max_size(),
Some(DEFAULT_CHUNK_MAX_SIZE)
Some(DEFAULT_MAX_CHUNK_SIZE)
);
assert_eq!(store.get_raw_block_max_size(), Some(DEFAULT_CHUNK_MAX_SIZE));
assert_eq!(store.get_raw_block_max_size(), Some(DEFAULT_MAX_CHUNK_SIZE));

Ok(())
}
Expand Down
3 changes: 3 additions & 0 deletions monoutils-store/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@ thiserror.workspace = true
tokio = { workspace = true, features = ["sync"] }
tokio-util = { workspace = true, features = ["io"] }
monoutils.workspace = true

[dev-dependencies]
rand.workspace = true
81 changes: 79 additions & 2 deletions monoutils-store/lib/implementations/chunkers/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,82 @@
// Constants
//--------------------------------------------------------------------------------------------------

/// The default chunk size is 512 KiB.
pub const DEFAULT_CHUNK_MAX_SIZE: u64 = 512 * 1024;
/// The default maximum chunk size, in bytes (512 KiB).
///
/// NOTE(review): the alternative values floated in the TODOs on these three constants
/// (max 2 KiB, min 4 KiB, desired 6 MiB) would not satisfy min <= desired <= max, whereas
/// the current values (128 KiB <= 256 KiB <= 512 KiB) do — confirm intent before changing.
pub const DEFAULT_MAX_CHUNK_SIZE: u64 = 512 * 1024; // TODO: 2KiB ???

/// The default minimum chunk size, in bytes (128 KiB).
pub const DEFAULT_MIN_CHUNK_SIZE: u64 = 128 * 1024; // TODO: 4KiB ???

/// The default desired chunk size, in bytes (256 KiB).
///
/// With the current defaults this sits between the minimum and the maximum chunk size.
pub const DEFAULT_DESIRED_CHUNK_SIZE: u64 = 256 * 1024; // TODO: 6MiB ???

// const DEFAULT_ROLLING_HASH_MASK: u64 = 0x0000000000000000;

/// The default gear table: 256 64-bit values (one per possible byte value) used in the rolling hash computation.
#[rustfmt::skip]
pub static DEFAULT_GEAR_TABLE: [u64; 256] = [
0x934dc63f1a2fbc75, 0xe42f5a7b7364d1a7, 0xfcc3a557352ead66, 0x1eb038d4e6c75ae9,
0x9826d86994d27fd1, 0xdac0fcffe0399894, 0x2288ffe125735827, 0x0bd9fcd34c572187,
0xbce855206433c277, 0x2d59433a15e2089d, 0x5111df55ce5ba7b8, 0xf4e20ce31df535b6,
0xfee62fa9ae6a36dd, 0x5cc9a1fdb8e9d39f, 0xa1e2d19ff88c4738, 0x7b88cf0bec4ce823,
0x37b3f9cb3b3fac27, 0xb8fb20c023c6c8b2, 0x3b42660df117bfd0, 0xafea8a871b318682,
0x9ba9da395a5730eb, 0x5ba17f0d89de3973, 0x562168e510d6564a, 0x39adf42c51a542a5,
0xcb4d7e410618e4c3, 0x162519d991abcc83, 0xb37e627f7623a61e, 0x4f3ddbd91e5c87a9,
0x87e77c296adf3669, 0x587dd0305612722c, 0x063d37b3e59b6989, 0x368e661cf3434448,
0xa48702aea94a0f46, 0xf8a809dc15c1be18, 0xac677f6498eab68e, 0x77dc02e66a8876bf,
0xd102089ddf25dce2, 0x47365e75b22f7d4d, 0x8e87402901cd05ce, 0x7cdda092cc7f8fcb,
0xa1d26bade1302aa5, 0xcfcd30b90314e8c6, 0x1949238103ef41b6, 0x63fc3684a9cc872e,
0x16a3ac3cd5558592, 0x4ee1228377ea6d00, 0x17b7876d45f54350, 0xcd03b5232c70e911,
0xc7ef23d2a017c930, 0xb89dc625d3907010, 0x6e438f90f74f615a, 0x2b832703c82b2cea,
0x0f43485841d49906, 0xcd1bfdd09569132b, 0x95b9270e3e2e9c3a, 0x53b4ca21c14e940b,
0xa44869f296d575ac, 0x641f05ca74e0355f, 0x939eada2f8e26790, 0x8739f9c4f926c947,
0x09e7e0dd7a0195f2, 0x7cbde45676dec445, 0xe15b0ea0b16f9556, 0x5591406a6617099b,
0x9809f6ddf7c32b72, 0xa6b806dbb39fb230, 0xebf0d07769c874a9, 0xf07e343f653edfc9,
0x63ee20bc3fe1285a, 0xcb04eb23bd48b7e0, 0x4f61c9d4fd7eca50, 0x64c6fb630f22ac50,
0x142a1f4ea6e7072d, 0x7330ae35c334f2dc, 0xa30bcf22e0d963cf, 0x63eff1b02d9b9dd3,
0x6b37381dcb7a5d59, 0x545fcc5e4e6d7dab, 0x094f1ad34c3d1f29, 0x76ab9b8f4a0507b1,
0xcf9181092beb4cc8, 0x5be896ff8506448d, 0xda2a079def5e56a9, 0xcecea52cf7975aa8,
0xd903d0613a812353, 0x3ae7b86896c8e107, 0x5d893869ec46862a, 0x06dc65ec0f77cf08,
0x977bb100c58ea221, 0xe21729b833df1401, 0x63639e075487c549, 0x885a2ba929a53b0b,
0xd4f7253d68e1e693, 0x7b150a9734efe5c6, 0x2d1faf7174838274, 0x12f139bd8d761543,
0x1cac6aa3bdf8405c, 0x2b1f5cb61b014c0c, 0x6cbe1a0531ea6d45, 0xb038e76dacc2d37e,
0x87476df20ba6627b, 0x63bcd73d9bcf3854, 0xf8abfb21f28c88a3, 0xb8d1c0eaa2dc3a28,
0xf416fca050a2ed70, 0x396d53f5e495969a, 0x6093c46ed8df4b09, 0x2ce6ccb27b6da1ba,
0xd020db26beadd4fd, 0x1c60ede6b88fa4d8, 0xf9cbd1e60f5accea, 0xc9cbe32183cdf780,
0xbe4248f8e41e896e, 0xcfa0ae01e14b1c52, 0xa7c8c8d2400eea30, 0x229376ca34eaabf5,
0x3785ec931a7562b0, 0x19ad6f0aa76dda4a, 0x62461f0b1912bd8f, 0x7af96db1a84b1373,
0x034e7d3e5547fd35, 0x3f0bfd17853f403c, 0x27dcf6026e19d44f, 0x3d658c7ebf2c03fe,
0x451a37679de76be3, 0xa3f32da1a1559eac, 0xa6d466ee50c1f808, 0xd2166409b67ade33,
0x57517e23a251c969, 0xda9f98fbb946750c, 0xef05aa950c0165b7, 0xa86aab56733d1d5a,
0x314546298548db64, 0x4d13562cbf14abda, 0x65aa6e478ef65b80, 0x2b2886ecf0d83d62,
0x1f957dc7d7e776f5, 0xe9444f727c6c3928, 0xdcceba0a933bce23, 0x4e0b955478834d48,
0xdbb91b8b0585c263, 0xadc16e742df8698d, 0x1de068add4dddaf2, 0xc00de3921cb3097f,
0x12f134c19342ef64, 0x1fdf38bbc41c16b8, 0xfebd3b2f77ca7a26, 0x99da3b50c0dd30cf,
0xc79a3a8b284e2e77, 0x629cb257ac037762, 0x8abbf035d0bb69b0, 0x4adbfae80a85b75f,
0xc8869cd11e3167a0, 0x550efebb4b07f3ad, 0x3b7f9e1f7cc60e23, 0xca56904382892e6f,
0x0fadda765c741fdd, 0x81e9daf42ef32abb, 0x1fd30c90e4286783, 0xe7ca9bcaee7d4ea7,
0x1b79ead70ba01456, 0x7d7eab8593b24d9f, 0x88690a1b301315df, 0x454eca065a5051db,
0xca4fd9d61385e539, 0x0039e2c776ecef6c, 0xae68842ca70aea0b, 0x863a14316d601bf9,
0xb8324636dc05022b, 0xaec62a278bded5da, 0xc77e48036be43f9c, 0x08150dec6308334b,
0xafec69cc55acc2af, 0xbdbec85be0190e88, 0x584cba9a7899b3a7, 0x66691ba4c9628ca6,
0x3e0edf7ed52faed3, 0x19e0e2487da291f7, 0x86b8f4d78569352d, 0x5d4d7dc9c206d788,
0x608769243d4b3f55, 0x6a0566f48dcb7e90, 0x176fdeaca7205a42, 0x9435254b909d1679,
0xe9a7d0c6377b9c39, 0x34b9a1eb78c34c7b, 0x871855f22bc99139, 0xe268ffef1f7f508f,
0x5f47591fbbb42814, 0x9d795f85f4169dcc, 0xdfba9b3e0fd46412, 0xf85b0e4d4096b048,
0x444a80bc63e04952, 0xc78dc90c7c5eceea, 0x7e99c7d3ac75ae93, 0x7fdf884d3f190ca3,
0xf687d347814186ce, 0x63ef30025d6ca1a7, 0xb0d8bb4b66bef836, 0x5b52ef53be745d40,
0xef31e39969d73dbc, 0xcc93d3ba46730343, 0x9f6628e2bf0d3176, 0xf403b20cf8f06505,
0x335c5996e49be86f, 0xb2f5feae2de888a3, 0x1250dcef56cb0b11, 0xbb2cf97d8074a80f,
0x6edd77ae233db9fc, 0x8c0b5dd112fd375b, 0x95f11213c18559e0, 0xc5d0da638b1d16c1,
0xbe3919a9dc1c940c, 0x2c42bd5a5fb58291, 0x29349a51ffd60426, 0x0268f6100054560b,
0x222767d096123693, 0x6eebef90a17e8771, 0x88ab85f7c1e03711, 0xf24725946153b4d4,
0x4e2e482cd2b128c7, 0x9b33480e5905029e, 0x13a82c68d4ed7d6d, 0x896f727adcdd1607,
0x084182eafda7e6d7, 0xf8d3761f72a21cd3, 0xafb1d5dd9406a46c, 0xac614b3a434c44bb,
0xf0c2e04823152cd7, 0xfc0bc19a0ba17bcb, 0xe37cb42de36f49ea, 0x5edd06863e9f2488,
0xd1242ef181256605, 0xc3c8b875cc5d7483, 0xac75a2c774350d19, 0xa918f4209d2d1219,
0x70cf0f8d1e95c41f, 0xe2c063b89ceb35f2, 0x6c3b8e5b1dba4cd1, 0x848399c9cc10d552,
0x63f6cb62d5f088a8, 0x3fa09e3bad673bd8, 0xbafc8915513554c4, 0xe062541c7d11ba74,
0xd2a45c35598a5c8d, 0x8df580afac8f80d6, 0x589735cd31830eaf, 0x005533d654d3b4b2,
0xb8d133c02207fa33, 0xecbfffd864b2e87c, 0xb240770c24e5cf38, 0xaf8c8783788bb279,
0x47d0d26806edfd85, 0x8f218e6cc4a793c6, 0xd33257b8e2091db6, 0xf3cc67629aff40d3,
];
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,56 @@ use tokio::io::AsyncRead;

use crate::{Chunker, StoreResult};

use super::DEFAULT_CHUNK_MAX_SIZE;
use super::{
constants::DEFAULT_MAX_CHUNK_SIZE, DEFAULT_DESIRED_CHUNK_SIZE, DEFAULT_GEAR_TABLE,
DEFAULT_MIN_CHUNK_SIZE,
};

//--------------------------------------------------------------------------------------------------
// Types
//--------------------------------------------------------------------------------------------------

/// A chunker that splits data into variable-size chunks using the Rabin fingerprinting algorithm.
///
/// The `RabinChunker` leverages the Rabin fingerprinting technique to produce chunks of data with
/// variable sizes. This algorithm is particularly effective for identifying duplicate content within
/// files, as well as across different files, by creating consistent chunk boundaries. The resulting
/// chunks are then processed and stored in an IPLD form.
pub struct RabinChunker {
/// The size of each chunk.
chunk_size: u64,
/// A chunker that splits data into variable-size chunks using the FastCDC algorithm.
///
/// The struct only holds the tuning parameters of the chunker: the gear table and the
/// desired, minimum and maximum chunk sizes. It derives `Debug` and `Clone` so that a
/// configured chunker can be logged and duplicated like other public types.
#[derive(Debug, Clone)]
pub struct FastCDC {
    /// The gear table: 256 64-bit values used by the rolling hash.
    gear_table: [u64; 256],

    /// The desired chunk size, in bytes.
    desired_chunk_size: u64,

    /// The minimum size of each chunk, in bytes.
    min_chunk_size: u64,

    /// The maximum size of each chunk, in bytes.
    max_chunk_size: u64,
}

//--------------------------------------------------------------------------------------------------
// Methods
//--------------------------------------------------------------------------------------------------

impl RabinChunker {
/// Creates a new `RabinChunker` with the given `chunk_size`.
pub fn new(chunk_size: u64) -> Self {
Self { chunk_size }
impl FastCDC {
/// Creates a new `FastCDC` with the given `min_size` and `max_size`.
pub fn new(
desired_chunk_size: u64,
min_chunk_size: u64,
max_chunk_size: u64,
gear_table: [u64; 256],
) -> Self {
Self {
gear_table,
desired_chunk_size,
min_chunk_size,
max_chunk_size,
}
}
}

//--------------------------------------------------------------------------------------------------
// Trait Implementations
//--------------------------------------------------------------------------------------------------

impl Chunker for RabinChunker {
impl Chunker for FastCDC {
async fn chunk<'a>(
&self,
_reader: impl AsyncRead + Send + 'a,
Expand All @@ -46,12 +63,17 @@ impl Chunker for RabinChunker {
}

fn chunk_max_size(&self) -> Option<u64> {
Some(self.chunk_size)
Some(self.max_chunk_size)
}
}

impl Default for RabinChunker {
impl Default for FastCDC {
fn default() -> Self {
Self::new(DEFAULT_CHUNK_MAX_SIZE)
Self::new(
DEFAULT_DESIRED_CHUNK_SIZE,
DEFAULT_MIN_CHUNK_SIZE,
DEFAULT_MAX_CHUNK_SIZE,
DEFAULT_GEAR_TABLE,
)
}
}
4 changes: 2 additions & 2 deletions monoutils-store/lib/implementations/chunkers/fixed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use tokio::io::{AsyncRead, AsyncReadExt};

use crate::{Chunker, StoreError, StoreResult};

use super::DEFAULT_CHUNK_MAX_SIZE;
use super::constants::DEFAULT_MAX_CHUNK_SIZE;

//--------------------------------------------------------------------------------------------------
// Types
Expand Down Expand Up @@ -71,7 +71,7 @@ impl Chunker for FixedSizeChunker {

impl Default for FixedSizeChunker {
fn default() -> Self {
Self::new(DEFAULT_CHUNK_MAX_SIZE)
Self::new(DEFAULT_MAX_CHUNK_SIZE)
}
}

Expand Down
Loading

0 comments on commit a6d3128

Please sign in to comment.