-
Notifications
You must be signed in to change notification settings - Fork 220
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: fsst compression with mini-block #3121
Changes from all commits
4162272
a7f803d
afa1768
a392d13
a0ed83d
09e968c
2397c19
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,7 @@ use crate::encodings::physical::bitpack_fastlanes::{ | |
use crate::encodings::physical::block_compress::{CompressionConfig, CompressionScheme}; | ||
use crate::encodings::physical::dictionary::AlreadyDictionaryEncoder; | ||
use crate::encodings::physical::fixed_size_list::FslPerValueCompressor; | ||
use crate::encodings::physical::fsst::FsstArrayEncoder; | ||
use crate::encodings::physical::fsst::{FsstArrayEncoder, FsstMiniBlockEncoder}; | ||
use crate::encodings::physical::packed_struct::PackedStructEncoder; | ||
use crate::format::ProtobufUtils; | ||
use crate::repdef::RepDefBuilder; | ||
|
@@ -49,6 +49,7 @@ use crate::{ | |
}, | ||
format::pb, | ||
}; | ||
use fsst::fsst::{FSST_LEAST_INPUT_MAX_LENGTH, FSST_LEAST_INPUT_SIZE}; | ||
|
||
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus}; | ||
use std::collections::hash_map::RandomState; | ||
|
@@ -792,7 +793,7 @@ impl CompressionStrategy for CoreArrayEncodingStrategy { | |
if let DataBlock::FixedWidth(ref fixed_width_data) = data { | ||
let bit_widths = data | ||
.get_stat(Stat::BitWidth) | ||
.expect("FixedWidthDataBlock should have valid bit width statistics"); | ||
.expect("FixedWidthDataBlock should have valid `Stat::BitWidth` statistics"); | ||
// Temporary hack to work around https://github.com/lancedb/lance/issues/3102 | ||
// Ideally we should still be able to bit-pack here (either to 0 or 1 bit per value) | ||
let has_all_zeros = bit_widths | ||
|
@@ -811,6 +812,21 @@ impl CompressionStrategy for CoreArrayEncodingStrategy { | |
} | ||
if let DataBlock::VariableWidth(ref variable_width_data) = data { | ||
if variable_width_data.bits_per_offset == 32 { | ||
let data_size = variable_width_data.get_stat(Stat::DataSize).expect( | ||
"VariableWidth DataBlock should have valid `Stat::DataSize` statistics", | ||
); | ||
let data_size = data_size.as_primitive::<UInt64Type>().value(0); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also could be helpful to have a
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks for the suggestion! I will create a separate PR for this. |
||
|
||
let max_len = variable_width_data.get_stat(Stat::MaxLength).expect( | ||
"VariableWidth DataBlock should have valid `Stat::DataSize` statistics", | ||
); | ||
let max_len = max_len.as_primitive::<UInt64Type>().value(0); | ||
|
||
if max_len >= FSST_LEAST_INPUT_MAX_LENGTH | ||
&& data_size >= FSST_LEAST_INPUT_SIZE as u64 | ||
{ | ||
return Ok(Box::new(FsstMiniBlockEncoder::default())); | ||
} | ||
return Ok(Box::new(BinaryMiniBlockEncoder::default())); | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor nit: if you find yourself repeating the same
expect
statement again and again then maybe it would be worth it to make anexpect_stat
method which does theget_stat
/expect
combination.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ha, I actually didn't know about this, thanks for the suggestion. I will fill a separate PR for this.