-
Notifications
You must be signed in to change notification settings - Fork 853
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use LevelHistogram
in PageIndex
#6135
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -569,7 +569,7 @@ pub struct ColumnChunkMetaData { | |||||||||||
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the | ||||||||||||
/// number of rows with level 1, and so on. | ||||||||||||
/// | ||||||||||||
#[derive(Debug, Clone, PartialEq)] | ||||||||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)] | ||||||||||||
pub struct LevelHistogram { | ||||||||||||
inner: Vec<i64>, | ||||||||||||
} | ||||||||||||
|
@@ -620,6 +620,12 @@ impl LevelHistogram { | |||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
/// Appends values from the other histogram to this histogram | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I found the "append" notion quite confusing which is why I didn't put it in this struct at first Basically without append we have the invariant that Using append breaks that invariant. If we are going to keep it, maybe we can add some more comments like
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, appending to a histogram makes it no longer a histogram :P I'll revert this part. |
||||||||||||
pub fn append(&mut self, other: &Self) { | ||||||||||||
self.inner.reserve(other.len()); | ||||||||||||
self.inner.extend(other.values()); | ||||||||||||
} | ||||||||||||
|
||||||||||||
/// return the length of the histogram | ||||||||||||
pub fn len(&self) -> usize { | ||||||||||||
self.inner.len() | ||||||||||||
|
@@ -1169,9 +1175,9 @@ pub struct ColumnIndexBuilder { | |||||||||||
null_counts: Vec<i64>, | ||||||||||||
boundary_order: BoundaryOrder, | ||||||||||||
/// contains the concatenation of the histograms of all pages | ||||||||||||
repetition_level_histograms: Option<Vec<i64>>, | ||||||||||||
repetition_level_histograms: Option<LevelHistogram>, | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because |
||||||||||||
/// contains the concatenation of the histograms of all pages | ||||||||||||
definition_level_histograms: Option<Vec<i64>>, | ||||||||||||
definition_level_histograms: Option<LevelHistogram>, | ||||||||||||
/// Is the information in the builder valid? | ||||||||||||
/// | ||||||||||||
/// Set to `false` if any entry in the page doesn't have statistics for | ||||||||||||
|
@@ -1227,14 +1233,16 @@ impl ColumnIndexBuilder { | |||||||||||
return; | ||||||||||||
} | ||||||||||||
if let Some(ref rep_lvl_hist) = repetition_level_histogram { | ||||||||||||
let hist = self.repetition_level_histograms.get_or_insert(Vec::new()); | ||||||||||||
hist.reserve(rep_lvl_hist.len()); | ||||||||||||
hist.extend(rep_lvl_hist.values()); | ||||||||||||
let hist = self | ||||||||||||
.repetition_level_histograms | ||||||||||||
.get_or_insert(LevelHistogram::default()); | ||||||||||||
hist.append(rep_lvl_hist); | ||||||||||||
} | ||||||||||||
if let Some(ref def_lvl_hist) = definition_level_histogram { | ||||||||||||
let hist = self.definition_level_histograms.get_or_insert(Vec::new()); | ||||||||||||
hist.reserve(def_lvl_hist.len()); | ||||||||||||
hist.extend(def_lvl_hist.values()); | ||||||||||||
let hist = self | ||||||||||||
.definition_level_histograms | ||||||||||||
.get_or_insert(LevelHistogram::default()); | ||||||||||||
hist.append(def_lvl_hist); | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
|
@@ -1256,14 +1264,16 @@ impl ColumnIndexBuilder { | |||||||||||
/// | ||||||||||||
/// Note: callers should check [`Self::valid`] before calling this method | ||||||||||||
pub fn build_to_thrift(self) -> ColumnIndex { | ||||||||||||
let repetition_level_histograms = self.repetition_level_histograms.map(|x| x.into_inner()); | ||||||||||||
let definition_level_histograms = self.definition_level_histograms.map(|x| x.into_inner()); | ||||||||||||
ColumnIndex::new( | ||||||||||||
self.null_pages, | ||||||||||||
self.min_values, | ||||||||||||
self.max_values, | ||||||||||||
self.boundary_order, | ||||||||||||
self.null_counts, | ||||||||||||
self.repetition_level_histograms, | ||||||||||||
self.definition_level_histograms, | ||||||||||||
repetition_level_histograms, | ||||||||||||
definition_level_histograms, | ||||||||||||
) | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ use crate::basic::Type; | |
use crate::data_type::private::ParquetValueType; | ||
use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96}; | ||
use crate::errors::ParquetError; | ||
use crate::file::metadata::LevelHistogram; | ||
use crate::format::{BoundaryOrder, ColumnIndex}; | ||
use crate::util::bit_util::from_le_slice; | ||
use std::fmt::Debug; | ||
|
@@ -40,13 +41,13 @@ pub struct PageIndex<T> { | |
/// | ||
/// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`. | ||
/// For example, `repetition_level_histogram[0]` indicates how many rows the page contains. | ||
pub repetition_level_histogram: Option<Vec<i64>>, | ||
pub repetition_level_histogram: Option<LevelHistogram>, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 this is a good one -- I missed this |
||
/// Definition level histogram for the page | ||
/// | ||
/// `definition_level_histogram[i]` is a count of how many values are at definition level `i`. | ||
/// For example, `definition_level_histogram[max_definition_level]` indicates how many | ||
/// non-null values are present in the page. | ||
pub definition_level_histogram: Option<Vec<i64>>, | ||
pub definition_level_histogram: Option<LevelHistogram>, | ||
} | ||
|
||
impl<T> PageIndex<T> { | ||
|
@@ -59,10 +60,10 @@ impl<T> PageIndex<T> { | |
pub fn null_count(&self) -> Option<i64> { | ||
self.null_count | ||
} | ||
pub fn repetition_level_histogram(&self) -> Option<&Vec<i64>> { | ||
pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> { | ||
self.repetition_level_histogram.as_ref() | ||
} | ||
pub fn definition_level_histogram(&self) -> Option<&Vec<i64>> { | ||
pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> { | ||
self.definition_level_histogram.as_ref() | ||
} | ||
} | ||
|
@@ -175,17 +176,17 @@ impl<T: ParquetValueType> NativeIndex<T> { | |
for i in 0..len { | ||
let page_idx = i * num_levels; | ||
let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); | ||
res.push(Some(page_hist)); | ||
res.push(Some(LevelHistogram::from(page_hist))); | ||
} | ||
res | ||
} else { | ||
vec![None; len] | ||
} | ||
}; | ||
|
||
let rep_hists: Vec<Option<Vec<i64>>> = | ||
let rep_hists: Vec<Option<LevelHistogram>> = | ||
to_page_histograms(index.repetition_level_histograms); | ||
let def_hists: Vec<Option<Vec<i64>>> = | ||
let def_hists: Vec<Option<LevelHistogram>> = | ||
to_page_histograms(index.definition_level_histograms); | ||
|
||
let indexes = index | ||
|
@@ -236,19 +237,22 @@ mod tests { | |
min: Some(-123), | ||
max: Some(234), | ||
null_count: Some(0), | ||
repetition_level_histogram: Some(vec![1, 2]), | ||
definition_level_histogram: Some(vec![1, 2, 3]), | ||
repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])), | ||
definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])), | ||
}; | ||
|
||
assert_eq!(page_index.min().unwrap(), &-123); | ||
assert_eq!(page_index.max().unwrap(), &234); | ||
assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes()); | ||
assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes()); | ||
assert_eq!(page_index.null_count().unwrap(), 0); | ||
assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2])); | ||
assert_eq!( | ||
page_index.definition_level_histogram(), | ||
Some(&vec![1, 2, 3]) | ||
page_index.repetition_level_histogram().unwrap().values(), | ||
&vec![1, 2] | ||
); | ||
assert_eq!( | ||
page_index.definition_level_histogram().unwrap().values(), | ||
&vec![1, 2, 3] | ||
); | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍