From f621d28db590ff6ad3907450f7ff434c7deb9766 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Fri, 1 Dec 2023 04:46:36 +1100 Subject: [PATCH] Parquet: omit min/max for interval columns when writing stats (#5147) * Parquet: omit min/max for interval columns when writing stats * Trigger --- parquet/src/column/writer/encoder.rs | 7 +++- parquet/src/column/writer/mod.rs | 59 +++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index d0720dd24306..0d5144f61c26 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -18,7 +18,7 @@ use bytes::Bytes; use half::f16; -use crate::basic::{Encoding, LogicalType, Type}; +use crate::basic::{ConvertedType, Encoding, LogicalType, Type}; use crate::bloom_filter::Sbbf; use crate::column::writer::{ compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min, @@ -137,7 +137,10 @@ pub struct ColumnValueEncoderImpl { impl ColumnValueEncoderImpl { fn write_slice(&mut self, slice: &[T::T]) -> Result<()> { - if self.statistics_enabled == EnabledStatistics::Page { + if self.statistics_enabled == EnabledStatistics::Page + // INTERVAL has undefined sort order, so don't write min/max stats for it + && self.descr.converted_type() != ConvertedType::INTERVAL + { if let Some((min, max)) = self.min_max(slice, None) { update_min(&self.descr, &min, &mut self.min_value); update_max(&self.descr, &max, &mut self.max_value); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 14b8655091e4..e92a502689a3 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -332,7 +332,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // If only computing chunk-level statistics compute them here, page-level statistics // are computed in [`Self::write_mini_batch`] and used to update chunk statistics in // [`Self::add_data_page`] - if self.statistics_enabled == EnabledStatistics::Chunk { + if self.statistics_enabled == EnabledStatistics::Chunk + // INTERVAL has undefined sort order, so don't write min/max stats for it + && self.descr.converted_type() != ConvertedType::INTERVAL + { match (min, max) { (Some(min), Some(max)) => { update_min(&self.descr, min, &mut self.column_metrics.min_column_value); @@ -1093,7 +1096,6 @@ fn is_nan(descr: &ColumnDescriptor, val: &T) -> bool { /// /// If `cur` is `None`, sets `cur` to `Some(val)`, otherwise calls `should_update` with /// the value of `cur`, and updates `cur` to `Some(val)` if it returns `true` - fn update_stat( descr: &ColumnDescriptor, val: &T, @@ -3066,6 +3068,30 @@ mod tests { Ok(()) } + #[test] + fn test_interval_stats_should_not_have_min_max() { + let input = [ + vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], + vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], + ] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let page_writer = get_test_page_writer(); + let mut writer = get_test_interval_column_writer(page_writer); + writer.write_batch(&input, None, None).unwrap(); + + let metadata = writer.close().unwrap().metadata; + let stats = if let Some(Statistics::FixedLenByteArray(stats)) = metadata.statistics() { + stats.clone() + } else { + panic!("metadata missing statistics"); + }; + assert!(!stats.has_min_max_set()); + } + fn write_multiple_pages( column_descr: &Arc, pages: &[&[Option]], @@ -3395,8 +3421,7 @@ mod tests { values: &[FixedLenByteArray], ) -> ValueStatistics { let page_writer = get_test_page_writer(); - let props = Default::default(); - let mut writer = get_test_float16_column_writer(page_writer, 0, 0, props); + let mut writer = get_test_float16_column_writer(page_writer); writer.write_batch(values, None, None).unwrap(); let metadata = writer.close().unwrap().metadata; @@ -3409,12 +3434,9 @@ mod tests { fn get_test_float16_column_writer( page_writer: Box, - max_def_level: i16, - max_rep_level: i16, - props: WriterPropertiesPtr, ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> { - let descr = Arc::new(get_test_float16_column_descr(max_def_level, max_rep_level)); - let column_writer = get_column_writer(descr, props, page_writer); + let descr = Arc::new(get_test_float16_column_descr(0, 0)); + let column_writer = get_column_writer(descr, Default::default(), page_writer); get_typed_column_writer::(column_writer) } @@ -3429,6 +3451,25 @@ mod tests { ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, path) } + fn get_test_interval_column_writer( + page_writer: Box, + ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> { + let descr = Arc::new(get_test_interval_column_descr()); + let column_writer = get_column_writer(descr, Default::default(), page_writer); + get_typed_column_writer::(column_writer) + } + + fn get_test_interval_column_descr() -> ColumnDescriptor { + let path = ColumnPath::from("col"); + let tpe = + SchemaType::primitive_type_builder("col", FixedLenByteArrayType::get_physical_type()) + .with_length(12) + .with_converted_type(ConvertedType::INTERVAL) + .build() + .unwrap(); + ColumnDescriptor::new(Arc::new(tpe), 0, 0, path) + } + /// Returns column writer for UINT32 Column provided as ConvertedType only fn get_test_unsigned_int_given_as_converted_column_writer<'a, T: DataType>( page_writer: Box,