From a7e41c332931e47025bfd0193882c98dfd6dd385 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 7 Feb 2024 14:48:20 -0800 Subject: [PATCH 01/44] regen thrift with size statistics added --- parquet/src/file/metadata.rs | 5 +- parquet/src/format.rs | 332 ++++++++++++++++++++++++++++++----- 2 files changed, 290 insertions(+), 47 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index acd3a9f938c5..cb1b7f54541e 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -734,6 +734,7 @@ impl ColumnChunkMetaData { .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, bloom_filter_length: self.bloom_filter_length, + size_statistics: None, } } @@ -948,6 +949,8 @@ impl ColumnIndexBuilder { self.max_values, self.boundary_order, self.null_counts, + None, + None, ) } } @@ -996,7 +999,7 @@ impl OffsetIndexBuilder { .zip(self.first_row_index_array.iter()) .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index)) .collect::>(); - OffsetIndex::new(locations) + OffsetIndex::new(locations, None) } } diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 4e1aa0b65b7c..e1d2df5f42e3 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -632,6 +632,143 @@ impl From<&BoundaryOrder> for i32 { } } +// +// SizeStatistics +// + +/// A structure for capturing metadata for estimating the unencoded, +/// uncompressed size of data written. This is useful for readers to estimate +/// how much memory is needed to reconstruct data in their memory model and for +/// fine grained filter pushdown on nested structures (the histograms contained +/// in this structure can help determine the number of nulls at a particular +/// nesting level and maximum length of lists). +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct SizeStatistics { + /// The number of physical bytes stored for BYTE_ARRAY data values assuming + /// no encoding. This is exclusive of the bytes needed to store the length of + /// each byte array. In other words, this field is equivalent to the `(size + /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + /// written)`. To determine unencoded sizes of other types readers can use + /// schema information multiplied by the number of non-null and null values. + /// The number of null/non-null values can be inferred from the histograms + /// below. + /// + /// For example, if a column chunk is dictionary-encoded with dictionary + /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + /// then this value for that data page should be 7 (1 + 1 + 2 + 3). + /// + /// This field should only be set for types that use BYTE_ARRAY as their + /// physical type. + pub unencoded_byte_array_data_bytes: Option, + /// When present, there is expected to be one element corresponding to each + /// repetition (i.e. size=max repetition_level+1) where each element + /// represents the number of times the repetition level was observed in the + /// data. + /// + /// This field may be omitted if max_repetition_level is 0 without loss + /// of information. + /// + pub repetition_level_histogram: Option>, + /// Same as repetition_level_histogram except for definition levels. + /// + /// This field may be omitted if max_definition_level is 0 or 1 without + /// loss of information. 
+ /// + pub definition_level_histogram: Option>, +} + +impl SizeStatistics { + pub fn new(unencoded_byte_array_data_bytes: F1, repetition_level_histogram: F2, definition_level_histogram: F3) -> SizeStatistics where F1: Into>, F2: Into>>, F3: Into>> { + SizeStatistics { + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), + repetition_level_histogram: repetition_level_histogram.into(), + definition_level_histogram: definition_level_histogram.into(), + } + } +} + +impl crate::thrift::TSerializable for SizeStatistics { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option> = None; + let mut f_3: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i64()?; + f_1 = Some(val); + }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_0 = i_prot.read_i64()?; + val.push(list_elem_0); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, + 3 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_1 = i_prot.read_i64()?; + val.push(list_elem_1); + } + i_prot.read_list_end()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = SizeStatistics { + unencoded_byte_array_data_bytes: f_1, + repetition_level_histogram: f_2, + definition_level_histogram: f_3, + }; + Ok(ret) + } + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("SizeStatistics"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::I64, 1))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.repetition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histogram", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histogram", TType::List, 3))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? 
+ } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + // // Statistics // @@ -2152,7 +2289,7 @@ pub struct DataPageHeader { pub definition_level_encoding: Encoding, /// Encoding used for repetition levels * pub repetition_level_encoding: Encoding, - /// Optional statistics for the data in this page* + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -2392,17 +2529,17 @@ pub struct DataPageHeaderV2 { pub num_rows: i32, /// Encoding used for data in this page * pub encoding: Encoding, - /// length of the definition levels + /// Length of the definition levels pub definition_levels_byte_length: i32, - /// length of the repetition levels + /// Length of the repetition levels pub repetition_levels_byte_length: i32, - /// whether the values are compressed. + /// Whether the values are compressed. /// Which means the section of the page between /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) /// is compressed with the compression_codec. /// If missing it is considered compressed pub is_compressed: Option, - /// optional statistics for the data in this page * + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -3415,10 +3552,15 @@ pub struct ColumnMetaData { /// Writers should write this field so readers can read the bloom filter /// in a single I/O. pub bloom_filter_length: Option, + /// Optional statistics to help estimate total memory when converted to in-memory + /// representations. The histograms contained in these statistics can + /// also be useful in some cases for more fine-grained nullability/list length + /// filter pushdown. + pub size_statistics: Option, } impl ColumnMetaData { - pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into> { + pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15, size_statistics: F16) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into>, F16: Into> { ColumnMetaData { type_, encodings, @@ -3435,6 +3577,7 @@ impl ColumnMetaData { encoding_stats: encoding_stats.into(), bloom_filter_offset: bloom_filter_offset.into(), bloom_filter_length: bloom_filter_length.into(), + size_statistics: size_statistics.into(), } } } @@ -3457,6 +3600,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { let mut f_13: Option> = None; let mut f_14: Option = None; let mut f_15: Option = None; + let mut f_16: Option = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -3472,8 +3616,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_0 = Encoding::read_from_in_protocol(i_prot)?; - 
val.push(list_elem_0); + let list_elem_2 = Encoding::read_from_in_protocol(i_prot)?; + val.push(list_elem_2); } i_prot.read_list_end()?; f_2 = Some(val); @@ -3482,8 +3626,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_1 = i_prot.read_string()?; - val.push(list_elem_1); + let list_elem_3 = i_prot.read_string()?; + val.push(list_elem_3); } i_prot.read_list_end()?; f_3 = Some(val); @@ -3508,8 +3652,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_2 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_2); + let list_elem_4 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_4); } i_prot.read_list_end()?; f_8 = Some(val); @@ -3534,8 +3678,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_3 = PageEncodingStats::read_from_in_protocol(i_prot)?; - val.push(list_elem_3); + let list_elem_5 = PageEncodingStats::read_from_in_protocol(i_prot)?; + val.push(list_elem_5); } i_prot.read_list_end()?; f_13 = Some(val); @@ -3548,6 +3692,10 @@ impl crate::thrift::TSerializable for ColumnMetaData { let val = i_prot.read_i32()?; f_15 = Some(val); }, + 16 => { + let val = SizeStatistics::read_from_in_protocol(i_prot)?; + f_16 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -3579,6 +3727,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { encoding_stats: f_13, bloom_filter_offset: f_14, bloom_filter_length: f_15, + size_statistics: f_16, }; Ok(ret) } @@ -3660,6 +3809,11 @@ impl crate::thrift::TSerializable for ColumnMetaData { o_prot.write_i32(fld_var)?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.size_statistics { + o_prot.write_field_begin(&TFieldIdentifier::new("size_statistics", TType::Struct, 16))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? 
+ } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -3739,8 +3893,8 @@ impl crate::thrift::TSerializable for EncryptionWithColumnKey { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_4 = i_prot.read_string()?; - val.push(list_elem_4); + let list_elem_6 = i_prot.read_string()?; + val.push(list_elem_6); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4105,8 +4259,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_5 = ColumnChunk::read_from_in_protocol(i_prot)?; - val.push(list_elem_5); + let list_elem_7 = ColumnChunk::read_from_in_protocol(i_prot)?; + val.push(list_elem_7); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4123,8 +4277,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_6 = SortingColumn::read_from_in_protocol(i_prot)?; - val.push(list_elem_6); + let list_elem_8 = SortingColumn::read_from_in_protocol(i_prot)?; + val.push(list_elem_8); } i_prot.read_list_end()?; f_4 = Some(val); @@ -4410,14 +4564,20 @@ impl crate::thrift::TSerializable for PageLocation { #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct OffsetIndex { /// PageLocations, ordered by increasing PageLocation.offset. It is required - /// that page_locations\[i\].first_row_index < page_locations\[i+1\].first_row_index. + /// that page_locations[i].first_row_index < page_locations[i+1].first_row_index. pub page_locations: Vec, + /// Unencoded/uncompressed size for BYTE_ARRAY types. + /// + /// See documention for unencoded_byte_array_data_bytes in SizeStatistics for + /// more details on this field. 
+ pub unencoded_byte_array_data_bytes: Option>, } impl OffsetIndex { - pub fn new(page_locations: Vec) -> OffsetIndex { + pub fn new(page_locations: Vec, unencoded_byte_array_data_bytes: F2) -> OffsetIndex where F2: Into>> { OffsetIndex { page_locations, + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), } } } @@ -4426,6 +4586,7 @@ impl crate::thrift::TSerializable for OffsetIndex { fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; + let mut f_2: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -4437,12 +4598,22 @@ impl crate::thrift::TSerializable for OffsetIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_7 = PageLocation::read_from_in_protocol(i_prot)?; - val.push(list_elem_7); + let list_elem_9 = PageLocation::read_from_in_protocol(i_prot)?; + val.push(list_elem_9); } i_prot.read_list_end()?; f_1 = Some(val); }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_10 = i_prot.read_i64()?; + val.push(list_elem_10); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4453,6 +4624,7 @@ impl crate::thrift::TSerializable for OffsetIndex { verify_required_field_exists("OffsetIndex.page_locations", &f_1)?; let ret = OffsetIndex { page_locations: f_1.expect("auto-generated code should have checked for presence of required fields"), + unencoded_byte_array_data_bytes: f_2, }; Ok(ret) } @@ -4466,6 +4638,15 @@ impl crate::thrift::TSerializable for OffsetIndex { } o_prot.write_list_end()?; o_prot.write_field_end()?; + if let Some(ref fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4476,20 +4657,20 @@ impl crate::thrift::TSerializable for OffsetIndex { // /// Description for ColumnIndex. -/// Each ``\[i\] refers to the page at OffsetIndex.page_locations\[i\] +/// Each [i] refers to the page at OffsetIndex.page_locations[i] #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ColumnIndex { /// A list of Boolean values to determine the validity of the corresponding /// min and max values. If true, a page contains only null values, and writers /// have to set the corresponding entries in min_values and max_values to - /// byte\[0\], so that all lists have the same length. If false, the + /// byte[0], so that all lists have the same length. If false, the /// corresponding entries in min_values and max_values must be valid. pub null_pages: Vec, /// Two lists containing lower and upper bounds for the values of each page /// determined by the ColumnOrder of the column. These may be the actual /// minimum and maximum values found on a page, but can also be (more compact) /// values that do not exist on a page. For example, instead of storing ""Blart - /// Versenwald III", a writer may set min_values\[i\]="B", max_values\[i\]="C". 
+ /// Versenwald III", a writer may set min_values[i]="B", max_values[i]="C". /// Such more compact values must still be valid values within the column's /// logical type. Readers must make sure that list entries are populated before /// using them by inspecting null_pages. @@ -4497,21 +4678,38 @@ pub struct ColumnIndex { pub max_values: Vec>, /// Stores whether both min_values and max_values are ordered and if so, in /// which direction. This allows readers to perform binary searches in both - /// lists. Readers cannot assume that max_values\[i\] <= min_values\[i+1\], even + /// lists. Readers cannot assume that max_values[i] <= min_values[i+1], even /// if the lists are ordered. pub boundary_order: BoundaryOrder, /// A list containing the number of null values for each page * pub null_counts: Option>, + /// Contains repetition level histograms for each page + /// concatenated together. The repetition_level_histogram field on + /// SizeStatistics contains more details. + /// + /// When present the length should always be (number of pages * + /// (max_repetition_level + 1)) elements. + /// + /// Element 0 is the first element of the histogram for the first page. + /// Element (max_repetition_level + 1) is the first element of the histogram + /// for the second page. + /// + pub repetition_level_histograms: Option>, + /// Same as repetition_level_histograms except for definitions levels. + /// + pub definition_level_histograms: Option>, } impl ColumnIndex { - pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5) -> ColumnIndex where F5: Into>> { + pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5, repetition_level_histograms: F6, definition_level_histograms: F7) -> ColumnIndex where F5: Into>>, F6: Into>>, F7: Into>> { ColumnIndex { null_pages, min_values, max_values, boundary_order, null_counts: null_counts.into(), + repetition_level_histograms: repetition_level_histograms.into(), + definition_level_histograms: definition_level_histograms.into(), } } } @@ -4524,6 +4722,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let mut f_3: Option>> = None; let mut f_4: Option = None; let mut f_5: Option> = None; + let mut f_6: Option> = None; + let mut f_7: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -4535,8 +4735,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_8 = i_prot.read_bool()?; - val.push(list_elem_8); + let list_elem_11 = i_prot.read_bool()?; + val.push(list_elem_11); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4545,8 +4745,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_9 = i_prot.read_bytes()?; - val.push(list_elem_9); + let list_elem_12 = i_prot.read_bytes()?; + val.push(list_elem_12); } i_prot.read_list_end()?; f_2 = Some(val); @@ -4555,8 +4755,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_10 = i_prot.read_bytes()?; - val.push(list_elem_10); + let list_elem_13 = i_prot.read_bytes()?; + 
val.push(list_elem_13); } i_prot.read_list_end()?; f_3 = Some(val); @@ -4569,12 +4769,32 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_11 = i_prot.read_i64()?; - val.push(list_elem_11); + let list_elem_14 = i_prot.read_i64()?; + val.push(list_elem_14); } i_prot.read_list_end()?; f_5 = Some(val); }, + 6 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_15 = i_prot.read_i64()?; + val.push(list_elem_15); + } + i_prot.read_list_end()?; + f_6 = Some(val); + }, + 7 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_16 = i_prot.read_i64()?; + val.push(list_elem_16); + } + i_prot.read_list_end()?; + f_7 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4592,6 +4812,8 @@ impl crate::thrift::TSerializable for ColumnIndex { max_values: f_3.expect("auto-generated code should have checked for presence of required fields"), boundary_order: f_4.expect("auto-generated code should have checked for presence of required fields"), null_counts: f_5, + repetition_level_histograms: f_6, + definition_level_histograms: f_7, }; Ok(ret) } @@ -4631,6 +4853,24 @@ impl crate::thrift::TSerializable for ColumnIndex { o_prot.write_list_end()?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.repetition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histograms", TType::List, 6))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histograms", TType::List, 7))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4919,7 +5159,7 @@ pub struct FileMetaData { /// Optional key/value metadata * pub key_value_metadata: Option>, /// String for application that wrote this file. This should be in the format - /// `` version `` (build ``). + /// version (build ). /// e.g. 
impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) /// pub created_by: Option, @@ -4990,8 +5230,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_12 = SchemaElement::read_from_in_protocol(i_prot)?; - val.push(list_elem_12); + let list_elem_17 = SchemaElement::read_from_in_protocol(i_prot)?; + val.push(list_elem_17); } i_prot.read_list_end()?; f_2 = Some(val); @@ -5004,8 +5244,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_13 = RowGroup::read_from_in_protocol(i_prot)?; - val.push(list_elem_13); + let list_elem_18 = RowGroup::read_from_in_protocol(i_prot)?; + val.push(list_elem_18); } i_prot.read_list_end()?; f_4 = Some(val); @@ -5014,8 +5254,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_14 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_14); + let list_elem_19 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_19); } i_prot.read_list_end()?; f_5 = Some(val); @@ -5028,8 +5268,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_15 = ColumnOrder::read_from_in_protocol(i_prot)?; - val.push(list_elem_15); + let list_elem_20 = ColumnOrder::read_from_in_protocol(i_prot)?; + val.push(list_elem_20); } i_prot.read_list_end()?; f_7 = Some(val); From 788eef3a8f93b954b4ae133079a403e5f6657b17 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 9 Feb 2024 10:13:51 -0800 Subject: [PATCH 02/44] first cut at adding page size statistics --- parquet/src/arrow/arrow_writer/byte_array.rs | 17 ++- parquet/src/column/writer/encoder.rs | 9 ++ parquet/src/column/writer/mod.rs | 128 +++++++++++++++++-- parquet/src/data_type.rs | 11 ++ parquet/src/file/metadata.rs | 101 ++++++++++++++- parquet/src/file/writer.rs | 9 +- 6 files changed, 259 insertions(+), 16 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 61933b24178e..b4d6348c9b9e 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -92,6 +92,7 @@ macro_rules! 
downcast_op { struct FallbackEncoder { encoder: FallbackEncoderImpl, num_values: usize, + variable_length_bytes: i64, } /// The fallback encoder in use @@ -148,6 +149,7 @@ impl FallbackEncoder { Ok(Self { encoder, num_values: 0, + variable_length_bytes: 0, }) } @@ -164,7 +166,8 @@ impl FallbackEncoder { let value = values.value(*idx); let value = value.as_ref(); buffer.extend_from_slice((value.len() as u32).as_bytes()); - buffer.extend_from_slice(value) + buffer.extend_from_slice(value); + self.variable_length_bytes += value.len() as i64; } } FallbackEncoderImpl::DeltaLength { buffer, lengths } => { @@ -173,6 +176,7 @@ impl FallbackEncoder { let value = value.as_ref(); lengths.put(&[value.len() as i32]).unwrap(); buffer.extend_from_slice(value); + self.variable_length_bytes += value.len() as i64; } } FallbackEncoderImpl::Delta { @@ -201,6 +205,7 @@ impl FallbackEncoder { buffer.extend_from_slice(&value[prefix_length..]); prefix_lengths.put(&[prefix_length as i32]).unwrap(); suffix_lengths.put(&[suffix_length as i32]).unwrap(); + self.variable_length_bytes += value.len() as i64; } } } @@ -261,12 +266,16 @@ impl FallbackEncoder { } }; + let var_bytes = Some(self.variable_length_bytes); + self.variable_length_bytes = 0; + Ok(DataPageValues { buf: buf.into(), num_values: std::mem::take(&mut self.num_values), encoding, min_value, max_value, + variable_length_bytes: var_bytes, }) } } @@ -307,6 +316,7 @@ impl Storage for ByteArrayStorage { struct DictEncoder { interner: Interner, indices: Vec, + variable_length_bytes: i64, } impl DictEncoder { @@ -322,6 +332,7 @@ impl DictEncoder { let value = values.value(*idx); let interned = self.interner.intern(value.as_ref()); self.indices.push(interned); + self.variable_length_bytes += value.as_ref().len() as i64; } } @@ -366,12 +377,16 @@ impl DictEncoder { self.indices.clear(); + let var_bytes = Some(self.variable_length_bytes); + self.variable_length_bytes = 0; + DataPageValues { buf: encoder.consume().into(), num_values, encoding: Encoding::RLE_DICTIONARY, min_value, max_value, + variable_length_bytes: var_bytes, } } } diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 8624f859f4b0..b9a8f903cc97 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -63,6 +63,7 @@ pub struct DataPageValues { pub encoding: Encoding, pub min_value: Option, pub max_value: Option, + pub variable_length_bytes: Option, } /// A generic encoder of [`ColumnValues`] to data and dictionary pages used by @@ -124,6 +125,7 @@ pub struct ColumnValueEncoderImpl { min_value: Option, max_value: Option, bloom_filter: Option, + variable_length_bytes: Option, } impl ColumnValueEncoderImpl { @@ -143,6 +145,11 @@ impl ColumnValueEncoderImpl { update_min(&self.descr, &min, &mut self.min_value); update_max(&self.descr, &max, &mut self.max_value); } + + if let Some(var_bytes) = T::T::variable_length_bytes(slice) { + self.variable_length_bytes = + Some(var_bytes + self.variable_length_bytes.unwrap_or(0)); + } } // encode the values into bloom filter if enabled @@ -196,6 +203,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { bloom_filter, min_value: None, max_value: None, + variable_length_bytes: None, }) } @@ -271,6 +279,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { num_values: std::mem::take(&mut self.num_values), min_value: self.min_value.take(), max_value: self.max_value.take(), + variable_length_bytes: self.variable_length_bytes.take(), }) } } diff --git a/parquet/src/column/writer/mod.rs 
b/parquet/src/column/writer/mod.rs index e993cb4c11a8..c099dffdc854 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -183,6 +183,8 @@ struct PageMetrics { num_buffered_values: u32, num_buffered_rows: u32, num_page_nulls: u64, + repetition_level_histogram: Option>, + definition_level_histogram: Option>, } // Metrics per column writer @@ -198,6 +200,9 @@ struct ColumnMetrics { max_column_value: Option, num_column_nulls: u64, column_distinct_count: Option, + variable_length_bytes: Option, + repetition_level_histogram: Option>, + definition_level_histogram: Option>, } /// Typed column writer for a primitive column. @@ -254,6 +259,19 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Used for level information encodings.insert(Encoding::RLE); + // histogram data is only collected if there is more than a single level and if + // page or chunk statistics are being collected + let new_histogram_vec = |max_level| { + if statistics_enabled == EnabledStatistics::None || max_level == 0 { + None + } else { + Some(vec![0; max_level as usize + 1]) + } + }; + + let max_rep_level = descr.max_rep_level(); + let max_def_level = descr.max_def_level(); + Self { descr, props, @@ -269,6 +287,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { num_buffered_values: 0, num_buffered_rows: 0, num_page_nulls: 0, + repetition_level_histogram: new_histogram_vec(max_rep_level), + definition_level_histogram: new_histogram_vec(max_def_level), }, column_metrics: ColumnMetrics { total_bytes_written: 0, @@ -282,6 +302,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { max_column_value: None, num_column_nulls: 0, column_distinct_count: None, + variable_length_bytes: None, + repetition_level_histogram: new_histogram_vec(max_rep_level), + definition_level_histogram: new_histogram_vec(max_def_level), }, column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), @@ -513,12 +536,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { })?; let mut values_to_write = 0; - for &level in levels { + + let mut process_def_level = |level| { if level == self.descr.max_def_level() { values_to_write += 1; } else { // We must always compute this as it is used to populate v2 pages - self.page_metrics.num_page_nulls += 1 + self.page_metrics.num_page_nulls += 1; + } + }; + + if let Some(ref mut def_hist) = self.page_metrics.definition_level_histogram { + // Count values and update histogram + for &level in levels { + process_def_level(level); + def_hist[level as usize] += 1; + } + } else { + // Count values + for &level in levels { + process_def_level(level); } } @@ -545,9 +582,17 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { )); } - // Count the occasions where we start a new row - for &level in levels { - self.page_metrics.num_buffered_rows += (level == 0) as u32 + if let Some(ref mut rep_hist) = self.page_metrics.repetition_level_histogram { + // Count the occasions where we start a new row and update histogram + for &level in levels { + self.page_metrics.num_buffered_rows += (level == 0) as u32; + rep_hist[level as usize] += 1; + } + } else { + // Count the occasions where we start a new row + for &level in levels { + self.page_metrics.num_buffered_rows += (level == 0) as u32 + } } self.rep_levels_sink.extend_from_slice(levels); @@ -618,7 +663,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } /// Update the column index and offset index when adding the data page - fn 
update_column_offset_index(&mut self, page_statistics: Option<&ValueStatistics>) { + fn update_column_offset_index( + &mut self, + page_statistics: Option<&ValueStatistics>, + page_variable_length_bytes: Option, + ) { // update the column index let null_page = (self.page_metrics.num_buffered_rows as u64) == self.page_metrics.num_page_nulls; @@ -689,9 +738,21 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } } + + // update histograms + if self.column_index_builder.valid() { + self.column_index_builder.append_histograms( + &self.page_metrics.repetition_level_histogram, + &self.page_metrics.definition_level_histogram, + ); + } + // update the offset index self.offset_index_builder .append_row_count(self.page_metrics.num_buffered_rows as i64); + + self.offset_index_builder + .append_unencoded_byte_array_data_bytes(page_variable_length_bytes); } /// Determine if we should allow truncating min/max values for this column's statistics @@ -766,8 +827,50 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { _ => None, }; + if let Some(var_bytes) = values_data.variable_length_bytes { + self.column_metrics.variable_length_bytes = + Some(self.column_metrics.variable_length_bytes.unwrap_or(0) + var_bytes); + } + // update column and offset index - self.update_column_offset_index(page_statistics.as_ref()); + self.update_column_offset_index( + page_statistics.as_ref(), + values_data.variable_length_bytes, + ); + + // collect page histograms into chunk histograms and zero out page histograms + // TODO(ets): This could instead just add the vectors, and then allow page_metrics to be reset + // below. Would then need to recreate the histogram vectors, so `new_histogram_vec` above + // would need to become a function. + if let Some(ref mut page_hist) = self.page_metrics.repetition_level_histogram { + if let Some(ref mut chunk_hist) = self.column_metrics.repetition_level_histogram { + assert_eq!(chunk_hist.len(), page_hist.len()); + for i in 0..page_hist.len() { + chunk_hist[i] += page_hist[i]; + page_hist[i] = 0; + } + } else { + // this should never be reached, but zero out histogram just in case + for v in page_hist { + *v = 0; + } + } + } + if let Some(ref mut page_hist) = self.page_metrics.definition_level_histogram { + if let Some(ref mut chunk_hist) = self.column_metrics.definition_level_histogram { + assert_eq!(chunk_hist.len(), page_hist.len()); + for i in 0..page_hist.len() { + chunk_hist[i] += page_hist[i]; + page_hist[i] = 0; + } + } else { + // this should never be reached, but zero out histogram just in case + for v in page_hist { + *v = 0; + } + } + } + let page_statistics = page_statistics.map(Statistics::from); let compressed_page = match self.props.writer_version() { @@ -871,7 +974,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Reset state. 
self.rep_levels_sink.clear(); self.def_levels_sink.clear(); - self.page_metrics = PageMetrics::default(); + + // don't clobber histogram vectors + self.page_metrics.num_buffered_values = 0; + self.page_metrics.num_buffered_rows = 0; + self.page_metrics.num_page_nulls = 0; Ok(()) } @@ -914,7 +1021,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .set_total_uncompressed_size(total_uncompressed_size) .set_num_values(num_values) .set_data_page_offset(data_page_offset) - .set_dictionary_page_offset(dict_page_offset); + .set_dictionary_page_offset(dict_page_offset) + .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes) + .set_repetition_level_histogram(self.column_metrics.repetition_level_histogram.take()) + .set_definition_level_histogram(self.column_metrics.definition_level_histogram.take()); if self.statistics_enabled != EnabledStatistics::None { let backwards_compatible_min_max = self.descr.sort_order().is_signed(); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 86da7a3acee4..43aee14b092d 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -632,6 +632,13 @@ pub(crate) mod private { (std::mem::size_of::(), 1) } + /// Return the number of variable length bytes in a given slice of data + /// + /// Returns the sum of lengths for BYTE_ARRAY data, and None for all other data types + fn variable_length_bytes(_: &[Self]) -> Option { + None + } + /// Return the value as i64 if possible /// /// This is essentially the same as `std::convert::TryInto` but can't be @@ -938,6 +945,10 @@ pub(crate) mod private { Ok(num_values) } + fn variable_length_bytes(values: &[Self]) -> Option { + Some(values.iter().map(|x| x.len() as i64).sum()) + } + fn skip(decoder: &mut PlainDecoderDetails, num_values: usize) -> Result { let data = decoder .data diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index cb1b7f54541e..b1804ef6f8cd 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -38,7 +38,7 @@ use std::sync::Arc; use crate::format::{ BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, - SortingColumn, + SizeStatistics, SortingColumn, }; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; @@ -482,6 +482,9 @@ pub struct ColumnChunkMetaData { offset_index_length: Option, column_index_offset: Option, column_index_length: Option, + unencoded_byte_array_data_bytes: Option, + repetition_level_histogram: Option>, + definition_level_histogram: Option>, } /// Represents common operations for a column chunk. @@ -634,6 +637,21 @@ impl ColumnChunkMetaData { Some(offset..(offset + length)) } + /// Returns the number of bytes of variable length data. + pub fn unencoded_byte_array_data_bytes(&self) -> Option { + self.unencoded_byte_array_data_bytes + } + + /// Returns the repetition level histogram. + pub fn repetition_level_histogram(&self) -> Option<&Vec> { + self.repetition_level_histogram.as_ref() + } + + /// Returns the repetition level histogram. + pub fn definition_level_histogram(&self) -> Option<&Vec> { + self.definition_level_histogram.as_ref() + } + /// Method to convert from Thrift. 
pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { @@ -671,6 +689,19 @@ impl ColumnChunkMetaData { let offset_index_length = cc.offset_index_length; let column_index_offset = cc.column_index_offset; let column_index_length = cc.column_index_length; + let ( + unencoded_byte_array_data_bytes, + repetition_level_histogram, + definition_level_histogram, + ) = if let Some(size_stats) = col_metadata.size_statistics { + ( + size_stats.unencoded_byte_array_data_bytes, + size_stats.repetition_level_histogram, + size_stats.definition_level_histogram, + ) + } else { + (None, None, None) + }; let result = ColumnChunkMetaData { column_descr, @@ -692,6 +723,9 @@ impl ColumnChunkMetaData { offset_index_length, column_index_offset, column_index_length, + unencoded_byte_array_data_bytes, + repetition_level_histogram, + definition_level_histogram, }; Ok(result) } @@ -734,7 +768,11 @@ impl ColumnChunkMetaData { .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, bloom_filter_length: self.bloom_filter_length, - size_statistics: None, + size_statistics: Some(SizeStatistics { + unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, + repetition_level_histogram: self.repetition_level_histogram.clone(), + definition_level_histogram: self.definition_level_histogram.clone(), + }), } } @@ -770,6 +808,9 @@ impl ColumnChunkMetaDataBuilder { offset_index_length: None, column_index_offset: None, column_index_length: None, + unencoded_byte_array_data_bytes: None, + repetition_level_histogram: None, + definition_level_histogram: None, }) } @@ -881,6 +922,24 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets optional length of variable length data in bytes. + pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option) -> Self { + self.0.unencoded_byte_array_data_bytes = value; + self + } + + /// Sets optional repetition level histogram + pub fn set_repetition_level_histogram(mut self, value: Option>) -> Self { + self.0.repetition_level_histogram = value; + self + } + + /// Sets optional repetition level histogram + pub fn set_definition_level_histogram(mut self, value: Option>) -> Self { + self.0.definition_level_histogram = value; + self + } + /// Builds column chunk metadata. 
pub fn build(self) -> Result { Ok(self.0) @@ -894,6 +953,8 @@ pub struct ColumnIndexBuilder { max_values: Vec>, null_counts: Vec, boundary_order: BoundaryOrder, + repetition_level_histograms: Vec, + definition_level_histograms: Vec, // If one page can't get build index, need to ignore all index in this column valid: bool, } @@ -912,6 +973,8 @@ impl ColumnIndexBuilder { max_values: Vec::new(), null_counts: Vec::new(), boundary_order: BoundaryOrder::UNORDERED, + repetition_level_histograms: Vec::new(), + definition_level_histograms: Vec::new(), valid: true, } } @@ -929,6 +992,21 @@ impl ColumnIndexBuilder { self.null_counts.push(null_count); } + pub fn append_histograms( + &mut self, + repetition_level_histogram: &Option>, + definition_level_histogram: &Option>, + ) { + if let Some(ref rep_lvl_hist) = repetition_level_histogram { + self.repetition_level_histograms.reserve(rep_lvl_hist.len()); + self.repetition_level_histograms.extend(rep_lvl_hist); + } + if let Some(ref def_lvl_hist) = definition_level_histogram { + self.definition_level_histograms.reserve(def_lvl_hist.len()); + self.definition_level_histograms.extend(def_lvl_hist); + } + } + pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) { self.boundary_order = boundary_order; } @@ -949,8 +1027,8 @@ impl ColumnIndexBuilder { self.max_values, self.boundary_order, self.null_counts, - None, - None, + self.repetition_level_histograms, + self.definition_level_histograms, ) } } @@ -960,6 +1038,7 @@ pub struct OffsetIndexBuilder { offset_array: Vec, compressed_page_size_array: Vec, first_row_index_array: Vec, + unencoded_byte_array_data_bytes_array: Option>, current_first_row_index: i64, } @@ -975,6 +1054,7 @@ impl OffsetIndexBuilder { offset_array: Vec::new(), compressed_page_size_array: Vec::new(), first_row_index_array: Vec::new(), + unencoded_byte_array_data_bytes_array: None, current_first_row_index: 0, } } @@ -990,6 +1070,17 @@ impl OffsetIndexBuilder { self.compressed_page_size_array.push(compressed_page_size); } + pub fn append_unencoded_byte_array_data_bytes( + &mut self, + unencoded_byte_array_data_bytes: Option, + ) { + if let Some(val) = unencoded_byte_array_data_bytes { + self.unencoded_byte_array_data_bytes_array + .get_or_insert(Vec::new()) + .push(val); + } + } + /// Build and get the thrift metadata of offset index pub fn build_to_thrift(self) -> OffsetIndex { let locations = self @@ -999,7 +1090,7 @@ impl OffsetIndexBuilder { .zip(self.first_row_index_array.iter()) .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index)) .collect::>(); - OffsetIndex::new(locations, None) + OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array) } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index e15a7195028f..f573dddcb508 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -594,8 +594,15 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { .set_total_uncompressed_size(metadata.uncompressed_size()) .set_num_values(metadata.num_values()) .set_data_page_offset(map_offset(src_data_offset)) - .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)); + .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)) + .set_unencoded_byte_array_data_bytes(metadata.unencoded_byte_array_data_bytes()); + if let Some(rep_hist) = metadata.repetition_level_histogram() { + builder = builder.set_repetition_level_histogram(Some(rep_hist.clone())) + } + if let Some(def_hist) = metadata.definition_level_histogram() { + builder = 
builder.set_definition_level_histogram(Some(def_hist.clone())) + } if let Some(statistics) = metadata.statistics() { builder = builder.set_statistics(statistics.clone()) } From 6296ada4cb23be0b2294482203e65d7e511ecfda Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 15 Feb 2024 21:18:41 -0800 Subject: [PATCH 03/44] add new stats to chunk metadata test --- parquet/src/file/metadata.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index b1804ef6f8cd..864baa822aa8 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -1240,6 +1240,9 @@ mod tests { .set_offset_index_length(Some(25)) .set_column_index_offset(Some(8000)) .set_column_index_length(Some(25)) + .set_unencoded_byte_array_data_bytes(Some(2000)) + .set_repetition_level_histogram(Some(vec![100,100])) + .set_definition_level_histogram(Some(vec![0,200])) .build() .unwrap(); From 0da05a8b70855b2454f12b3359295e252a690883 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 12 Mar 2024 08:17:44 -0700 Subject: [PATCH 04/44] fix escapes --- parquet/src/format.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parquet/src/format.rs b/parquet/src/format.rs index e1d2df5f42e3..393ac7ef2eea 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -4564,7 +4564,7 @@ impl crate::thrift::TSerializable for PageLocation { #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct OffsetIndex { /// PageLocations, ordered by increasing PageLocation.offset. It is required - /// that page_locations[i].first_row_index < page_locations[i+1].first_row_index. + /// that page_locations\[i\].first_row_index < page_locations\[i+1\].first_row_index. pub page_locations: Vec, /// Unencoded/uncompressed size for BYTE_ARRAY types. /// @@ -4657,20 +4657,20 @@ impl crate::thrift::TSerializable for OffsetIndex { // /// Description for ColumnIndex. -/// Each [i] refers to the page at OffsetIndex.page_locations[i] +/// Each ``\[i\] refers to the page at OffsetIndex.page_locations\[i\] #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ColumnIndex { /// A list of Boolean values to determine the validity of the corresponding /// min and max values. If true, a page contains only null values, and writers /// have to set the corresponding entries in min_values and max_values to - /// byte[0], so that all lists have the same length. If false, the + /// byte\[0\], so that all lists have the same length. If false, the /// corresponding entries in min_values and max_values must be valid. pub null_pages: Vec, /// Two lists containing lower and upper bounds for the values of each page /// determined by the ColumnOrder of the column. These may be the actual /// minimum and maximum values found on a page, but can also be (more compact) /// values that do not exist on a page. For example, instead of storing ""Blart - /// Versenwald III", a writer may set min_values[i]="B", max_values[i]="C". + /// Versenwald III", a writer may set min_values\[i\]="B", max_values\[i\]="C". /// Such more compact values must still be valid values within the column's /// logical type. Readers must make sure that list entries are populated before /// using them by inspecting null_pages. @@ -4678,7 +4678,7 @@ pub struct ColumnIndex { pub max_values: Vec>, /// Stores whether both min_values and max_values are ordered and if so, in /// which direction. This allows readers to perform binary searches in both - /// lists. 
Readers cannot assume that max_values[i] <= min_values[i+1], even + /// lists. Readers cannot assume that max_values\[i\] <= min_values\[i+1\], even /// if the lists are ordered. pub boundary_order: BoundaryOrder, /// A list containing the number of null values for each page * @@ -5159,7 +5159,7 @@ pub struct FileMetaData { /// Optional key/value metadata * pub key_value_metadata: Option>, /// String for application that wrote this file. This should be in the format - /// version (build ). + /// `` version `` (build ``). /// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) /// pub created_by: Option, From 6e5fecece93ca8070349cbcd0075d31eed2a9206 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 12 Mar 2024 08:31:59 -0700 Subject: [PATCH 05/44] format --- parquet/src/file/metadata.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 4d21e8aeabb2..be059f2dd5ce 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -1241,8 +1241,8 @@ mod tests { .set_column_index_offset(Some(8000)) .set_column_index_length(Some(25)) .set_unencoded_byte_array_data_bytes(Some(2000)) - .set_repetition_level_histogram(Some(vec![100,100])) - .set_definition_level_histogram(Some(vec![0,200])) + .set_repetition_level_histogram(Some(vec![100, 100])) + .set_definition_level_histogram(Some(vec![0, 200])) .build() .unwrap(); From 457eb4a77667b399a80038d23636018707ae1d20 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 12 Mar 2024 09:20:52 -0700 Subject: [PATCH 06/44] formatting --- parquet/src/file/metadata.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 864baa822aa8..6ae0d51bd163 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -1241,8 +1241,8 @@ mod tests { .set_column_index_offset(Some(8000)) .set_column_index_length(Some(25)) .set_unencoded_byte_array_data_bytes(Some(2000)) - .set_repetition_level_histogram(Some(vec![100,100])) - .set_definition_level_histogram(Some(vec![0,200])) + .set_repetition_level_histogram(Some(vec![100, 100])) + .set_definition_level_histogram(Some(vec![0, 200])) .build() .unwrap(); From 18a573204c3cbc6bd464c1de7651e3168841b08d Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 12 Mar 2024 09:25:00 -0700 Subject: [PATCH 07/44] add escapes --- parquet/src/format.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parquet/src/format.rs b/parquet/src/format.rs index e1d2df5f42e3..393ac7ef2eea 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -4564,7 +4564,7 @@ impl crate::thrift::TSerializable for PageLocation { #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct OffsetIndex { /// PageLocations, ordered by increasing PageLocation.offset. It is required - /// that page_locations[i].first_row_index < page_locations[i+1].first_row_index. + /// that page_locations\[i\].first_row_index < page_locations\[i+1\].first_row_index. pub page_locations: Vec, /// Unencoded/uncompressed size for BYTE_ARRAY types. /// @@ -4657,20 +4657,20 @@ impl crate::thrift::TSerializable for OffsetIndex { // /// Description for ColumnIndex. 
-/// Each [i] refers to the page at OffsetIndex.page_locations[i] +/// Each ``\[i\] refers to the page at OffsetIndex.page_locations\[i\] #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ColumnIndex { /// A list of Boolean values to determine the validity of the corresponding /// min and max values. If true, a page contains only null values, and writers /// have to set the corresponding entries in min_values and max_values to - /// byte[0], so that all lists have the same length. If false, the + /// byte\[0\], so that all lists have the same length. If false, the /// corresponding entries in min_values and max_values must be valid. pub null_pages: Vec, /// Two lists containing lower and upper bounds for the values of each page /// determined by the ColumnOrder of the column. These may be the actual /// minimum and maximum values found on a page, but can also be (more compact) /// values that do not exist on a page. For example, instead of storing ""Blart - /// Versenwald III", a writer may set min_values[i]="B", max_values[i]="C". + /// Versenwald III", a writer may set min_values\[i\]="B", max_values\[i\]="C". /// Such more compact values must still be valid values within the column's /// logical type. Readers must make sure that list entries are populated before /// using them by inspecting null_pages. @@ -4678,7 +4678,7 @@ pub struct ColumnIndex { pub max_values: Vec>, /// Stores whether both min_values and max_values are ordered and if so, in /// which direction. This allows readers to perform binary searches in both - /// lists. Readers cannot assume that max_values[i] <= min_values[i+1], even + /// lists. Readers cannot assume that max_values\[i\] <= min_values\[i+1\], even /// if the lists are ordered. pub boundary_order: BoundaryOrder, /// A list containing the number of null values for each page * @@ -5159,7 +5159,7 @@ pub struct FileMetaData { /// Optional key/value metadata * pub key_value_metadata: Option>, /// String for application that wrote this file. This should be in the format - /// version (build ). + /// `` version `` (build ``). /// e.g. 
impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) /// pub created_by: Option, From 9635e5ea7d968d91f01cb025ef67e7cb551d2227 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 27 Jun 2024 12:02:16 -0700 Subject: [PATCH 08/44] add test of SizeStatistics.unencoded_byte_array_data_bytes --- parquet/src/file/writer.rs | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 9b08f551d826..d8b3c1ba130d 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -811,7 +811,7 @@ mod tests { use crate::column::page::{Page, PageReader}; use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; - use crate::data_type::{BoolType, Int32Type}; + use crate::data_type::{BoolType, ByteArrayType, Int32Type}; use crate::file::page_index::index::Index; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; @@ -824,6 +824,7 @@ mod tests { use crate::record::{Row, RowAccessor}; use crate::schema::parser::parse_message_type; use crate::schema::types::{ColumnDescriptor, ColumnPath}; + use crate::util::test_common::rand_gen::RandGen; #[test] fn test_row_group_writer_error_not_all_columns_written() { @@ -1835,4 +1836,31 @@ mod tests { let b_idx = &column_index[0][1]; assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); } + + #[test] + fn test_unencoded_byte_array_size() { + let data = vec![ByteArrayType::gen_vec(32, 5)]; + let unenc_size: i64 = data[0].iter().map(|x| x.len() as i64).sum(); + let file: File = tempfile::tempfile().unwrap(); + let file_metadata = test_roundtrip::( + file, + data, + |r| r.get_bytes(0).unwrap().clone(), + Compression::UNCOMPRESSED, + ); + + assert_eq!(file_metadata.row_groups.len(), 1); + assert_eq!(file_metadata.row_groups[0].columns.len(), 1); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + + if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { + assert!(meta_data.size_statistics.is_some()); + if let Some(ref size_stats) = meta_data.size_statistics { + assert_eq!( + unenc_size, + size_stats.unencoded_byte_array_data_bytes.unwrap_or(0) + ) + } + } + } } From c5c07b6f68b507219ea91482bba8b93911ce3d7f Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 27 Jun 2024 13:31:53 -0700 Subject: [PATCH 09/44] test def histogram as well, rename test --- parquet/src/file/writer.rs | 50 ++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index d8b3c1ba130d..045d8e4439b9 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1398,12 +1398,19 @@ mod tests { file: W, data: Vec>, compression: Compression, + repetition: Repetition, ) -> crate::format::FileMetaData where W: Write + Send, R: ChunkReader + From + 'static, { - test_roundtrip::(file, data, |r| r.get_int(0).unwrap(), compression) + test_roundtrip::( + file, + data, + |r| r.get_int(0).unwrap(), + compression, + repetition, + ) } /// Tests roundtrip of data of type `D` written using `W` and read using `R` @@ -1413,6 +1420,7 @@ mod tests { data: Vec>, value: F, compression: Compression, + repetition: Repetition, ) -> crate::format::FileMetaData where W: Write + Send, @@ -1424,7 +1432,7 @@ mod tests { types::Type::group_type_builder("schema") .with_fields(vec![Arc::new( types::Type::primitive_type_builder("col1", 
D::get_physical_type()) - .with_repetition(Repetition::REQUIRED) + .with_repetition(repetition) .build() .unwrap(), )]) @@ -1443,9 +1451,15 @@ mod tests { let row_group_file_offset = file_writer.buf.bytes_written(); let mut row_group_writer = file_writer.next_row_group().unwrap(); if let Some(mut writer) = row_group_writer.next_column().unwrap() { + let def_vec = vec![1; subset.len()]; + let def_lvls = if repetition != Repetition::REQUIRED { + Some(def_vec.as_slice()) + } else { + None + }; rows += writer .typed::() - .write_batch(&subset[..], None, None) + .write_batch(&subset[..], def_lvls, None) .unwrap() as i64; writer.close().unwrap(); } @@ -1485,7 +1499,12 @@ mod tests { /// File write-read roundtrip. /// `data` consists of arrays of values for each row group. fn test_file_roundtrip(file: File, data: Vec>) -> crate::format::FileMetaData { - test_roundtrip_i32::(file, data, Compression::UNCOMPRESSED) + test_roundtrip_i32::( + file, + data, + Compression::UNCOMPRESSED, + Repetition::REQUIRED, + ) } #[test] @@ -1530,7 +1549,12 @@ mod tests { } fn test_bytes_roundtrip(data: Vec>, compression: Compression) { - test_roundtrip_i32::, Bytes>(Vec::with_capacity(1024), data, compression); + test_roundtrip_i32::, Bytes>( + Vec::with_capacity(1024), + data, + compression, + Repetition::REQUIRED, + ); } #[test] @@ -1541,6 +1565,7 @@ mod tests { vec![my_bool_values], |r| r.get_bool(0).unwrap(), Compression::UNCOMPRESSED, + Repetition::REQUIRED, ); } @@ -1552,6 +1577,7 @@ mod tests { vec![my_bool_values], |r| r.get_bool(0).unwrap(), Compression::SNAPPY, + Repetition::REQUIRED, ); } @@ -1838,8 +1864,9 @@ mod tests { } #[test] - fn test_unencoded_byte_array_size() { - let data = vec![ByteArrayType::gen_vec(32, 5)]; + fn test_size_statistics() { + let num_rows: i64 = 5; + let data = vec![ByteArrayType::gen_vec(32, num_rows as usize)]; let unenc_size: i64 = data[0].iter().map(|x| x.len() as i64).sum(); let file: File = tempfile::tempfile().unwrap(); let file_metadata = test_roundtrip::( @@ -1847,6 +1874,7 @@ mod tests { data, |r| r.get_bytes(0).unwrap().clone(), Compression::UNCOMPRESSED, + Repetition::OPTIONAL, ); assert_eq!(file_metadata.row_groups.len(), 1); @@ -1859,7 +1887,13 @@ mod tests { assert_eq!( unenc_size, size_stats.unencoded_byte_array_data_bytes.unwrap_or(0) - ) + ); + assert!(size_stats.repetition_level_histogram.is_none()); + assert!(size_stats.definition_level_histogram.is_some()); + if let Some(ref def_hist) = size_stats.definition_level_histogram { + assert_eq!(def_hist[0], 0); + assert_eq!(def_hist[1], num_rows); + } } } } From 6dd160ff95fc41acb301b709933d741a237d13fd Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 27 Jun 2024 13:35:42 -0700 Subject: [PATCH 10/44] add an assert --- parquet/src/file/writer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 045d8e4439b9..679c64e967fd 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1891,6 +1891,7 @@ mod tests { assert!(size_stats.repetition_level_histogram.is_none()); assert!(size_stats.definition_level_histogram.is_some()); if let Some(ref def_hist) = size_stats.definition_level_histogram { + assert_eq!(def_hist.len(), 2); assert_eq!(def_hist[0], 0); assert_eq!(def_hist[1], num_rows); } From 917b412a704910f79306663de547ab7f973957c9 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 27 Jun 2024 16:02:11 -0700 Subject: [PATCH 11/44] refactor and add test of def histogram with nulls --- parquet/src/file/writer.rs | 117 
++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 46 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 679c64e967fd..4adfe7d1cfcf 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1398,7 +1398,7 @@ mod tests { file: W, data: Vec>, compression: Compression, - repetition: Repetition, + def_levels: Option<&[i16]>, ) -> crate::format::FileMetaData where W: Write + Send, @@ -1409,7 +1409,7 @@ mod tests { data, |r| r.get_int(0).unwrap(), compression, - repetition, + def_levels, ) } @@ -1420,7 +1420,7 @@ mod tests { data: Vec>, value: F, compression: Compression, - repetition: Repetition, + def_levels: Option<&[i16]>, ) -> crate::format::FileMetaData where W: Write + Send, @@ -1428,6 +1428,10 @@ mod tests { D: DataType, F: Fn(Row) -> D::T, { + let repetition = match def_levels { + Some(_) => Repetition::OPTIONAL, + _ => Repetition::REQUIRED, + }; let schema = Arc::new( types::Type::group_type_builder("schema") .with_fields(vec![Arc::new( @@ -1451,15 +1455,9 @@ mod tests { let row_group_file_offset = file_writer.buf.bytes_written(); let mut row_group_writer = file_writer.next_row_group().unwrap(); if let Some(mut writer) = row_group_writer.next_column().unwrap() { - let def_vec = vec![1; subset.len()]; - let def_lvls = if repetition != Repetition::REQUIRED { - Some(def_vec.as_slice()) - } else { - None - }; rows += writer .typed::() - .write_batch(&subset[..], def_lvls, None) + .write_batch(&subset[..], def_levels, None) .unwrap() as i64; writer.close().unwrap(); } @@ -1474,24 +1472,28 @@ mod tests { let reader = SerializedFileReader::new(R::from(file)).unwrap(); assert_eq!(reader.num_row_groups(), data.len()); - assert_eq!( - reader.metadata().file_metadata().num_rows(), - rows, - "row count in metadata not equal to number of rows written" - ); - for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) { - let row_group_reader = reader.get_row_group(i).unwrap(); - let iter = row_group_reader.get_row_iter(None).unwrap(); - let res: Vec<_> = iter.map(|row| row.unwrap()).map(&value).collect(); - let row_group_size = row_group_reader.metadata().total_byte_size(); - let uncompressed_size: i64 = row_group_reader - .metadata() - .columns() - .iter() - .map(|v| v.uncompressed_size()) - .sum(); - assert_eq!(row_group_size, uncompressed_size); - assert_eq!(res, *item); + // Row based API does not like nulls, so skip these validation steps if nulls might + // be present. + if repetition == Repetition::REQUIRED { + assert_eq!( + reader.metadata().file_metadata().num_rows(), + rows, + "row count in metadata not equal to number of rows written" + ); + for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) { + let row_group_reader = reader.get_row_group(i).unwrap(); + let iter = row_group_reader.get_row_iter(None).unwrap(); + let res: Vec<_> = iter.map(|row| row.unwrap()).map(&value).collect(); + let row_group_size = row_group_reader.metadata().total_byte_size(); + let uncompressed_size: i64 = row_group_reader + .metadata() + .columns() + .iter() + .map(|v| v.uncompressed_size()) + .sum(); + assert_eq!(row_group_size, uncompressed_size); + assert_eq!(res, *item); + } } file_metadata } @@ -1499,12 +1501,7 @@ mod tests { /// File write-read roundtrip. /// `data` consists of arrays of values for each row group. 
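    /// Returns the metadata of the written file so tests can make further assertions on it.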
fn test_file_roundtrip(file: File, data: Vec>) -> crate::format::FileMetaData { - test_roundtrip_i32::( - file, - data, - Compression::UNCOMPRESSED, - Repetition::REQUIRED, - ) + test_roundtrip_i32::(file, data, Compression::UNCOMPRESSED, None) } #[test] @@ -1549,12 +1546,7 @@ mod tests { } fn test_bytes_roundtrip(data: Vec>, compression: Compression) { - test_roundtrip_i32::, Bytes>( - Vec::with_capacity(1024), - data, - compression, - Repetition::REQUIRED, - ); + test_roundtrip_i32::, Bytes>(Vec::with_capacity(1024), data, compression, None); } #[test] @@ -1565,7 +1557,7 @@ mod tests { vec![my_bool_values], |r| r.get_bool(0).unwrap(), Compression::UNCOMPRESSED, - Repetition::REQUIRED, + None, ); } @@ -1577,7 +1569,7 @@ mod tests { vec![my_bool_values], |r| r.get_bool(0).unwrap(), Compression::SNAPPY, - Repetition::REQUIRED, + None, ); } @@ -1865,16 +1857,19 @@ mod tests { #[test] fn test_size_statistics() { - let num_rows: i64 = 5; - let data = vec![ByteArrayType::gen_vec(32, num_rows as usize)]; + let num_rows: usize = 5; + let data = vec![ByteArrayType::gen_vec(32, num_rows)]; let unenc_size: i64 = data[0].iter().map(|x| x.len() as i64).sum(); let file: File = tempfile::tempfile().unwrap(); + let def_vec = vec![1; num_rows]; + let def_levels = Some(def_vec.as_slice()); + let file_metadata = test_roundtrip::( file, data, |r| r.get_bytes(0).unwrap().clone(), Compression::UNCOMPRESSED, - Repetition::OPTIONAL, + def_levels, ); assert_eq!(file_metadata.row_groups.len(), 1); @@ -1893,7 +1888,37 @@ mod tests { if let Some(ref def_hist) = size_stats.definition_level_histogram { assert_eq!(def_hist.len(), 2); assert_eq!(def_hist[0], 0); - assert_eq!(def_hist[1], num_rows); + assert_eq!(def_hist[1], num_rows as i64); + } + } + } + } + + #[test] + fn test_size_statistics_with_nulls() { + let def_levels = [1, 1, 0, 1, 0]; + let data = vec![vec![1, 2, 3, 4, 5]]; + let file: File = tempfile::tempfile().unwrap(); + let file_metadata = test_roundtrip_i32::( + file, + data, + Compression::UNCOMPRESSED, + Some(&def_levels), + ); + + assert_eq!(file_metadata.row_groups.len(), 1); + assert_eq!(file_metadata.row_groups[0].columns.len(), 1); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + + if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { + assert!(meta_data.size_statistics.is_some()); + if let Some(ref size_stats) = meta_data.size_statistics { + assert!(size_stats.repetition_level_histogram.is_none()); + assert!(size_stats.definition_level_histogram.is_some()); + if let Some(ref def_hist) = size_stats.definition_level_histogram { + assert_eq!(def_hist.len(), 2); + assert_eq!(def_hist[0], 2); // two nulls + assert_eq!(def_hist[1], 3); // three non-null } } } From f8961a3483f16cb99dad5b12e8606a76a34d5e77 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 27 Jun 2024 17:48:32 -0700 Subject: [PATCH 12/44] add test of repetition level histogram --- parquet/src/file/writer.rs | 54 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 4adfe7d1cfcf..be887ab88490 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1923,4 +1923,58 @@ mod tests { } } } + + #[test] + fn test_size_statistics_with_repetition() { + let message_type = " + message test_schema { + OPTIONAL group i32_list (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let data = [1, 2, 3, 
4, 5]; + let def_levels = [3, 3, 3, 3, 3]; + let rep_levels = [0, 1, 1, 0, 1]; + let file = tempfile::tempfile().unwrap(); + let props = Default::default(); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), Some(&rep_levels)) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.finish().unwrap(); + + assert_eq!(file_metadata.row_groups.len(), 1); + assert_eq!(file_metadata.row_groups[0].columns.len(), 1); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + + if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { + assert!(meta_data.size_statistics.is_some()); + if let Some(ref size_stats) = meta_data.size_statistics { + assert!(size_stats.repetition_level_histogram.is_some()); + assert!(size_stats.definition_level_histogram.is_some()); + if let Some(ref def_hist) = size_stats.definition_level_histogram { + assert_eq!(def_hist.len(), 4); + assert_eq!(def_hist[0], 0); + assert_eq!(def_hist[1], 0); + assert_eq!(def_hist[2], 0); + assert_eq!(def_hist[3], 5); + } + if let Some(ref rep_hist) = size_stats.repetition_level_histogram { + assert_eq!(rep_hist.len(), 2); + assert_eq!(rep_hist[0], 2); + assert_eq!(rep_hist[1], 3); + } + } + } + } } From 73fa0994bbb2ba1795fdb3b050a66ca59864f403 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 28 Jun 2024 08:48:23 -0700 Subject: [PATCH 13/44] revert changes to test_roundtrip --- parquet/src/file/writer.rs | 132 +++++++++++-------------------------- 1 file changed, 40 insertions(+), 92 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index be887ab88490..5d0c4d0eb3e2 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1398,19 +1398,12 @@ mod tests { file: W, data: Vec>, compression: Compression, - def_levels: Option<&[i16]>, ) -> crate::format::FileMetaData where W: Write + Send, R: ChunkReader + From + 'static, { - test_roundtrip::( - file, - data, - |r| r.get_int(0).unwrap(), - compression, - def_levels, - ) + test_roundtrip::(file, data, |r| r.get_int(0).unwrap(), compression) } /// Tests roundtrip of data of type `D` written using `W` and read using `R` @@ -1420,7 +1413,6 @@ mod tests { data: Vec>, value: F, compression: Compression, - def_levels: Option<&[i16]>, ) -> crate::format::FileMetaData where W: Write + Send, @@ -1428,15 +1420,11 @@ mod tests { D: DataType, F: Fn(Row) -> D::T, { - let repetition = match def_levels { - Some(_) => Repetition::OPTIONAL, - _ => Repetition::REQUIRED, - }; let schema = Arc::new( types::Type::group_type_builder("schema") .with_fields(vec![Arc::new( types::Type::primitive_type_builder("col1", D::get_physical_type()) - .with_repetition(repetition) + .with_repetition(Repetition::REQUIRED) .build() .unwrap(), )]) @@ -1457,7 +1445,7 @@ mod tests { if let Some(mut writer) = row_group_writer.next_column().unwrap() { rows += writer .typed::() - .write_batch(&subset[..], def_levels, None) + .write_batch(&subset[..], None, None) .unwrap() as i64; writer.close().unwrap(); } @@ -1472,28 +1460,24 @@ mod tests { let reader = SerializedFileReader::new(R::from(file)).unwrap(); assert_eq!(reader.num_row_groups(), data.len()); - // Row based API does not like nulls, so skip these validation steps if nulls might - // be present. 
- if repetition == Repetition::REQUIRED { - assert_eq!( - reader.metadata().file_metadata().num_rows(), - rows, - "row count in metadata not equal to number of rows written" - ); - for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) { - let row_group_reader = reader.get_row_group(i).unwrap(); - let iter = row_group_reader.get_row_iter(None).unwrap(); - let res: Vec<_> = iter.map(|row| row.unwrap()).map(&value).collect(); - let row_group_size = row_group_reader.metadata().total_byte_size(); - let uncompressed_size: i64 = row_group_reader - .metadata() - .columns() - .iter() - .map(|v| v.uncompressed_size()) - .sum(); - assert_eq!(row_group_size, uncompressed_size); - assert_eq!(res, *item); - } + assert_eq!( + reader.metadata().file_metadata().num_rows(), + rows, + "row count in metadata not equal to number of rows written" + ); + for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) { + let row_group_reader = reader.get_row_group(i).unwrap(); + let iter = row_group_reader.get_row_iter(None).unwrap(); + let res: Vec<_> = iter.map(|row| row.unwrap()).map(&value).collect(); + let row_group_size = row_group_reader.metadata().total_byte_size(); + let uncompressed_size: i64 = row_group_reader + .metadata() + .columns() + .iter() + .map(|v| v.uncompressed_size()) + .sum(); + assert_eq!(row_group_size, uncompressed_size); + assert_eq!(res, *item); } file_metadata } @@ -1501,7 +1485,7 @@ mod tests { /// File write-read roundtrip. /// `data` consists of arrays of values for each row group. fn test_file_roundtrip(file: File, data: Vec>) -> crate::format::FileMetaData { - test_roundtrip_i32::(file, data, Compression::UNCOMPRESSED, None) + test_roundtrip_i32::(file, data, Compression::UNCOMPRESSED) } #[test] @@ -1546,7 +1530,7 @@ mod tests { } fn test_bytes_roundtrip(data: Vec>, compression: Compression) { - test_roundtrip_i32::, Bytes>(Vec::with_capacity(1024), data, compression, None); + test_roundtrip_i32::, Bytes>(Vec::with_capacity(1024), data, compression); } #[test] @@ -1557,7 +1541,6 @@ mod tests { vec![my_bool_values], |r| r.get_bool(0).unwrap(), Compression::UNCOMPRESSED, - None, ); } @@ -1569,7 +1552,6 @@ mod tests { vec![my_bool_values], |r| r.get_bool(0).unwrap(), Compression::SNAPPY, - None, ); } @@ -1856,20 +1838,17 @@ mod tests { } #[test] - fn test_size_statistics() { + fn test_byte_array_size_statistics() { let num_rows: usize = 5; let data = vec![ByteArrayType::gen_vec(32, num_rows)]; let unenc_size: i64 = data[0].iter().map(|x| x.len() as i64).sum(); let file: File = tempfile::tempfile().unwrap(); - let def_vec = vec![1; num_rows]; - let def_levels = Some(def_vec.as_slice()); let file_metadata = test_roundtrip::( file, data, |r| r.get_bytes(0).unwrap().clone(), Compression::UNCOMPRESSED, - def_levels, ); assert_eq!(file_metadata.row_groups.len(), 1); @@ -1883,49 +1862,12 @@ mod tests { unenc_size, size_stats.unencoded_byte_array_data_bytes.unwrap_or(0) ); - assert!(size_stats.repetition_level_histogram.is_none()); - assert!(size_stats.definition_level_histogram.is_some()); - if let Some(ref def_hist) = size_stats.definition_level_histogram { - assert_eq!(def_hist.len(), 2); - assert_eq!(def_hist[0], 0); - assert_eq!(def_hist[1], num_rows as i64); - } - } - } - } - - #[test] - fn test_size_statistics_with_nulls() { - let def_levels = [1, 1, 0, 1, 0]; - let data = vec![vec![1, 2, 3, 4, 5]]; - let file: File = tempfile::tempfile().unwrap(); - let file_metadata = test_roundtrip_i32::( - file, - data, - Compression::UNCOMPRESSED, - Some(&def_levels), - 
); - - assert_eq!(file_metadata.row_groups.len(), 1); - assert_eq!(file_metadata.row_groups[0].columns.len(), 1); - assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); - - if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { - assert!(meta_data.size_statistics.is_some()); - if let Some(ref size_stats) = meta_data.size_statistics { - assert!(size_stats.repetition_level_histogram.is_none()); - assert!(size_stats.definition_level_histogram.is_some()); - if let Some(ref def_hist) = size_stats.definition_level_histogram { - assert_eq!(def_hist.len(), 2); - assert_eq!(def_hist[0], 2); // two nulls - assert_eq!(def_hist[1], 3); // three non-null - } } } } #[test] - fn test_size_statistics_with_repetition() { + fn test_size_statistics_with_repetition_and_nulls() { let message_type = " message test_schema { OPTIONAL group i32_list (LIST) { @@ -1935,10 +1877,16 @@ mod tests { } } "; + // column is: + // row 0: [1, 2] + // row 1: NULL + // row 2: [4, NULL] + // row 3: [] + // row 4: [7, 8, 9, 10] let schema = Arc::new(parse_message_type(message_type).unwrap()); - let data = [1, 2, 3, 4, 5]; - let def_levels = [3, 3, 3, 3, 3]; - let rep_levels = [0, 1, 1, 0, 1]; + let data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + let def_levels = [3, 3, 0, 3, 2, 1, 3, 3, 3, 3]; + let rep_levels = [0, 1, 0, 0, 1, 0, 0, 1, 1, 1]; let file = tempfile::tempfile().unwrap(); let props = Default::default(); let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); @@ -1964,15 +1912,15 @@ mod tests { assert!(size_stats.definition_level_histogram.is_some()); if let Some(ref def_hist) = size_stats.definition_level_histogram { assert_eq!(def_hist.len(), 4); - assert_eq!(def_hist[0], 0); - assert_eq!(def_hist[1], 0); - assert_eq!(def_hist[2], 0); - assert_eq!(def_hist[3], 5); + assert_eq!(def_hist[0], 1); + assert_eq!(def_hist[1], 1); + assert_eq!(def_hist[2], 1); + assert_eq!(def_hist[3], 7); } if let Some(ref rep_hist) = size_stats.repetition_level_histogram { assert_eq!(rep_hist.len(), 2); - assert_eq!(rep_hist[0], 2); - assert_eq!(rep_hist[1], 3); + assert_eq!(rep_hist[0], 5); + assert_eq!(rep_hist[1], 5); } } } From 00ca596b8e1eb4a952c67f48ed68883c84361a35 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 1 Jul 2024 14:30:36 -0700 Subject: [PATCH 14/44] suggestion from review Co-authored-by: Andrew Lamb --- parquet/src/file/metadata.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index ec4d237b5284..84d763b2528d 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -668,7 +668,7 @@ impl ColumnChunkMetaData { Some(offset..(offset + length)) } - /// Returns the number of bytes of variable length data. + /// Returns the number of bytes of variable length data after decoding pub fn unencoded_byte_array_data_bytes(&self) -> Option { self.unencoded_byte_array_data_bytes } From 6acc50059fca5e535102034e058f4a53064380c7 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 1 Jul 2024 15:07:22 -0700 Subject: [PATCH 15/44] add to documentation as suggested in review --- parquet/src/file/metadata.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 84d763b2528d..44bd14d8310f 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -674,11 +674,15 @@ impl ColumnChunkMetaData { } /// Returns the repetition level histogram. 
+ /// + /// The returned value `vec[i]` is how many values are at repetition level `i`. pub fn repetition_level_histogram(&self) -> Option<&Vec> { self.repetition_level_histogram.as_ref() } - /// Returns the repetition level histogram. + /// Returns the definition level histogram. + /// + /// The returned value `vec[i]` is how many values are at definition level `i`. pub fn definition_level_histogram(&self) -> Option<&Vec> { self.definition_level_histogram.as_ref() } From 787e3e8f74fb537e378bf419875959556597526a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 1 Jul 2024 20:29:33 -0700 Subject: [PATCH 16/44] make histograms optional --- parquet/src/file/metadata.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 44bd14d8310f..20a03f4a6f8f 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -988,8 +988,8 @@ pub struct ColumnIndexBuilder { max_values: Vec>, null_counts: Vec, boundary_order: BoundaryOrder, - repetition_level_histograms: Vec, - definition_level_histograms: Vec, + repetition_level_histograms: Option>, + definition_level_histograms: Option>, // If one page can't get build index, need to ignore all index in this column valid: bool, } @@ -1008,8 +1008,8 @@ impl ColumnIndexBuilder { max_values: Vec::new(), null_counts: Vec::new(), boundary_order: BoundaryOrder::UNORDERED, - repetition_level_histograms: Vec::new(), - definition_level_histograms: Vec::new(), + repetition_level_histograms: None, + definition_level_histograms: None, valid: true, } } @@ -1033,12 +1033,14 @@ impl ColumnIndexBuilder { definition_level_histogram: &Option>, ) { if let Some(ref rep_lvl_hist) = repetition_level_histogram { - self.repetition_level_histograms.reserve(rep_lvl_hist.len()); - self.repetition_level_histograms.extend(rep_lvl_hist); + let hist = self.repetition_level_histograms.get_or_insert(Vec::new()); + hist.reserve(rep_lvl_hist.len()); + hist.extend(rep_lvl_hist); } if let Some(ref def_lvl_hist) = definition_level_histogram { - self.definition_level_histograms.reserve(def_lvl_hist.len()); - self.definition_level_histograms.extend(def_lvl_hist); + let hist = self.definition_level_histograms.get_or_insert(Vec::new()); + hist.reserve(def_lvl_hist.len()); + hist.extend(def_lvl_hist); } } From 46851f458cb78e692f55c7b0b8520f5d0c8a2da2 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 1 Jul 2024 20:30:38 -0700 Subject: [PATCH 17/44] add histograms to PageIndex --- parquet/src/file/page_index/index.rs | 77 +++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 71fb47afa960..ef341602096b 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -41,6 +41,10 @@ pub struct PageIndex { pub max: Option, /// Null values in the page pub null_count: Option, + /// Repetition level histogram for the page + pub repetition_level_histogram: Option>, + /// Definition level histogram for the page + pub definition_level_histogram: Option>, } impl PageIndex { @@ -53,6 +57,12 @@ impl PageIndex { pub fn null_count(&self) -> Option { self.null_count } + pub fn repetition_level_histogram(&self) -> Option<&Vec> { + self.repetition_level_histogram.as_ref() + } + pub fn definition_level_histogram(&self) -> Option<&Vec> { + self.definition_level_histogram.as_ref() + } } impl PageIndex @@ -141,26 +151,57 @@ impl NativeIndex { .map(|x| 
x.into_iter().map(Some).collect::>()) .unwrap_or_else(|| vec![None; len]); + // histograms are a 1D array encoding a 2D num_pages X num_levels matrix. + let to_page_histograms = |opt_hist: Option>| { + if let Some(hist) = opt_hist { + // TODO: should we assert (hist.len() % len) == 0? + let num_levels = hist.len() / len; + let mut res = Vec::with_capacity(len); + for i in 0..len { + let page_idx = i * num_levels; + let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); + res[i] = Some(page_hist); + } + res + } else { + vec![None; len] + } + }; + + let rep_hists: Vec>> = + to_page_histograms(index.repetition_level_histograms); + let def_hists: Vec>> = + to_page_histograms(index.definition_level_histograms); + let indexes = index .min_values .iter() .zip(index.max_values.into_iter()) .zip(index.null_pages.into_iter()) .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - let min = min.as_slice(); - let max = max.as_slice(); - (Some(from_le_slice::(min)), Some(from_le_slice::(max))) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) + .zip(rep_hists.into_iter()) + .zip(def_hists.into_iter()) + .map( + |( + ((((min, max), is_null), null_count), repetition_level_histogram), + definition_level_histogram, + )| { + let (min, max) = if is_null { + (None, None) + } else { + let min = min.as_slice(); + let max = max.as_slice(); + (Some(from_le_slice::(min)), Some(from_le_slice::(max))) + }; + Ok(PageIndex { + min, + max, + null_count, + repetition_level_histogram, + definition_level_histogram, + }) + }, + ) .collect::, ParquetError>>()?; Ok(Self { @@ -180,6 +221,8 @@ mod tests { min: Some(-123), max: Some(234), null_count: Some(0), + repetition_level_histogram: Some(vec![1, 2]), + definition_level_histogram: Some(vec![1, 2, 3]), }; assert_eq!(page_index.min().unwrap(), &-123); @@ -187,6 +230,8 @@ mod tests { assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes()); assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes()); assert_eq!(page_index.null_count().unwrap(), 0); + assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2])); + assert_eq!(page_index.definition_level_histogram(), Some(&vec![1, 2, 3])); } #[test] @@ -195,6 +240,8 @@ mod tests { min: None, max: None, null_count: None, + repetition_level_histogram: None, + definition_level_histogram: None, }; assert_eq!(page_index.min(), None); @@ -202,5 +249,7 @@ mod tests { assert_eq!(page_index.min_bytes(), None); assert_eq!(page_index.max_bytes(), None); assert_eq!(page_index.null_count(), None); + assert_eq!(page_index.repetition_level_histogram(), None); + assert_eq!(page_index.definition_level_histogram(), None); } } From 4f8487b157f9d9826c066b431c989f831618ef12 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 1 Jul 2024 21:10:37 -0700 Subject: [PATCH 18/44] use Vec::push() --- parquet/src/file/page_index/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index ef341602096b..eb7014ee9b0f 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -160,7 +160,7 @@ impl NativeIndex { for i in 0..len { let page_idx = i * num_levels; let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); - res[i] = Some(page_hist); + res.push(Some(page_hist)); } res } else { From 903b06bffe5f8bc12d80ab1914d99d0412e728ea Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 1 Jul 2024 23:29:32 -0700 
Subject: [PATCH 19/44] formatting --- parquet/src/file/page_index/index.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index eb7014ee9b0f..7ad45c2e1000 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -231,7 +231,10 @@ mod tests { assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes()); assert_eq!(page_index.null_count().unwrap(), 0); assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2])); - assert_eq!(page_index.definition_level_histogram(), Some(&vec![1, 2, 3])); + assert_eq!( + page_index.definition_level_histogram(), + Some(&vec![1, 2, 3]) + ); } #[test] From fa89836a88521bdcc571aeecc58e01d8813010a7 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 1 Jul 2024 23:30:01 -0700 Subject: [PATCH 20/44] check size stats in read metadata --- parquet/src/file/writer.rs | 72 ++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 11 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 5d0c4d0eb3e2..fda21eec75aa 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1888,8 +1888,12 @@ mod tests { let def_levels = [3, 3, 0, 3, 2, 1, 3, 3, 3, 3]; let rep_levels = [0, 1, 0, 0, 1, 0, 0, 1, 1, 1]; let file = tempfile::tempfile().unwrap(); - let props = Default::default(); - let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(), + ); + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); let mut row_group_writer = writer.next_row_group().unwrap(); let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); @@ -1899,30 +1903,76 @@ mod tests { .unwrap(); col_writer.close().unwrap(); row_group_writer.close().unwrap(); - let file_metadata = writer.finish().unwrap(); + let file_metadata = writer.close().unwrap(); assert_eq!(file_metadata.row_groups.len(), 1); assert_eq!(file_metadata.row_groups[0].columns.len(), 1); assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let check_def_hist = |def_hist: &Vec| { + assert_eq!(def_hist.len(), 4); + assert_eq!(def_hist[0], 1); + assert_eq!(def_hist[1], 1); + assert_eq!(def_hist[2], 1); + assert_eq!(def_hist[3], 7); + }; + + let check_rep_hist = |rep_hist: &Vec| { + assert_eq!(rep_hist.len(), 2); + assert_eq!(rep_hist[0], 5); + assert_eq!(rep_hist[1], 5); + }; + if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { assert!(meta_data.size_statistics.is_some()); if let Some(ref size_stats) = meta_data.size_statistics { assert!(size_stats.repetition_level_histogram.is_some()); assert!(size_stats.definition_level_histogram.is_some()); if let Some(ref def_hist) = size_stats.definition_level_histogram { - assert_eq!(def_hist.len(), 4); - assert_eq!(def_hist[0], 1); - assert_eq!(def_hist[1], 1); - assert_eq!(def_hist[2], 1); - assert_eq!(def_hist[3], 7); + check_def_hist(def_hist) } if let Some(ref rep_hist) = size_stats.repetition_level_histogram { - assert_eq!(rep_hist.len(), 2); - assert_eq!(rep_hist[0], 5); - assert_eq!(rep_hist[1], 5); + check_rep_hist(rep_hist) } } } + + // check that the read metadata is also correct + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + + let rfile_metadata = 
reader.metadata().file_metadata(); + assert_eq!(rfile_metadata.num_rows(), file_metadata.num_rows); + assert_eq!(reader.num_row_groups(), 1); + let rowgroup = reader.get_row_group(0).unwrap(); + assert_eq!(rowgroup.num_columns(), 1); + let column = rowgroup.metadata().column(0); + assert!(column.definition_level_histogram().is_some()); + assert!(column.repetition_level_histogram().is_some()); + if let Some(def_hist) = column.definition_level_histogram() { + check_def_hist(def_hist) + } + if let Some(rep_hist) = column.repetition_level_histogram() { + check_rep_hist(rep_hist) + } + + // check histogram in column index as well + assert!(reader.metadata().column_index().is_some()); + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); + assert_eq!(column_index[0].len(), 1); + let col_idx = if let Index::INT32(index) = &column_index[0][0] { + assert_eq!(index.indexes.len(), 1); + &index.indexes[0] + } else { + unreachable!() + }; + + if let Some(def_hist) = col_idx.definition_level_histogram() { + check_def_hist(def_hist) + } + if let Some(rep_hist) = col_idx.repetition_level_histogram() { + check_rep_hist(rep_hist) + } } } From 2800cc72bcf60cc63c334aa8ee1519223a6c8721 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 2 Jul 2024 11:21:20 -0700 Subject: [PATCH 21/44] check unencoded_byte_array_data_bytes is not set for int cols --- parquet/src/file/writer.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index fda21eec75aa..2c7482aa95cb 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1923,11 +1923,14 @@ mod tests { assert_eq!(rep_hist[1], 5); }; + // check that histograms are set properly in the write and read metadata + // also check that unencoded_byte_array_data_bytes is not set if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { assert!(meta_data.size_statistics.is_some()); if let Some(ref size_stats) = meta_data.size_statistics { assert!(size_stats.repetition_level_histogram.is_some()); assert!(size_stats.definition_level_histogram.is_some()); + assert!(size_stats.unencoded_byte_array_data_bytes.is_none()); if let Some(ref def_hist) = size_stats.definition_level_histogram { check_def_hist(def_hist) } @@ -1949,6 +1952,7 @@ mod tests { let column = rowgroup.metadata().column(0); assert!(column.definition_level_histogram().is_some()); assert!(column.repetition_level_histogram().is_some()); + assert!(column.unencoded_byte_array_data_bytes().is_none()); if let Some(def_hist) = column.definition_level_histogram() { check_def_hist(def_hist) } @@ -1974,5 +1978,7 @@ mod tests { if let Some(rep_hist) = col_idx.repetition_level_histogram() { check_rep_hist(rep_hist) } + + // TODO check no unencoded_byte_array_data_bytes in offset index } } From 95a053571cb7eb5bf2bd8906ec9dc4850378574b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 2 Jul 2024 14:01:53 -0700 Subject: [PATCH 22/44] rewrite test_byte_array_size_statistics() to not use test_roundtrip() --- parquet/src/file/writer.rs | 83 +++++++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 10 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 2c7482aa95cb..f87b48977569 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1839,31 +1839,94 @@ mod tests { #[test] fn test_byte_array_size_statistics() { - let num_rows: usize = 5; - let data = vec![ByteArrayType::gen_vec(32, num_rows)]; - let unenc_size: i64 = 
data[0].iter().map(|x| x.len() as i64).sum(); + let message_type = " + message test_schema { + OPTIONAL BYTE_ARRAY a (UTF8); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let data = ByteArrayType::gen_vec(32, 7); + let def_levels = [1, 1, 1, 1, 0, 1, 0, 1, 0, 1]; + let unenc_size: i64 = data.iter().map(|x| x.len() as i64).sum(); let file: File = tempfile::tempfile().unwrap(); - - let file_metadata = test_roundtrip::( - file, - data, - |r| r.get_bytes(0).unwrap().clone(), - Compression::UNCOMPRESSED, + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(), ); + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), None) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + assert_eq!(file_metadata.row_groups.len(), 1); assert_eq!(file_metadata.row_groups[0].columns.len(), 1); assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let check_def_hist = |def_hist: &Vec| { + assert_eq!(def_hist.len(), 2); + assert_eq!(def_hist[0], 3); + assert_eq!(def_hist[1], 7); + }; + if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { assert!(meta_data.size_statistics.is_some()); if let Some(ref size_stats) = meta_data.size_statistics { + assert!(size_stats.repetition_level_histogram.is_none()); + assert!(size_stats.definition_level_histogram.is_some()); + assert!(size_stats.unencoded_byte_array_data_bytes.is_some()); assert_eq!( unenc_size, size_stats.unencoded_byte_array_data_bytes.unwrap_or(0) ); + if let Some(ref def_hist) = size_stats.definition_level_histogram { + check_def_hist(def_hist) + } } } + + // check that the read metadata is also correct + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + + let rfile_metadata = reader.metadata().file_metadata(); + assert_eq!(rfile_metadata.num_rows(), file_metadata.num_rows); + assert_eq!(reader.num_row_groups(), 1); + let rowgroup = reader.get_row_group(0).unwrap(); + assert_eq!(rowgroup.num_columns(), 1); + let column = rowgroup.metadata().column(0); + assert!(column.definition_level_histogram().is_some()); + assert!(column.repetition_level_histogram().is_none()); + assert!(column.unencoded_byte_array_data_bytes().is_some()); + if let Some(def_hist) = column.definition_level_histogram() { + check_def_hist(def_hist) + } + assert_eq!(unenc_size, column.unencoded_byte_array_data_bytes().unwrap_or(0)); + + // check histogram in column index as well + assert!(reader.metadata().column_index().is_some()); + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); + assert_eq!(column_index[0].len(), 1); + let col_idx = if let Index::BYTE_ARRAY(index) = &column_index[0][0] { + assert_eq!(index.indexes.len(), 1); + &index.indexes[0] + } else { + unreachable!() + }; + + assert!(col_idx.repetition_level_histogram().is_none()); + if let Some(def_hist) = col_idx.definition_level_histogram() { + check_def_hist(def_hist) + } } #[test] @@ -1884,7 +1947,7 @@ mod tests { // row 3: [] // row 4: [7, 8, 9, 10] let schema = Arc::new(parse_message_type(message_type).unwrap()); - let data = [1, 2, 3, 4, 5, 6, 7, 
8, 9, 10]; + let data = [1, 2, 4, 7, 8, 9, 10]; let def_levels = [3, 3, 0, 3, 2, 1, 3, 3, 3, 3]; let rep_levels = [0, 1, 0, 0, 1, 0, 0, 1, 1, 1]; let file = tempfile::tempfile().unwrap(); From fc66a59c54bae11823d7cc843defe2eb2851e523 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 2 Jul 2024 14:46:43 -0700 Subject: [PATCH 23/44] add unencoded_byte_array_data_bytes support in page index --- parquet/src/arrow/async_reader/mod.rs | 1 + parquet/src/file/metadata.rs | 15 +++++++ parquet/src/file/page_index/index_reader.rs | 44 +++++++++++++++++++++ parquet/src/file/serialized_reader.rs | 9 +++++ parquet/src/file/writer.rs | 27 ++++++++++++- 5 files changed, 94 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 0a72583b90d0..eac8f33745b1 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -1538,6 +1538,7 @@ mod tests { vec![row_group_meta], None, Some(vec![offset_index.clone()]), + None, ); let metadata = Arc::new(metadata); diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 20a03f4a6f8f..63bf215811e0 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -93,6 +93,8 @@ pub struct ParquetMetaData { column_index: Option, /// Offset index for all each page in each column chunk offset_index: Option, + /// `unencoded_byte_array_data_bytes` from the offset index + unencoded_byte_array_data_bytes: Option>>>>, } impl ParquetMetaData { @@ -104,6 +106,7 @@ impl ParquetMetaData { row_groups, column_index: None, offset_index: None, + unencoded_byte_array_data_bytes: None, } } @@ -114,12 +117,14 @@ impl ParquetMetaData { row_groups: Vec, column_index: Option, offset_index: Option, + unencoded_byte_array_data_bytes: Option>>>>, ) -> Self { ParquetMetaData { file_metadata, row_groups, column_index, offset_index, + unencoded_byte_array_data_bytes, } } @@ -176,6 +181,16 @@ impl ParquetMetaData { self.offset_index.as_ref() } + /// Returns `unencoded_byte_array_data_bytes` from the offset indexes in this file, if loaded + /// + /// Returns `None` if the parquet file does not have a `OffsetIndex` or + /// [ArrowReaderOptions::with_page_index] was set to false. + /// + /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index + pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec>>>> { + self.unencoded_byte_array_data_bytes.as_ref() + } + /// Override the column index #[cfg(feature = "arrow")] pub(crate) fn set_column_index(&mut self, index: Option) { diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 2ddf826fb022..0915b3dcc6f5 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -109,6 +109,50 @@ pub fn read_pages_locations( .collect() } +/// Reads [`OffsetIndex`], per-page [`unencoded_byte_array_data_bytes`] for all columns of a row +/// group. +/// +/// Returns a vector of `unencoded_byte_array_data_bytes[column_number][page_number]` +/// +/// Return an empty vector if this row group does not contain an +/// [`OffsetIndex]`. +/// +/// See [Column Index Documentation] for more details. 
+/// +/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +pub fn read_unencoded_byte_array_data_bytes( + reader: &R, + chunks: &[ColumnChunkMetaData], +) -> Result>>, ParquetError> { + let fetch = chunks + .iter() + .fold(None, |range, c| acc_range(range, c.offset_index_range())); + + let fetch = match fetch { + Some(r) => r, + None => return Ok(vec![]), + }; + + let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; + let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; + + chunks + .iter() + .map(|c| match c.offset_index_range() { + Some(r) => decode_unencoded_byte_array_data_bytes(get(r)), + None => Err(general_err!("missing offset index")), + }) + .collect() +} + +pub(crate) fn decode_unencoded_byte_array_data_bytes( + data: &[u8], +) -> Result>, ParquetError> { + let mut prot = TCompactSliceInputProtocol::new(data); + let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; + Ok(offset.unencoded_byte_array_data_bytes) +} + pub(crate) fn decode_offset_index(data: &[u8]) -> Result, ParquetError> { let mut prot = TCompactSliceInputProtocol::new(data); let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index ac7d2d287488..62a4dc49d7c0 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -211,12 +211,20 @@ impl SerializedFileReader { if options.enable_page_index { let mut columns_indexes = vec![]; let mut offset_indexes = vec![]; + let mut unenc_byte_sizes = vec![]; for rg in &mut filtered_row_groups { let column_index = index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; let offset_index = index_reader::read_pages_locations(&chunk_reader, rg.columns())?; + // TODO the following should be in an `OffsetIndex` struct along with the page + // locations. For now keeping them separate to avoid breaking API changes. 
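+                // Note: this decodes each column's OffsetIndex a second time, since
+                // read_pages_locations above already parsed the same bytes.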
+ let unenc_bytes = index_reader::read_unencoded_byte_array_data_bytes( + &chunk_reader, + rg.columns(), + )?; columns_indexes.push(column_index); offset_indexes.push(offset_index); + unenc_byte_sizes.push(unenc_bytes); } Ok(Self { @@ -226,6 +234,7 @@ impl SerializedFileReader { filtered_row_groups, Some(columns_indexes), Some(offset_indexes), + Some(unenc_byte_sizes), )), props: Arc::new(options.props), }) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index f87b48977569..06d3ebd9f645 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1909,7 +1909,10 @@ mod tests { if let Some(def_hist) = column.definition_level_histogram() { check_def_hist(def_hist) } - assert_eq!(unenc_size, column.unencoded_byte_array_data_bytes().unwrap_or(0)); + assert_eq!( + unenc_size, + column.unencoded_byte_array_data_bytes().unwrap_or(0) + ); // check histogram in column index as well assert!(reader.metadata().column_index().is_some()); @@ -1927,6 +1930,19 @@ mod tests { if let Some(def_hist) = col_idx.definition_level_histogram() { check_def_hist(def_hist) } + + assert!(reader + .metadata() + .unencoded_byte_array_data_bytes() + .is_some()); + let unenc_sizes = reader.metadata().unencoded_byte_array_data_bytes().unwrap(); + assert_eq!(unenc_sizes.len(), 1); + assert_eq!(unenc_sizes[0].len(), 1); + assert!(unenc_sizes[0][0].is_some()); + if let Some(page_sizes) = &unenc_sizes[0][0] { + assert_eq!(page_sizes.len(), 1); + assert_eq!(page_sizes[0], unenc_size); + } } #[test] @@ -2042,6 +2058,13 @@ mod tests { check_rep_hist(rep_hist) } - // TODO check no unencoded_byte_array_data_bytes in offset index + assert!(reader + .metadata() + .unencoded_byte_array_data_bytes() + .is_some()); + let unenc_sizes = reader.metadata().unencoded_byte_array_data_bytes().unwrap(); + assert_eq!(unenc_sizes.len(), 1); + assert_eq!(unenc_sizes[0].len(), 1); + assert!(unenc_sizes[0][0].is_none()); } } From 7be97e52bc7014295044fe1b276a57d644914f66 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 2 Jul 2024 15:06:16 -0700 Subject: [PATCH 24/44] update expected sizes to account for new stats --- parquet/src/file/metadata/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 533b14d2abef..4a792b183764 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1408,7 +1408,7 @@ mod tests { column_orders, ); let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta.clone()); - let base_expected_size = 1320; + let base_expected_size = 1472; assert_eq!(parquet_meta.memory_size(), base_expected_size); let mut column_index = ColumnIndexBuilder::new(); @@ -1428,7 +1428,7 @@ mod tests { None, ); - let bigger_expected_size = 2304; + let bigger_expected_size = 2776; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); From f5ab47bf52db1c9fe31767fcf50493f2f9532447 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 3 Jul 2024 11:05:21 -0700 Subject: [PATCH 25/44] only write SizeStatistics in ColumnMetaData if statistics are enabled --- parquet/src/column/writer/mod.rs | 15 ++++++++++----- parquet/src/file/metadata/mod.rs | 19 ++++++++++++++----- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 784724c8dc1d..54d4e2fa5295 100644 --- a/parquet/src/column/writer/mod.rs +++ 
b/parquet/src/column/writer/mod.rs @@ -1037,10 +1037,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .set_total_uncompressed_size(total_uncompressed_size) .set_num_values(num_values) .set_data_page_offset(data_page_offset) - .set_dictionary_page_offset(dict_page_offset) - .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes) - .set_repetition_level_histogram(self.column_metrics.repetition_level_histogram.take()) - .set_definition_level_histogram(self.column_metrics.definition_level_histogram.take()); + .set_dictionary_page_offset(dict_page_offset); if self.statistics_enabled != EnabledStatistics::None { let backwards_compatible_min_max = self.descr.sort_order().is_signed(); @@ -1103,7 +1100,15 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { stats => stats, }; - builder = builder.set_statistics(statistics); + builder = builder + .set_statistics(statistics) + .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes) + .set_repetition_level_histogram( + self.column_metrics.repetition_level_histogram.take(), + ) + .set_definition_level_histogram( + self.column_metrics.definition_level_histogram.take(), + ); } let metadata = builder.build()?; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 4a792b183764..7245e882dc20 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -824,6 +824,19 @@ impl ColumnChunkMetaData { /// Method to convert to Thrift `ColumnMetaData` pub fn to_column_metadata_thrift(&self) -> ColumnMetaData { + let size_statistics = if self.unencoded_byte_array_data_bytes.is_some() + || self.repetition_level_histogram.is_some() + || self.definition_level_histogram.is_some() + { + Some(SizeStatistics { + unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, + repetition_level_histogram: self.repetition_level_histogram.clone(), + definition_level_histogram: self.definition_level_histogram.clone(), + }) + } else { + None + }; + ColumnMetaData { type_: self.column_type().into(), encodings: self.encodings().iter().map(|&v| v.into()).collect(), @@ -843,11 +856,7 @@ impl ColumnChunkMetaData { .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, bloom_filter_length: self.bloom_filter_length, - size_statistics: Some(SizeStatistics { - unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, - repetition_level_histogram: self.repetition_level_histogram.clone(), - definition_level_histogram: self.definition_level_histogram.clone(), - }), + size_statistics, } } From a008e9ed6d8cb6667af3ee40d4b3880de2c5c9ee Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Jul 2024 10:11:16 -0700 Subject: [PATCH 26/44] add a little documentation --- parquet/src/file/metadata/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 7245e882dc20..86fb16361550 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -708,7 +708,8 @@ impl ColumnChunkMetaData { Some(offset..(offset + length)) } - /// Returns the number of bytes of variable length data after decoding + /// Returns the number of bytes of variable length data after decoding. + /// Only set for BYTE_ARRAY columns. 
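+    ///
+    /// For example, a chunk holding only the strings "a" and "bcd" would report
+    /// 4 bytes here; the four-byte length prefixes that PLAIN encoding adds are
+    /// not counted.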
pub fn unencoded_byte_array_data_bytes(&self) -> Option { self.unencoded_byte_array_data_bytes } From 87ccec21ab69bdb2f456d6c6019734f50c79494d Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Jul 2024 16:27:04 -0700 Subject: [PATCH 27/44] add ParquetOffsetIndex to avoid double read of OffsetIndex --- parquet/src/file/page_index/index_reader.rs | 28 ++++++------- parquet/src/file/page_index/mod.rs | 1 + parquet/src/file/page_index/offset_index.rs | 44 +++++++++++++++++++++ parquet/src/file/serialized_reader.rs | 18 +++++---- 4 files changed, 69 insertions(+), 22 deletions(-) create mode 100644 parquet/src/file/page_index/offset_index.rs diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 0915b3dcc6f5..fdd020ae2b77 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -27,6 +27,8 @@ use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use std::ops::Range; +use super::offset_index::ParquetOffsetIndex; + /// Computes the covering range of two optional ranges /// /// For example `acc_range(Some(7..9), Some(1..3)) = Some(1..9)` @@ -109,21 +111,21 @@ pub fn read_pages_locations( .collect() } -/// Reads [`OffsetIndex`], per-page [`unencoded_byte_array_data_bytes`] for all columns of a row -/// group. +/// Reads per-column [`ParquetOffsetIndex`] for all columns of a row group by +/// decoding [`OffsetIndex`] . /// -/// Returns a vector of `unencoded_byte_array_data_bytes[column_number][page_number]` +/// Returns a vector of `index[column_number]`. /// -/// Return an empty vector if this row group does not contain an -/// [`OffsetIndex]`. +/// Returns an empty vector if this row group does not contain an +/// [`OffsetIndex`]. /// -/// See [Column Index Documentation] for more details. +/// See [Offset Index Documentation] for more details. 
/// -/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -pub fn read_unencoded_byte_array_data_bytes( +/// [Offset Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +pub fn read_offset_indexes( reader: &R, chunks: &[ColumnChunkMetaData], -) -> Result>>, ParquetError> { +) -> Result, ParquetError> { let fetch = chunks .iter() .fold(None, |range, c| acc_range(range, c.offset_index_range())); @@ -139,18 +141,16 @@ pub fn read_unencoded_byte_array_data_bytes( chunks .iter() .map(|c| match c.offset_index_range() { - Some(r) => decode_unencoded_byte_array_data_bytes(get(r)), + Some(r) => decode_full_offset_index(get(r)), None => Err(general_err!("missing offset index")), }) .collect() } -pub(crate) fn decode_unencoded_byte_array_data_bytes( - data: &[u8], -) -> Result>, ParquetError> { +pub(crate) fn decode_full_offset_index(data: &[u8]) -> Result { let mut prot = TCompactSliceInputProtocol::new(data); let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; - Ok(offset.unencoded_byte_array_data_bytes) + ParquetOffsetIndex::try_new(offset) } pub(crate) fn decode_offset_index(data: &[u8]) -> Result, ParquetError> { diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index 9372645d76ee..a8077896db34 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -21,3 +21,4 @@ pub mod index; pub mod index_reader; +pub mod offset_index; diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs new file mode 100644 index 000000000000..25ac94d8eea8 --- /dev/null +++ b/parquet/src/file/page_index/offset_index.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
[`ParquetOffsetIndex`] structure holding decoded [`OffsetIndex`] information + +use crate::errors::ParquetError; +use crate::format::{OffsetIndex, PageLocation}; + +#[derive(Debug, Clone, PartialEq)] +pub struct ParquetOffsetIndex { + pub page_locations: Vec, + pub unencoded_byte_array_data_bytes: Option>, +} + +impl ParquetOffsetIndex { + pub(crate) fn try_new(index: OffsetIndex) -> Result { + Ok(Self { + page_locations: index.page_locations, + unencoded_byte_array_data_bytes: index.unencoded_byte_array_data_bytes, + }) + } + + pub fn page_locations(&self) -> &Vec { + &self.page_locations + } + + pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec> { + self.unencoded_byte_array_data_bytes.as_ref() + } +} diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 62a4dc49d7c0..65b6ebf2ec98 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -215,15 +215,17 @@ impl SerializedFileReader { for rg in &mut filtered_row_groups { let column_index = index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; - let offset_index = index_reader::read_pages_locations(&chunk_reader, rg.columns())?; - // TODO the following should be in an `OffsetIndex` struct along with the page - // locations. For now keeping them separate to avoid breaking API changes. - let unenc_bytes = index_reader::read_unencoded_byte_array_data_bytes( - &chunk_reader, - rg.columns(), - )?; + let offset_index = index_reader::read_offset_indexes(&chunk_reader, rg.columns())?; + + // split offset_index into two vectors to not break API + let mut page_locations = vec![]; + let mut unenc_bytes = vec![]; + offset_index.into_iter().for_each(|index| { + page_locations.push(index.page_locations); + unenc_bytes.push(index.unencoded_byte_array_data_bytes); + }); columns_indexes.push(column_index); - offset_indexes.push(offset_index); + offset_indexes.push(page_locations); unenc_byte_sizes.push(unenc_bytes); } From 3eead30b9bfd0a44bfdc23719260929a7f0fa165 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Jul 2024 16:34:47 -0700 Subject: [PATCH 28/44] cleanup --- parquet/src/file/page_index/index_reader.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index fdd020ae2b77..ab95fe8d70f9 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,13 +22,12 @@ use crate::data_type::Int96; use crate::errors::ParquetError; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{Index, NativeIndex}; +use crate::file::page_index::offset_index::ParquetOffsetIndex; use crate::file::reader::ChunkReader; use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use std::ops::Range; -use super::offset_index::ParquetOffsetIndex; - /// Computes the covering range of two optional ranges /// /// For example `acc_range(Some(7..9), Some(1..3)) = Some(1..9)` From ddf40c321248cdab58e2daa9f2030bd85988915c Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Jul 2024 16:44:09 -0700 Subject: [PATCH 29/44] use less verbose update of variable_length_bytes --- parquet/src/column/writer/encoder.rs | 3 +-- parquet/src/column/writer/mod.rs | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index fa3d3a3ba4d0..9d01c09040de 
100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -154,8 +154,7 @@ impl ColumnValueEncoderImpl { } if let Some(var_bytes) = T::T::variable_length_bytes(slice) { - self.variable_length_bytes = - Some(var_bytes + self.variable_length_bytes.unwrap_or(0)); + *self.variable_length_bytes.get_or_insert(0) += var_bytes; } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 54d4e2fa5295..0e4e4063936d 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -844,8 +844,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { }; if let Some(var_bytes) = values_data.variable_length_bytes { - self.column_metrics.variable_length_bytes = - Some(self.column_metrics.variable_length_bytes.unwrap_or(0) + var_bytes); + *self.column_metrics.variable_length_bytes.get_or_insert(0) += var_bytes; } // update column and offset index From 0ebb72f56678f382a227db0447108b9f54899d6c Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 5 Jul 2024 19:46:29 -0700 Subject: [PATCH 30/44] add some documentation --- parquet/src/file/page_index/offset_index.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 25ac94d8eea8..31305ba57080 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -20,6 +20,8 @@ use crate::errors::ParquetError; use crate::format::{OffsetIndex, PageLocation}; +/// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes for each page +/// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns. #[derive(Debug, Clone, PartialEq)] pub struct ParquetOffsetIndex { pub page_locations: Vec, @@ -27,6 +29,7 @@ pub struct ParquetOffsetIndex { } impl ParquetOffsetIndex { + /// Creates a new [`ParquetOffsetIndex`] from an [`OffsetIndex`]. pub(crate) fn try_new(index: OffsetIndex) -> Result { Ok(Self { page_locations: index.page_locations, @@ -34,10 +37,13 @@ impl ParquetOffsetIndex { }) } + /// Vector of [`PageLocation`] objects, one per page in the chunk. pub fn page_locations(&self) -> &Vec { &self.page_locations } + /// Optional vector of unencoded page sizes, one per page in the chunk. Only defined + /// for BYTE_ARRAY columns. pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec> { self.unencoded_byte_array_data_bytes.as_ref() } From 393aea19e76addcc9fd6f0d94715430c29ca4481 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 11 Jul 2024 10:52:55 -0700 Subject: [PATCH 31/44] update to latest thrift (as of 11 Jul 2024) from parquet-format --- parquet/regen.sh | 2 +- parquet/src/format.rs | 397 +++++++++++++++++++++++++++++++++++------- 2 files changed, 335 insertions(+), 64 deletions(-) diff --git a/parquet/regen.sh b/parquet/regen.sh index d1b82108a018..39999c7872cd 100755 --- a/parquet/regen.sh +++ b/parquet/regen.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
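
(Aside, not part of the patch series: a minimal standalone sketch of the `Option::get_or_insert` accumulation idiom adopted in PATCH 29 above. Starting from `None`, the first update inserts a zero default and every update then adds in place, which is equivalent to the older `Some(var_bytes + self.variable_length_bytes.unwrap_or(0))` form.)

fn main() {
    let mut variable_length_bytes: Option<i64> = None;
    for var_bytes in [7i64, 3, 5] {
        // inserts 0 on first use, then accumulates in place
        *variable_length_bytes.get_or_insert(0) += var_bytes;
    }
    assert_eq!(variable_length_bytes, Some(15));
}
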
-REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2 +REVISION=5b564f3c47679526cf72e54f207013f28f53acc4 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" diff --git a/parquet/src/format.rs b/parquet/src/format.rs index b210d6ec1b7e..c35f779057a6 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -117,12 +117,12 @@ impl ConvertedType { /// a list is converted into an optional field containing a repeated field for its /// values pub const LIST: ConvertedType = ConvertedType(3); - /// an enum is converted into a binary field + /// an enum is converted into a BYTE_ARRAY field pub const ENUM: ConvertedType = ConvertedType(4); /// A decimal value. /// - /// This may be used to annotate binary or fixed primitive types. The - /// underlying byte array stores the unscaled value encoded as two's + /// This may be used to annotate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY primitive + /// types. The underlying byte array stores the unscaled value encoded as two's /// complement using big-endian byte order (the most significant byte is the /// zeroth element). The value of the decimal is the value * 10^{-scale}. /// @@ -185,7 +185,7 @@ impl ConvertedType { pub const JSON: ConvertedType = ConvertedType(19); /// An embedded BSON document /// - /// A BSON document embedded within a single BINARY column. + /// A BSON document embedded within a single BYTE_ARRAY column. pub const BSON: ConvertedType = ConvertedType(20); /// An interval of time /// @@ -288,9 +288,9 @@ impl From<&ConvertedType> for i32 { pub struct FieldRepetitionType(pub i32); impl FieldRepetitionType { - /// This field is required (can not be null) and each record has exactly 1 value. + /// This field is required (can not be null) and each row has exactly 1 value. pub const REQUIRED: FieldRepetitionType = FieldRepetitionType(0); - /// The field is optional (can be null) and each record has 0 or 1 values. + /// The field is optional (can be null) and each row has 0 or 1 values. pub const OPTIONAL: FieldRepetitionType = FieldRepetitionType(1); /// The field is repeated and can contain 0 or more values pub const REPEATED: FieldRepetitionType = FieldRepetitionType(2); @@ -379,12 +379,15 @@ impl Encoding { pub const DELTA_BYTE_ARRAY: Encoding = Encoding(7); /// Dictionary encoding: the ids are encoded using the RLE encoding pub const RLE_DICTIONARY: Encoding = Encoding(8); - /// Encoding for floating-point data. + /// Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY). /// K byte-streams are created where K is the size in bytes of the data type. - /// The individual bytes of an FP value are scattered to the corresponding stream and + /// The individual bytes of a value are scattered to the corresponding stream and /// the streams are concatenated. /// This itself does not reduce the size of the data but can lead to better compression /// afterwards. + /// + /// Added in 2.8 for FLOAT and DOUBLE. + /// Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. pub const BYTE_STREAM_SPLIT: Encoding = Encoding(9); pub const ENUM_VALUES: &'static [Self] = &[ Self::PLAIN, @@ -634,6 +637,143 @@ impl From<&BoundaryOrder> for i32 { } } +// +// SizeStatistics +// + +/// A structure for capturing metadata for estimating the unencoded, +/// uncompressed size of data written. 
This is useful for readers to estimate +/// how much memory is needed to reconstruct data in their memory model and for +/// fine grained filter pushdown on nested structures (the histograms contained +/// in this structure can help determine the number of nulls at a particular +/// nesting level and maximum length of lists). +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct SizeStatistics { + /// The number of physical bytes stored for BYTE_ARRAY data values assuming + /// no encoding. This is exclusive of the bytes needed to store the length of + /// each byte array. In other words, this field is equivalent to the `(size + /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + /// written)`. To determine unencoded sizes of other types readers can use + /// schema information multiplied by the number of non-null and null values. + /// The number of null/non-null values can be inferred from the histograms + /// below. + /// + /// For example, if a column chunk is dictionary-encoded with dictionary + /// \["a", "bc", "cde"\], and a data page contains the indices \[0, 0, 1, 2\], + /// then this value for that data page should be 7 (1 + 1 + 2 + 3). + /// + /// This field should only be set for types that use BYTE_ARRAY as their + /// physical type. + pub unencoded_byte_array_data_bytes: Option, + /// When present, there is expected to be one element corresponding to each + /// repetition (i.e. size=max repetition_level+1) where each element + /// represents the number of times the repetition level was observed in the + /// data. + /// + /// This field may be omitted if max_repetition_level is 0 without loss + /// of information. + /// + pub repetition_level_histogram: Option>, + /// Same as repetition_level_histogram except for definition levels. + /// + /// This field may be omitted if max_definition_level is 0 or 1 without + /// loss of information. 
+ /// + pub definition_level_histogram: Option>, +} + +impl SizeStatistics { + pub fn new(unencoded_byte_array_data_bytes: F1, repetition_level_histogram: F2, definition_level_histogram: F3) -> SizeStatistics where F1: Into>, F2: Into>>, F3: Into>> { + SizeStatistics { + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), + repetition_level_histogram: repetition_level_histogram.into(), + definition_level_histogram: definition_level_histogram.into(), + } + } +} + +impl crate::thrift::TSerializable for SizeStatistics { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option> = None; + let mut f_3: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i64()?; + f_1 = Some(val); + }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_0 = i_prot.read_i64()?; + val.push(list_elem_0); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, + 3 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_1 = i_prot.read_i64()?; + val.push(list_elem_1); + } + i_prot.read_list_end()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = SizeStatistics { + unencoded_byte_array_data_bytes: f_1, + repetition_level_histogram: f_2, + definition_level_histogram: f_3, + }; + Ok(ret) + } + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("SizeStatistics"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::I64, 1))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.repetition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histogram", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histogram", TType::List, 3))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + // // Statistics // @@ -1123,7 +1263,7 @@ impl crate::thrift::TSerializable for NullType { /// To maintain forward-compatibility in v1, implementations using this logical /// type must also set scale and precision on the annotated SchemaElement. /// -/// Allowed for physical types: INT32, INT64, FIXED, and BINARY +/// Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY. 
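
(Aside, not part of the patch series: a self-contained sketch of what the regenerated `SizeStatistics` fields above carry, using the dictionary example from the field documentation. `SizeStatisticsSketch` is a local stand-in for illustration, not the generated type.)

#[derive(Debug)]
struct SizeStatisticsSketch {
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<Vec<i64>>,
    definition_level_histogram: Option<Vec<i64>>,
}

fn main() {
    // Dictionary ["a", "bc", "cde"] with a data page of indices [0, 0, 1, 2]:
    // the unencoded size is 1 + 1 + 2 + 3 = 7 bytes, as the docs describe.
    let dictionary = ["a", "bc", "cde"];
    let indices = [0usize, 0, 1, 2];
    let unencoded: i64 = indices.iter().map(|&i| dictionary[i].len() as i64).sum();

    let stats = SizeStatisticsSketch {
        unencoded_byte_array_data_bytes: Some(unencoded),
        // A flat required column has max_repetition_level == 0 and
        // max_definition_level == 0, so both histograms may be omitted
        // without loss of information.
        repetition_level_histogram: None,
        definition_level_histogram: None,
    };
    assert_eq!(stats.unencoded_byte_array_data_bytes, Some(7));
    println!("{stats:?}");
}
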
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DecimalType { pub scale: i32, @@ -1620,7 +1760,7 @@ impl crate::thrift::TSerializable for IntType { /// Embedded JSON logical type annotation /// -/// Allowed for physical types: BINARY +/// Allowed for physical types: BYTE_ARRAY #[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct JsonType { } @@ -1660,7 +1800,7 @@ impl crate::thrift::TSerializable for JsonType { /// Embedded BSON logical type annotation /// -/// Allowed for physical types: BINARY +/// Allowed for physical types: BYTE_ARRAY #[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct BsonType { } @@ -2146,7 +2286,12 @@ impl crate::thrift::TSerializable for SchemaElement { /// Data page header #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DataPageHeader { - /// Number of values, including NULLs, in this data page. * + /// Number of values, including NULLs, in this data page. + /// + /// If a OffsetIndex is present, a page must begin at a row + /// boundary (repetition_level = 0). Otherwise, pages may begin + /// within a row (repetition_level > 0). + /// pub num_values: i32, /// Encoding used for this data page * pub encoding: Encoding, @@ -2154,7 +2299,7 @@ pub struct DataPageHeader { pub definition_level_encoding: Encoding, /// Encoding used for repetition levels * pub repetition_level_encoding: Encoding, - /// Optional statistics for the data in this page* + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -2390,21 +2535,24 @@ pub struct DataPageHeaderV2 { /// Number of NULL values, in this data page. /// Number of non-null = num_values - num_nulls which is also the number of values in the data section * pub num_nulls: i32, - /// Number of rows in this data page. which means pages change on record boundaries (r = 0) * + /// Number of rows in this data page. Every page must begin at a + /// row boundary (repetition_level = 0): rows must **not** be + /// split across page boundaries when using V2 data pages. + /// pub num_rows: i32, /// Encoding used for data in this page * pub encoding: Encoding, - /// length of the definition levels + /// Length of the definition levels pub definition_levels_byte_length: i32, - /// length of the repetition levels + /// Length of the repetition levels pub repetition_levels_byte_length: i32, - /// whether the values are compressed. + /// Whether the values are compressed. /// Which means the section of the page between /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) /// is compressed with the compression_codec. /// If missing it is considered compressed pub is_compressed: Option, - /// optional statistics for the data in this page * + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -3207,10 +3355,10 @@ impl crate::thrift::TSerializable for KeyValue { // SortingColumn // -/// Wrapper struct to specify sort order +/// Sort order within a RowGroup of a leaf column #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct SortingColumn { - /// The column index (in this row group) * + /// The ordinal position of the column (in this row group) * pub column_idx: i32, /// If true, indicates this column is sorted in descending order. * pub descending: bool, @@ -3417,10 +3565,15 @@ pub struct ColumnMetaData { /// Writers should write this field so readers can read the bloom filter /// in a single I/O. 
pub bloom_filter_length: Option, + /// Optional statistics to help estimate total memory when converted to in-memory + /// representations. The histograms contained in these statistics can + /// also be useful in some cases for more fine-grained nullability/list length + /// filter pushdown. + pub size_statistics: Option, } impl ColumnMetaData { - pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into> { + pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15, size_statistics: F16) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into>, F16: Into> { ColumnMetaData { type_, encodings, @@ -3437,6 +3590,7 @@ impl ColumnMetaData { encoding_stats: encoding_stats.into(), bloom_filter_offset: bloom_filter_offset.into(), bloom_filter_length: bloom_filter_length.into(), + size_statistics: size_statistics.into(), } } } @@ -3459,6 +3613,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { let mut f_13: Option> = None; let mut f_14: Option = None; let mut f_15: Option = None; + let mut f_16: Option = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -3474,8 +3629,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_0 = Encoding::read_from_in_protocol(i_prot)?; - val.push(list_elem_0); + let list_elem_2 = Encoding::read_from_in_protocol(i_prot)?; + val.push(list_elem_2); } i_prot.read_list_end()?; f_2 = Some(val); @@ -3484,8 +3639,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_1 = i_prot.read_string()?; - val.push(list_elem_1); + let list_elem_3 = i_prot.read_string()?; + val.push(list_elem_3); } i_prot.read_list_end()?; f_3 = Some(val); @@ -3510,8 +3665,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_2 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_2); + let list_elem_4 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_4); } i_prot.read_list_end()?; f_8 = Some(val); @@ -3536,8 +3691,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_3 = PageEncodingStats::read_from_in_protocol(i_prot)?; - val.push(list_elem_3); + let list_elem_5 = PageEncodingStats::read_from_in_protocol(i_prot)?; + val.push(list_elem_5); } 
i_prot.read_list_end()?; f_13 = Some(val); @@ -3550,6 +3705,10 @@ impl crate::thrift::TSerializable for ColumnMetaData { let val = i_prot.read_i32()?; f_15 = Some(val); }, + 16 => { + let val = SizeStatistics::read_from_in_protocol(i_prot)?; + f_16 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -3581,6 +3740,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { encoding_stats: f_13, bloom_filter_offset: f_14, bloom_filter_length: f_15, + size_statistics: f_16, }; Ok(ret) } @@ -3662,6 +3822,11 @@ impl crate::thrift::TSerializable for ColumnMetaData { o_prot.write_i32(fld_var)?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.size_statistics { + o_prot.write_field_begin(&TFieldIdentifier::new("size_statistics", TType::Struct, 16))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -3741,8 +3906,8 @@ impl crate::thrift::TSerializable for EncryptionWithColumnKey { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_4 = i_prot.read_string()?; - val.push(list_elem_4); + let list_elem_6 = i_prot.read_string()?; + val.push(list_elem_6); } i_prot.read_list_end()?; f_1 = Some(val); @@ -3881,11 +4046,19 @@ pub struct ColumnChunk { /// metadata. This path is relative to the current file. /// pub file_path: Option, - /// Byte offset in file_path to the ColumnMetaData * + /// Deprecated: Byte offset in file_path to the ColumnMetaData + /// + /// Past use of this field has been inconsistent, with some implementations + /// using it to point to the ColumnMetaData and some using it to point to + /// the first page in the column chunk. In many cases, the ColumnMetaData at this + /// location is wrong. This field is now deprecated and should not be used. + /// Writers should set this field to 0 if no ColumnMetaData has been written outside + /// the footer. pub file_offset: i64, - /// Column metadata for this chunk. This is the same content as what is at - /// file_path/file_offset. Having it here has it replicated in the file - /// metadata. + /// Column metadata for this chunk. Some writers may also replicate this at the + /// location pointed to by file_path/file_offset. + /// Note: while marked as optional, this field is in fact required by most major + /// Parquet implementations. As such, writers MUST populate this field. /// pub meta_data: Option, /// File offset of ColumnChunk's OffsetIndex * @@ -4107,8 +4280,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_5 = ColumnChunk::read_from_in_protocol(i_prot)?; - val.push(list_elem_5); + let list_elem_7 = ColumnChunk::read_from_in_protocol(i_prot)?; + val.push(list_elem_7); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4125,8 +4298,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_6 = SortingColumn::read_from_in_protocol(i_prot)?; - val.push(list_elem_6); + let list_elem_8 = SortingColumn::read_from_in_protocol(i_prot)?; + val.push(list_elem_8); } i_prot.read_list_end()?; f_4 = Some(val); @@ -4331,8 +4504,9 @@ pub struct PageLocation { /// Size of the page, including header. 
Sum of compressed_page_size and header
   /// length
   pub compressed_page_size: i32,
-  /// Index within the RowGroup of the first row of the page; this means pages
-  /// change on record boundaries (r = 0).
+  /// Index within the RowGroup of the first row of the page. When an
+  /// OffsetIndex is present, pages must begin on row boundaries
+  /// (repetition_level = 0).
   pub first_row_index: i64,
 }

@@ -4409,17 +4583,28 @@ impl crate::thrift::TSerializable for PageLocation {
 //
 // OffsetIndex
 //

+/// Optional offsets for each data page in a ColumnChunk.
+///
+/// Forms part of the page index, along with ColumnIndex.
+///
+/// OffsetIndex may be present even if ColumnIndex is not.
 #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub struct OffsetIndex {
   /// PageLocations, ordered by increasing PageLocation.offset. It is required
   /// that page_locations\[i\].first_row_index < page_locations\[i+1\].first_row_index.
   pub page_locations: Vec<PageLocation>,
+  /// Unencoded/uncompressed size for BYTE_ARRAY types.
+  ///
+  /// See documentation for unencoded_byte_array_data_bytes in SizeStatistics for
+  /// more details on this field.
+  pub unencoded_byte_array_data_bytes: Option<Vec<i64>>,
 }

 impl OffsetIndex {
-  pub fn new(page_locations: Vec<PageLocation>) -> OffsetIndex {
+  pub fn new<F2>(page_locations: Vec<PageLocation>, unencoded_byte_array_data_bytes: F2) -> OffsetIndex where F2: Into<Option<Vec<i64>>> {
     OffsetIndex {
       page_locations,
+      unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(),
     }
   }
 }

@@ -4428,6 +4613,7 @@ impl crate::thrift::TSerializable for OffsetIndex {
   fn read_from_in_protocol<T: TInputProtocol>(i_prot: &mut T) -> thrift::Result<OffsetIndex> {
     i_prot.read_struct_begin()?;
     let mut f_1: Option<Vec<PageLocation>> = None;
+    let mut f_2: Option<Vec<i64>> = None;
     loop {
       let field_ident = i_prot.read_field_begin()?;
       if field_ident.field_type == TType::Stop {
@@ -4439,12 +4625,22 @@ impl crate::thrift::TSerializable for OffsetIndex {
           let list_ident = i_prot.read_list_begin()?;
           let mut val: Vec<PageLocation> = Vec::with_capacity(list_ident.size as usize);
           for _ in 0..list_ident.size {
-            let list_elem_7 = PageLocation::read_from_in_protocol(i_prot)?;
-            val.push(list_elem_7);
+            let list_elem_9 = PageLocation::read_from_in_protocol(i_prot)?;
+            val.push(list_elem_9);
           }
           i_prot.read_list_end()?;
           f_1 = Some(val);
         },
+        2 => {
+          let list_ident = i_prot.read_list_begin()?;
+          let mut val: Vec<i64> = Vec::with_capacity(list_ident.size as usize);
+          for _ in 0..list_ident.size {
+            let list_elem_10 = i_prot.read_i64()?;
+            val.push(list_elem_10);
+          }
+          i_prot.read_list_end()?;
+          f_2 = Some(val);
+        },
         _ => {
           i_prot.skip(field_ident.field_type)?;
         },
@@ -4455,6 +4651,7 @@ impl crate::thrift::TSerializable for OffsetIndex {
     verify_required_field_exists("OffsetIndex.page_locations", &f_1)?;
     let ret = OffsetIndex {
       page_locations: f_1.expect("auto-generated code should have checked for presence of required fields"),
+      unencoded_byte_array_data_bytes: f_2,
     };
     Ok(ret)
   }
@@ -4468,6 +4665,15 @@ impl crate::thrift::TSerializable for OffsetIndex {
     }
     o_prot.write_list_end()?;
     o_prot.write_field_end()?;
+    if let Some(ref fld_var) = self.unencoded_byte_array_data_bytes {
+      o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::List, 2))?;
+      o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?;
+      for e in fld_var {
+        o_prot.write_i64(*e)?;
+      }
+      o_prot.write_list_end()?;
+      o_prot.write_field_end()?
+    }
     o_prot.write_field_stop()?;
     o_prot.write_struct_end()
   }
 }

@@ -4477,8 +4683,14 @@ impl crate::thrift::TSerializable for OffsetIndex {
 //
 // ColumnIndex
 //

-/// Description for ColumnIndex.
-/// Each `<array-field>`\[i\] refers to the page at OffsetIndex.page_locations\[i\]
+/// Optional statistics for each data page in a ColumnChunk.
+///
+/// Forms part of the page index, along with OffsetIndex.
+///
+/// If this structure is present, OffsetIndex must also be present.
+///
+/// For each field in this structure, <array-field>\[i\] refers to the page at
+/// OffsetIndex.page_locations\[i\]
 #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub struct ColumnIndex {
   /// A list of Boolean values to determine the validity of the corresponding
   /// min and max values. If true, a page contains only null values, and writers
   /// have to set the corresponding entries in min_values and max_values to
   /// byte\[0\], so that all lists have the same length. If false, the
   /// corresponding entries in min_values and max_values must be valid.
   pub null_pages: Vec<bool>,
   /// Two lists containing lower and upper bounds for the values of each page
   /// determined by the ColumnOrder of the column. These may be the actual
   /// minimum and maximum values found on a page, but can also be (more compact)
   /// values that do not exist on a page. For example, instead of storing ""Blart
   /// Versenwald III", a writer may set min_values\[i\]="B", max_values\[i\]="C".
   /// Such more compact values must be valid values.
   /// All lists must have the same length.
   pub min_values: Vec<Vec<u8>>,
   pub max_values: Vec<Vec<u8>>,
   /// Stores whether both min_values and max_values are ordered and if so, in
   /// which direction. This allows readers to perform binary searches in both
   /// lists. Readers consuming the min/max values naively without checking this
   /// field are not affected.
   pub boundary_order: BoundaryOrder,
   /// A list containing the number of null values for each page *
   pub null_counts: Option<Vec<i64>>,
+  /// Contains repetition level histograms for each page
+  /// concatenated together. The repetition_level_histogram field on
+  /// SizeStatistics contains more details.
+  ///
+  /// When present the length should always be (number of pages *
+  /// (max_repetition_level + 1)) elements.
+  ///
+  /// Element 0 is the first element of the histogram for the first page.
+  /// Element (max_repetition_level + 1) is the first element of the histogram
+  /// for the second page.
+  ///
+  pub repetition_level_histograms: Option<Vec<i64>>,
+  /// Same as repetition_level_histograms except for definition levels.
+  ///
+  pub definition_level_histograms: Option<Vec<i64>>,
 }

 impl ColumnIndex {
-  pub fn new<F5>(null_pages: Vec<bool>, min_values: Vec<Vec<u8>>, max_values: Vec<Vec<u8>>, boundary_order: BoundaryOrder, null_counts: F5) -> ColumnIndex where F5: Into<Option<Vec<i64>>> {
+  pub fn new<F5, F6, F7>(null_pages: Vec<bool>, min_values: Vec<Vec<u8>>, max_values: Vec<Vec<u8>>, boundary_order: BoundaryOrder, null_counts: F5, repetition_level_histograms: F6, definition_level_histograms: F7) -> ColumnIndex where F5: Into<Option<Vec<i64>>>, F6: Into<Option<Vec<i64>>>, F7: Into<Option<Vec<i64>>> {
     ColumnIndex {
       null_pages,
       min_values,
       max_values,
       boundary_order,
       null_counts: null_counts.into(),
+      repetition_level_histograms: repetition_level_histograms.into(),
+      definition_level_histograms: definition_level_histograms.into(),
     }
   }
 }

@@ -4526,6 +4755,8 @@ impl crate::thrift::TSerializable for ColumnIndex {
     let mut f_3: Option<Vec<Vec<u8>>> = None;
     let mut f_4: Option<BoundaryOrder> = None;
     let mut f_5: Option<Vec<i64>> = None;
+    let mut f_6: Option<Vec<i64>> = None;
+    let mut f_7: Option<Vec<i64>> = None;
     loop {
       let field_ident = i_prot.read_field_begin()?;
       if field_ident.field_type == TType::Stop {
@@ -4537,8 +4768,8 @@ impl crate::thrift::TSerializable for ColumnIndex {
           let list_ident = i_prot.read_list_begin()?;
           let mut val: Vec<bool> = Vec::with_capacity(list_ident.size as usize);
           for _ in 0..list_ident.size {
-            let list_elem_8 = i_prot.read_bool()?;
-            val.push(list_elem_8);
+            let list_elem_11 = i_prot.read_bool()?;
+            val.push(list_elem_11);
           }
           i_prot.read_list_end()?;
           f_1 = Some(val);
         },
         2 => {
           let list_ident = i_prot.read_list_begin()?;
           let mut val: Vec<Vec<u8>> = Vec::with_capacity(list_ident.size as usize);
           for _ in 0..list_ident.size {
-            let list_elem_9 = i_prot.read_bytes()?;
-            val.push(list_elem_9);
+            let list_elem_12 = i_prot.read_bytes()?;
+            val.push(list_elem_12);
           }
           i_prot.read_list_end()?;
           f_2 = Some(val);
         },
         3 => {
           let list_ident = i_prot.read_list_begin()?;
           let mut val: Vec<Vec<u8>> = Vec::with_capacity(list_ident.size as usize);
           for _ in 0..list_ident.size {
-            let list_elem_10 = i_prot.read_bytes()?;
-            val.push(list_elem_10);
+            let list_elem_13 = i_prot.read_bytes()?;
+            val.push(list_elem_13);
           }
           i_prot.read_list_end()?;
           f_3 = Some(val);
         },
@@ -4571,12 +4802,32 @@ impl
crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_11 = i_prot.read_i64()?; - val.push(list_elem_11); + let list_elem_14 = i_prot.read_i64()?; + val.push(list_elem_14); } i_prot.read_list_end()?; f_5 = Some(val); }, + 6 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_15 = i_prot.read_i64()?; + val.push(list_elem_15); + } + i_prot.read_list_end()?; + f_6 = Some(val); + }, + 7 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_16 = i_prot.read_i64()?; + val.push(list_elem_16); + } + i_prot.read_list_end()?; + f_7 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4594,6 +4845,8 @@ impl crate::thrift::TSerializable for ColumnIndex { max_values: f_3.expect("auto-generated code should have checked for presence of required fields"), boundary_order: f_4.expect("auto-generated code should have checked for presence of required fields"), null_counts: f_5, + repetition_level_histograms: f_6, + definition_level_histograms: f_7, }; Ok(ret) } @@ -4633,6 +4886,24 @@ impl crate::thrift::TSerializable for ColumnIndex { o_prot.write_list_end()?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.repetition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histograms", TType::List, 6))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histograms", TType::List, 7))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? 
+ } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4992,8 +5263,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_12 = SchemaElement::read_from_in_protocol(i_prot)?; - val.push(list_elem_12); + let list_elem_17 = SchemaElement::read_from_in_protocol(i_prot)?; + val.push(list_elem_17); } i_prot.read_list_end()?; f_2 = Some(val); @@ -5006,8 +5277,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_13 = RowGroup::read_from_in_protocol(i_prot)?; - val.push(list_elem_13); + let list_elem_18 = RowGroup::read_from_in_protocol(i_prot)?; + val.push(list_elem_18); } i_prot.read_list_end()?; f_4 = Some(val); @@ -5016,8 +5287,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_14 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_14); + let list_elem_19 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_19); } i_prot.read_list_end()?; f_5 = Some(val); @@ -5030,8 +5301,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_15 = ColumnOrder::read_from_in_protocol(i_prot)?; - val.push(list_elem_15); + let list_elem_20 = ColumnOrder::read_from_in_protocol(i_prot)?; + val.push(list_elem_20); } i_prot.read_list_end()?; f_7 = Some(val); From 1c12fb8c216b8eaf2c44f76e0fc8579d742d078c Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 11 Jul 2024 11:02:22 -0700 Subject: [PATCH 32/44] pass None for optional size statistics --- parquet/src/file/metadata/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 40922d52bfd4..278d1e464e94 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -790,6 +790,7 @@ impl ColumnChunkMetaData { .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, bloom_filter_length: self.bloom_filter_length, + size_statistics: None, } } @@ -1004,6 +1005,8 @@ impl ColumnIndexBuilder { self.max_values, self.boundary_order, self.null_counts, + None, + None, ) } } @@ -1052,7 +1055,7 @@ impl OffsetIndexBuilder { .zip(self.first_row_index_array.iter()) .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index)) .collect::>(); - OffsetIndex::new(locations) + OffsetIndex::new(locations, None) } } From 53cd5fad05608f201159b17bbe75762722a74536 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 11 Jul 2024 11:05:30 -0700 Subject: [PATCH 33/44] escape HTML tags --- parquet/src/format.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/format.rs b/parquet/src/format.rs index c35f779057a6..5074901aebf8 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -4689,7 +4689,7 @@ impl crate::thrift::TSerializable for OffsetIndex { /// /// If this structure is present, OffsetIndex must also be present. 
 ///
-/// For each field in this structure, <array-field>\[i\] refers to the page at
+/// For each field in this structure, `<array-field>`\[i\] refers to the page at
 /// OffsetIndex.page_locations\[i\]
 #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub struct ColumnIndex {

From 98025ccf1fda4cdf68cd9b1231f6a309f5c96814 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 11 Jul 2024 14:25:44 -0700
Subject: [PATCH 34/44] don't need to escape brackets in arrays

---
 parquet/src/format.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index 5074901aebf8..6c93097b7359 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -659,7 +659,7 @@ pub struct SizeStatistics {
   /// below.
   ///
   /// For example, if a column chunk is dictionary-encoded with dictionary
-  /// \["a", "bc", "cde"\], and a data page contains the indices \[0, 0, 1, 2\],
+  /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
   /// then this value for that data page should be 7 (1 + 1 + 2 + 3).
   ///
   /// This field should only be set for types that use BYTE_ARRAY as their

From 65096ddc2d28043e78bdb365cdb998ea62d32122 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 11 Jul 2024 14:47:27 -0700
Subject: [PATCH 35/44] use consistent naming

---
 parquet/src/arrow/arrow_writer/byte_array.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs
index e9538ffda817..7d081e0ac479 100644
--- a/parquet/src/arrow/arrow_writer/byte_array.rs
+++ b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -274,7 +274,7 @@ impl FallbackEncoder {
             }
         };

-        let var_bytes = Some(self.variable_length_bytes);
+        let variable_length_bytes = Some(self.variable_length_bytes);
         self.variable_length_bytes = 0;

         Ok(DataPageValues {
@@ -283,7 +283,7 @@ impl FallbackEncoder {
             encoding,
             min_value,
             max_value,
-            variable_length_bytes: var_bytes,
+            variable_length_bytes,
         })
     }
 }
@@ -395,7 +395,7 @@ impl DictEncoder {

         self.indices.clear();

-        let var_bytes = Some(self.variable_length_bytes);
+        let variable_length_bytes = Some(self.variable_length_bytes);
         self.variable_length_bytes = 0;

         DataPageValues {
@@ -404,7 +404,7 @@ impl DictEncoder {
             encoding: Encoding::RLE_DICTIONARY,
             min_value,
             max_value,
-            variable_length_bytes: var_bytes,
+            variable_length_bytes,
         }
     }
 }

From 08065ada187c53a524bcffcdedb9124e13c5da92 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 11 Jul 2024 15:33:42 -0700
Subject: [PATCH 36/44] suggested doc changes

---
 parquet/src/file/metadata/mod.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index 86fb16361550..edfed9fd82b8 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -709,7 +709,9 @@ impl ColumnChunkMetaData {
     }

     /// Returns the number of bytes of variable length data after decoding.
-    /// Only set for BYTE_ARRAY columns.
+    ///
+    /// Only set for BYTE_ARRAY columns. This field may not be set by older
+    /// writers.
     pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
         self.unencoded_byte_array_data_bytes
     }
@@ -717,6 +719,7 @@ impl ColumnChunkMetaData {
     /// Returns the repetition level histogram.
     ///
     /// The returned value `vec[i]` is how many values are at repetition level `i`.
+    /// This field may not be set by older writers.
pub fn repetition_level_histogram(&self) -> Option<&Vec<i64>> {
         self.repetition_level_histogram.as_ref()
     }

@@ -724,6 +727,7 @@ impl ColumnChunkMetaData {
     /// Returns the definition level histogram.
     ///
     /// The returned value `vec[i]` is how many values are at definition level `i`.
+    /// This field may not be set by older writers.
     pub fn definition_level_histogram(&self) -> Option<&Vec<i64>> {
         self.definition_level_histogram.as_ref()
     }

From 1cbd4b76951e2300dd8eb3f35f03f58ac5869a57 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 11 Jul 2024 15:50:57 -0700
Subject: [PATCH 37/44] more suggested doc changes

---
 parquet/src/file/page_index/index.rs        |  7 +++++++
 parquet/src/file/page_index/index_reader.rs | 12 ++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index 7ad45c2e1000..6c81d4d1df03 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -42,8 +42,15 @@ pub struct PageIndex<T> {
     /// Null values in the page
     pub null_count: Option<i64>,
     /// Repetition level histogram for the page
+    ///
+    /// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`.
+    /// For example, `repetition_level_histogram[0]` indicates how many rows the page contains.
    pub repetition_level_histogram: Option<Vec<i64>>,
     /// Definition level histogram for the page
+    ///
+    /// `definition_level_histogram[i]` is a count of how many values are at definition level `i`.
+    /// For example, `definition_level_histogram[max_definition_level]` indicates how many
+    /// non-null values are present in the page.
    pub definition_level_histogram: Option<Vec<i64>>,
 }

diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs
index ab95fe8d70f9..7358c9626b03 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -46,9 +46,9 @@ pub(crate) fn acc_range(a: Option<Range<usize>>, b: Option<Range<usize>>) -> Opt
 /// Returns an empty vector if this row group does not contain a
 /// [`ColumnIndex`].
 ///
-/// See [Column Index Documentation] for more details.
+/// See [Page Index Documentation] for more details.
 ///
-/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub fn read_columns_indexes<R: ChunkReader>(
     reader: &R,
     chunks: &[ColumnChunkMetaData],
@@ -82,9 +82,9 @@ pub fn read_columns_indexes<R: ChunkReader>(
 /// Return an empty vector if this row group does not contain an
 /// [`OffsetIndex]`.
 ///
-/// See [Column Index Documentation] for more details.
+/// See [Page Index Documentation] for more details.
 ///
-/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub fn read_pages_locations<R: ChunkReader>(
     reader: &R,
     chunks: &[ColumnChunkMetaData],
@@ -118,9 +118,9 @@ pub fn read_pages_locations<R: ChunkReader>(
 /// Returns an empty vector if this row group does not contain an
 /// [`OffsetIndex`].
 ///
-/// See [Offset Index Documentation] for more details.
+/// See [Page Index Documentation] for more details.
/// -/// [Offset Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub fn read_offset_indexes( reader: &R, chunks: &[ColumnChunkMetaData], From dce3513d15a9eeb0e45ec56bf0ee13773c5c8899 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 11 Jul 2024 16:21:10 -0700 Subject: [PATCH 38/44] use more asserts in tests --- parquet/src/file/writer.rs | 91 ++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 52 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 06d3ebd9f645..232b197e51d5 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1877,21 +1877,22 @@ mod tests { assert_eq!(def_hist[1], 7); }; - if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { - assert!(meta_data.size_statistics.is_some()); - if let Some(ref size_stats) = meta_data.size_statistics { - assert!(size_stats.repetition_level_histogram.is_none()); - assert!(size_stats.definition_level_histogram.is_some()); - assert!(size_stats.unencoded_byte_array_data_bytes.is_some()); - assert_eq!( - unenc_size, - size_stats.unencoded_byte_array_data_bytes.unwrap_or(0) - ); - if let Some(ref def_hist) = size_stats.definition_level_histogram { - check_def_hist(def_hist) - } - } - } + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let meta_data = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap(); + assert!(meta_data.size_statistics.is_some()); + let size_stats = meta_data.size_statistics.as_ref().unwrap(); + + assert!(size_stats.repetition_level_histogram.is_none()); + assert!(size_stats.definition_level_histogram.is_some()); + assert!(size_stats.unencoded_byte_array_data_bytes.is_some()); + assert_eq!( + unenc_size, + size_stats.unencoded_byte_array_data_bytes.unwrap() + ); + check_def_hist(size_stats.definition_level_histogram.as_ref().unwrap()); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); @@ -1906,12 +1907,10 @@ mod tests { assert!(column.definition_level_histogram().is_some()); assert!(column.repetition_level_histogram().is_none()); assert!(column.unencoded_byte_array_data_bytes().is_some()); - if let Some(def_hist) = column.definition_level_histogram() { - check_def_hist(def_hist) - } + check_def_hist(column.definition_level_histogram().unwrap()); assert_eq!( unenc_size, - column.unencoded_byte_array_data_bytes().unwrap_or(0) + column.unencoded_byte_array_data_bytes().unwrap() ); // check histogram in column index as well @@ -1927,9 +1926,8 @@ mod tests { }; assert!(col_idx.repetition_level_histogram().is_none()); - if let Some(def_hist) = col_idx.definition_level_histogram() { - check_def_hist(def_hist) - } + assert!(col_idx.definition_level_histogram().is_some()); + check_def_hist(col_idx.definition_level_histogram().unwrap()); assert!(reader .metadata() @@ -1939,10 +1937,9 @@ mod tests { assert_eq!(unenc_sizes.len(), 1); assert_eq!(unenc_sizes[0].len(), 1); assert!(unenc_sizes[0][0].is_some()); - if let Some(page_sizes) = &unenc_sizes[0][0] { - assert_eq!(page_sizes.len(), 1); - assert_eq!(page_sizes[0], unenc_size); - } + let page_sizes = unenc_sizes[0][0].as_ref().unwrap(); + assert_eq!(page_sizes.len(), 1); + assert_eq!(page_sizes[0], unenc_size); } #[test] @@ -2004,20 +2001,18 @@ mod tests { // check that histograms are set properly in the write and read metadata // also check 
that unencoded_byte_array_data_bytes is not set - if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data { - assert!(meta_data.size_statistics.is_some()); - if let Some(ref size_stats) = meta_data.size_statistics { - assert!(size_stats.repetition_level_histogram.is_some()); - assert!(size_stats.definition_level_histogram.is_some()); - assert!(size_stats.unencoded_byte_array_data_bytes.is_none()); - if let Some(ref def_hist) = size_stats.definition_level_histogram { - check_def_hist(def_hist) - } - if let Some(ref rep_hist) = size_stats.repetition_level_histogram { - check_rep_hist(rep_hist) - } - } - } + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let meta_data = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap(); + assert!(meta_data.size_statistics.is_some()); + let size_stats = meta_data.size_statistics.as_ref().unwrap(); + assert!(size_stats.repetition_level_histogram.is_some()); + assert!(size_stats.definition_level_histogram.is_some()); + assert!(size_stats.unencoded_byte_array_data_bytes.is_none()); + check_def_hist(size_stats.definition_level_histogram.as_ref().unwrap()); + check_rep_hist(size_stats.repetition_level_histogram.as_ref().unwrap()); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); @@ -2032,12 +2027,8 @@ mod tests { assert!(column.definition_level_histogram().is_some()); assert!(column.repetition_level_histogram().is_some()); assert!(column.unencoded_byte_array_data_bytes().is_none()); - if let Some(def_hist) = column.definition_level_histogram() { - check_def_hist(def_hist) - } - if let Some(rep_hist) = column.repetition_level_histogram() { - check_rep_hist(rep_hist) - } + check_def_hist(column.definition_level_histogram().unwrap()); + check_rep_hist(column.repetition_level_histogram().unwrap()); // check histogram in column index as well assert!(reader.metadata().column_index().is_some()); @@ -2051,12 +2042,8 @@ mod tests { unreachable!() }; - if let Some(def_hist) = col_idx.definition_level_histogram() { - check_def_hist(def_hist) - } - if let Some(rep_hist) = col_idx.repetition_level_histogram() { - check_rep_hist(rep_hist) - } + check_def_hist(col_idx.definition_level_histogram().unwrap()); + check_rep_hist(col_idx.repetition_level_histogram().unwrap()); assert!(reader .metadata() From f6618394c7a745c80a4c0c83f9007ea86e3e024b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 11 Jul 2024 20:45:25 -0700 Subject: [PATCH 39/44] move histogram logic into PageMetrics and ColumnMetrics still needs more documentation --- parquet/src/column/writer/mod.rs | 265 +++++++++++++++++++------------ 1 file changed, 162 insertions(+), 103 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 0e4e4063936d..39e7074cfd9f 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -183,8 +183,18 @@ pub struct ColumnCloseResult { pub offset_index: Option, } +/// Creates vector to hold level histogram data. Length will be `max_level + 1`. +/// Because histograms are not necessary when `max_level == 0`, this will return +/// `None` in that case. 
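
(Aside, not part of the patch series: a standalone sketch of the level-histogram representation that the helper below manages — one bucket per level, `max_level + 1` buckets in total, where bucket `i` counts how many level values equal `i`. With `max_level == 0` every value would land in bucket 0, so no histogram needs to be kept at all.)

fn main() {
    let max_level: i16 = 2;
    // one bucket per level, max_level + 1 buckets in total
    let mut histogram = vec![0i64; max_level as usize + 1];
    for &level in &[0i16, 2, 2, 1, 0] {
        histogram[level as usize] += 1;
    }
    // two values at level 0, one at level 1, two at level 2
    assert_eq!(histogram, vec![2, 1, 2]);
}
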
+fn new_histogram(max_level: i16) -> Option> { + if max_level > 0 { + Some(vec![0; max_level as usize + 1]) + } else { + None + } +} + // Metrics per page -#[derive(Default)] struct PageMetrics { num_buffered_values: u32, num_buffered_rows: u32, @@ -193,6 +203,66 @@ struct PageMetrics { definition_level_histogram: Option>, } +impl PageMetrics { + pub fn new() -> Self { + PageMetrics { + num_buffered_values: 0, + num_buffered_rows: 0, + num_page_nulls: 0, + repetition_level_histogram: None, + definition_level_histogram: None, + } + } + + /// Initialize the repetition level histogram + pub fn with_repetition_level_histogram(mut self, max_level: i16) -> Self { + self.repetition_level_histogram = new_histogram(max_level); + self + } + + /// Initialize the definition level histogram + pub fn with_definition_level_histogram(mut self, max_level: i16) -> Self { + self.definition_level_histogram = new_histogram(max_level); + self + } + + /// Resets the state of this `PageMetrics` to the initial state + /// + /// If histograms have are defined their contents will be reset to zero. + pub fn new_page(&mut self) { + self.num_buffered_values = 0; + self.num_buffered_rows = 0; + self.num_page_nulls = 0; + if let Some(ref mut hist) = self.repetition_level_histogram { + for v in hist { + *v = 0 + } + } + if let Some(ref mut hist) = self.definition_level_histogram { + for v in hist { + *v = 0 + } + } + } + + /// FIXME docs! + pub fn update_repetition_level_histogram(&mut self, levels: &[i16]) { + if let Some(ref mut rep_hist) = self.repetition_level_histogram { + for &level in levels { + rep_hist[level as usize] += 1; + } + } + } + + pub fn update_definition_level_histogram(&mut self, levels: &[i16]) { + if let Some(ref mut def_hist) = self.definition_level_histogram { + for &level in levels { + def_hist[level as usize] += 1; + } + } + } +} + // Metrics per column writer struct ColumnMetrics { total_bytes_written: u64, @@ -211,6 +281,67 @@ struct ColumnMetrics { definition_level_histogram: Option>, } +impl ColumnMetrics { + pub fn new() -> Self { + ColumnMetrics { + total_bytes_written: 0, + total_rows_written: 0, + total_uncompressed_size: 0, + total_compressed_size: 0, + total_num_values: 0, + dictionary_page_offset: None, + data_page_offset: None, + min_column_value: None, + max_column_value: None, + num_column_nulls: 0, + column_distinct_count: None, + variable_length_bytes: None, + repetition_level_histogram: None, + definition_level_histogram: None, + } + } + + /// Initialize the repetition level histogram + pub fn with_repetition_level_histogram(mut self, max_level: i16) -> Self { + self.repetition_level_histogram = new_histogram(max_level); + self + } + + /// Initialize the definition level histogram + pub fn with_definition_level_histogram(mut self, max_level: i16) -> Self { + self.definition_level_histogram = new_histogram(max_level); + self + } + + /// FIXME docs + pub fn update_from_page_metrics(&mut self, page_metrics: &PageMetrics) { + if page_metrics.definition_level_histogram.is_some() + && self.definition_level_histogram.is_some() + { + let chunk_hist = self.definition_level_histogram.as_mut().unwrap(); + let page_hist = page_metrics.definition_level_histogram.as_ref().unwrap(); + for i in 0..page_hist.len() { + chunk_hist[i] += page_hist[i] + } + } + if page_metrics.repetition_level_histogram.is_some() + && self.repetition_level_histogram.is_some() + { + let chunk_hist = self.repetition_level_histogram.as_mut().unwrap(); + let page_hist = 
page_metrics.repetition_level_histogram.as_ref().unwrap(); + for i in 0..page_hist.len() { + chunk_hist[i] += page_hist[i] + } + } + } + + pub fn update_variable_length_bytes(&mut self, variable_length_bytes: &Option) { + if let Some(var_bytes) = variable_length_bytes { + *self.variable_length_bytes.get_or_insert(0) += var_bytes; + } + } +} + /// Typed column writer for a primitive column. pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a, ColumnValueEncoderImpl>; @@ -265,18 +396,18 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Used for level information encodings.insert(Encoding::RLE); - // histogram data is only collected if there is more than a single level and if - // page or chunk statistics are being collected - let new_histogram_vec = |max_level| { - if statistics_enabled == EnabledStatistics::None || max_level == 0 { - None - } else { - Some(vec![0; max_level as usize + 1]) - } - }; + let mut page_metrics = PageMetrics::new(); + let mut column_metrics = ColumnMetrics::::new(); - let max_rep_level = descr.max_rep_level(); - let max_def_level = descr.max_def_level(); + // Initialize level histograms if collecting page or chunk statistics + if statistics_enabled != EnabledStatistics::None { + page_metrics = page_metrics + .with_repetition_level_histogram(descr.max_rep_level()) + .with_definition_level_histogram(descr.max_def_level()); + column_metrics = column_metrics + .with_repetition_level_histogram(descr.max_rep_level()) + .with_definition_level_histogram(descr.max_def_level()) + } Self { descr, @@ -289,29 +420,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { def_levels_sink: vec![], rep_levels_sink: vec![], data_pages: VecDeque::new(), - page_metrics: PageMetrics { - num_buffered_values: 0, - num_buffered_rows: 0, - num_page_nulls: 0, - repetition_level_histogram: new_histogram_vec(max_rep_level), - definition_level_histogram: new_histogram_vec(max_def_level), - }, - column_metrics: ColumnMetrics { - total_bytes_written: 0, - total_rows_written: 0, - total_uncompressed_size: 0, - total_compressed_size: 0, - total_num_values: 0, - dictionary_page_offset: None, - data_page_offset: None, - min_column_value: None, - max_column_value: None, - num_column_nulls: 0, - column_distinct_count: None, - variable_length_bytes: None, - repetition_level_histogram: new_histogram_vec(max_rep_level), - definition_level_histogram: new_histogram_vec(max_def_level), - }, + page_metrics, + column_metrics, column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), encodings, @@ -552,29 +662,18 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { })?; let mut values_to_write = 0; - - let mut process_def_level = |level| { + for &level in levels { if level == self.descr.max_def_level() { values_to_write += 1; } else { // We must always compute this as it is used to populate v2 pages - self.page_metrics.num_page_nulls += 1; - } - }; - - if let Some(ref mut def_hist) = self.page_metrics.definition_level_histogram { - // Count values and update histogram - for &level in levels { - process_def_level(level); - def_hist[level as usize] += 1; - } - } else { - // Count values - for &level in levels { - process_def_level(level); + self.page_metrics.num_page_nulls += 1 } } + // Update histogram + self.page_metrics.update_definition_level_histogram(levels); + self.def_levels_sink.extend_from_slice(levels); values_to_write } else { @@ -598,19 +697,14 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { )); } - if 
let Some(ref mut rep_hist) = self.page_metrics.repetition_level_histogram { - // Count the occasions where we start a new row and update histogram - for &level in levels { - self.page_metrics.num_buffered_rows += (level == 0) as u32; - rep_hist[level as usize] += 1; - } - } else { - // Count the occasions where we start a new row - for &level in levels { - self.page_metrics.num_buffered_rows += (level == 0) as u32 - } + // Count the occasions where we start a new row + for &level in levels { + self.page_metrics.num_buffered_rows += (level == 0) as u32 } + // Update histogram + self.page_metrics.update_repetition_level_histogram(levels); + self.rep_levels_sink.extend_from_slice(levels); } else { // Each value is exactly one row. @@ -843,48 +937,17 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { _ => None, }; - if let Some(var_bytes) = values_data.variable_length_bytes { - *self.column_metrics.variable_length_bytes.get_or_insert(0) += var_bytes; - } - // update column and offset index self.update_column_offset_index( page_statistics.as_ref(), values_data.variable_length_bytes, ); - // collect page histograms into chunk histograms and zero out page histograms - // TODO(ets): This could instead just add the vectors, and then allow page_metrics to be reset - // below. Would then need to recreate the histogram vectors, so `new_histogram_vec` above - // would need to become a function. - if let Some(ref mut page_hist) = self.page_metrics.repetition_level_histogram { - if let Some(ref mut chunk_hist) = self.column_metrics.repetition_level_histogram { - assert_eq!(chunk_hist.len(), page_hist.len()); - for i in 0..page_hist.len() { - chunk_hist[i] += page_hist[i]; - page_hist[i] = 0; - } - } else { - // this should never be reached, but zero out histogram just in case - for v in page_hist { - *v = 0; - } - } - } - if let Some(ref mut page_hist) = self.page_metrics.definition_level_histogram { - if let Some(ref mut chunk_hist) = self.column_metrics.definition_level_histogram { - assert_eq!(chunk_hist.len(), page_hist.len()); - for i in 0..page_hist.len() { - chunk_hist[i] += page_hist[i]; - page_hist[i] = 0; - } - } else { - // this should never be reached, but zero out histogram just in case - for v in page_hist { - *v = 0; - } - } - } + // Update histograms and variable_length_bytes in column_metrics + self.column_metrics + .update_from_page_metrics(&self.page_metrics); + self.column_metrics + .update_variable_length_bytes(&values_data.variable_length_bytes); let page_statistics = page_statistics.map(Statistics::from); @@ -989,11 +1052,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Reset state. 
         self.rep_levels_sink.clear();
         self.def_levels_sink.clear();
-
-        // don't clobber histogram vectors
-        self.page_metrics.num_buffered_values = 0;
-        self.page_metrics.num_buffered_rows = 0;
-        self.page_metrics.num_page_nulls = 0;
+        self.page_metrics.new_page();
 
         Ok(())
     }

From 818a61456d226a8f0c4131bcc5dcc9aa695daf30 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 12 Jul 2024 09:31:29 -0700
Subject: [PATCH 40/44] refactor some to reduce code duplication, finish docs

---
 parquet/src/column/writer/mod.rs | 92 +++++++++++++++++---------------
 1 file changed, 48 insertions(+), 44 deletions(-)

diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 39e7074cfd9f..050da34586f1 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -183,7 +183,7 @@ pub struct ColumnCloseResult {
     pub offset_index: Option<OffsetIndex>,
 }
 
-/// Creates vector to hold level histogram data. Length will be `max_level + 1`.
+/// Creates a vector to hold level histogram data. Length will be `max_level + 1`.
 /// Because histograms are not necessary when `max_level == 0`, this will return
 /// `None` in that case.
 fn new_histogram(max_level: i16) -> Option<Vec<i64>> {
@@ -194,6 +194,15 @@ fn new_histogram(max_level: i16) -> Option<Vec<i64>> {
     }
}
 
+/// Sums `page_histogram` into `chunk_histogram`. Does nothing if either histogram is `None`.
+fn update_histogram(chunk_histogram: &mut Option<Vec<i64>>, page_histogram: &Option<Vec<i64>>) {
+    if let (Some(chunk_hist), Some(page_hist)) = (chunk_histogram, page_histogram) {
+        for i in 0..page_hist.len() {
+            chunk_hist[i] += page_hist[i];
+        }
+    }
+}
+
 // Metrics per page
 struct PageMetrics {
     num_buffered_values: u32,
@@ -204,7 +215,7 @@ struct PageMetrics {
 }
 
 impl PageMetrics {
-    pub fn new() -> Self {
+    fn new() -> Self {
         PageMetrics {
             num_buffered_values: 0,
             num_buffered_rows: 0,
@@ -215,38 +226,38 @@ impl PageMetrics {
     }
 
     /// Initialize the repetition level histogram
-    pub fn with_repetition_level_histogram(mut self, max_level: i16) -> Self {
+    fn with_repetition_level_histogram(mut self, max_level: i16) -> Self {
         self.repetition_level_histogram = new_histogram(max_level);
         self
     }
 
     /// Initialize the definition level histogram
-    pub fn with_definition_level_histogram(mut self, max_level: i16) -> Self {
+    fn with_definition_level_histogram(mut self, max_level: i16) -> Self {
         self.definition_level_histogram = new_histogram(max_level);
         self
     }
 
-    /// Resets the state of this `PageMetrics` to the initial state
-    ///
-    /// If histograms have are defined their contents will be reset to zero.
-    pub fn new_page(&mut self) {
-        self.num_buffered_values = 0;
-        self.num_buffered_rows = 0;
-        self.num_page_nulls = 0;
-        if let Some(ref mut hist) = self.repetition_level_histogram {
-            for v in hist {
-                *v = 0
-            }
-        }
-        if let Some(ref mut hist) = self.definition_level_histogram {
+    /// Sets all elements of `histogram` to 0
+    fn reset_histogram(histogram: &mut Option<Vec<i64>>) {
+        if let Some(ref mut hist) = histogram {
             for v in hist {
                 *v = 0
             }
         }
     }
 
-    /// FIXME docs!
-    pub fn update_repetition_level_histogram(&mut self, levels: &[i16]) {
+    /// Resets the state of this `PageMetrics` to the initial state.
+    /// If histograms have been initialized their contents will be reset to zero.
+    fn new_page(&mut self) {
+        self.num_buffered_values = 0;
+        self.num_buffered_rows = 0;
+        self.num_page_nulls = 0;
+        PageMetrics::reset_histogram(&mut self.repetition_level_histogram);
+        PageMetrics::reset_histogram(&mut self.definition_level_histogram);
+    }
+
+    /// Updates histogram values using provided repetition levels
+    fn update_repetition_level_histogram(&mut self, levels: &[i16]) {
         if let Some(ref mut rep_hist) = self.repetition_level_histogram {
             for &level in levels {
                 rep_hist[level as usize] += 1;
@@ -254,7 +265,8 @@ impl PageMetrics {
         }
     }
 
-    pub fn update_definition_level_histogram(&mut self, levels: &[i16]) {
+    /// Updates histogram values using provided definition levels
+    fn update_definition_level_histogram(&mut self, levels: &[i16]) {
         if let Some(ref mut def_hist) = self.definition_level_histogram {
             for &level in levels {
                 def_hist[level as usize] += 1;
@@ -282,7 +294,7 @@ struct ColumnMetrics<T: Default> {
 }
 
 impl<T: Default> ColumnMetrics<T> {
-    pub fn new() -> Self {
+    fn new() -> Self {
         ColumnMetrics {
             total_bytes_written: 0,
             total_rows_written: 0,
@@ -302,40 +314,32 @@ impl<T: Default> ColumnMetrics<T> {
     }
 
     /// Initialize the repetition level histogram
-    pub fn with_repetition_level_histogram(mut self, max_level: i16) -> Self {
+    fn with_repetition_level_histogram(mut self, max_level: i16) -> Self {
         self.repetition_level_histogram = new_histogram(max_level);
         self
     }
 
     /// Initialize the definition level histogram
-    pub fn with_definition_level_histogram(mut self, max_level: i16) -> Self {
+    fn with_definition_level_histogram(mut self, max_level: i16) -> Self {
         self.definition_level_histogram = new_histogram(max_level);
         self
     }
 
-    /// FIXME docs
-    pub fn update_from_page_metrics(&mut self, page_metrics: &PageMetrics) {
-        if page_metrics.definition_level_histogram.is_some()
-            && self.definition_level_histogram.is_some()
-        {
-            let chunk_hist = self.definition_level_histogram.as_mut().unwrap();
-            let page_hist = page_metrics.definition_level_histogram.as_ref().unwrap();
-            for i in 0..page_hist.len() {
-                chunk_hist[i] += page_hist[i]
-            }
-        }
-        if page_metrics.repetition_level_histogram.is_some()
-            && self.repetition_level_histogram.is_some()
-        {
-            let chunk_hist = self.repetition_level_histogram.as_mut().unwrap();
-            let page_hist = page_metrics.repetition_level_histogram.as_ref().unwrap();
-            for i in 0..page_hist.len() {
-                chunk_hist[i] += page_hist[i]
-            }
-        }
+    /// Sums the provided `PageMetrics` histograms into the chunk histograms. Does nothing if
+    /// the page histograms are not initialized.
+    fn update_from_page_metrics(&mut self, page_metrics: &PageMetrics) {
+        update_histogram(
+            &mut self.definition_level_histogram,
+            &page_metrics.definition_level_histogram,
+        );
+        update_histogram(
+            &mut self.repetition_level_histogram,
+            &page_metrics.repetition_level_histogram,
+        );
     }
 
-    pub fn update_variable_length_bytes(&mut self, variable_length_bytes: &Option<i64>) {
+    /// Sums the provided page `variable_length_bytes` into the chunk `variable_length_bytes`
+    fn update_variable_length_bytes(&mut self, variable_length_bytes: &Option<i64>) {
         if let Some(var_bytes) = variable_length_bytes {
             *self.variable_length_bytes.get_or_insert(0) += var_bytes;
         }

From c391dec664261fa3dd8f4a88ad7c0ff217a8c666 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 12 Jul 2024 10:07:32 -0700
Subject: [PATCH 41/44] account for new size statistics in heap size calculations

---
 parquet/src/file/metadata/memory.rs | 3 +++
 parquet/src/file/metadata/mod.rs    | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs
index 57b2f7eec0c2..22ee228bcf40 100644
--- a/parquet/src/file/metadata/memory.rs
+++ b/parquet/src/file/metadata/memory.rs
@@ -97,6 +97,9 @@ impl HeapSize for ColumnChunkMetaData {
             + self.compression.heap_size()
             + self.statistics.heap_size()
             + self.encoding_stats.heap_size()
+            + self.unencoded_byte_array_data_bytes.heap_size()
+            + self.repetition_level_histogram.heap_size()
+            + self.definition_level_histogram.heap_size()
     }
 }
 
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index edfed9fd82b8..592ccd62177f 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -214,6 +214,7 @@ impl ParquetMetaData {
             + self.row_groups.heap_size()
             + self.column_index.heap_size()
             + self.offset_index.heap_size()
+            + self.unencoded_byte_array_data_bytes.heap_size()
     }
 
     /// Override the column index
@@ -1439,10 +1440,10 @@ mod tests {
                 vec![PageLocation::new(1, 2, 3)],
                 vec![PageLocation::new(1, 2, 3)],
             ]]),
-            None,
+            Some(vec![vec![Some(vec![10, 20, 30])]]),
         );
 
-        let bigger_expected_size = 2776;
+        let bigger_expected_size = 2848;
         // more set fields means more memory usage
         assert!(bigger_expected_size > base_expected_size);
         assert_eq!(parquet_meta.memory_size(), bigger_expected_size);

From 4816a95ae6cd501f786565d41e9dfb1ad31bfeb2 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 12 Jul 2024 10:19:27 -0700
Subject: [PATCH 42/44] add histogram examples to docs

---
 parquet/src/file/metadata/mod.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index 592ccd62177f..e2afff24ae64 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -719,7 +719,8 @@ impl ColumnChunkMetaData {
 
     /// Returns the repetition level histogram.
     ///
-    /// The returned value `vec[i]` is how many values are at repetition level `i`.
+    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
+    /// `vec[0]` indicates how many rows the column chunk contains.
     /// This field may not be set by older writers.
     pub fn repetition_level_histogram(&self) -> Option<&Vec<i64>> {
         self.repetition_level_histogram.as_ref()
@@ -727,7 +728,8 @@ impl ColumnChunkMetaData {
 
     /// Returns the definition level histogram.
     ///
-    /// The returned value `vec[i]` is how many values are at definition level `i`.
+    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
+    /// `vec[max_definition_level]` indicates how many non-null values are present in the column chunk.
     /// This field may not be set by older writers.
     pub fn definition_level_histogram(&self) -> Option<&Vec<i64>> {
         self.definition_level_histogram.as_ref()

From d92ae20ee700426b965e0e0398a3f36ac94f1e4c Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Sat, 13 Jul 2024 17:26:15 -0700
Subject: [PATCH 43/44] add some fixmes

---
 parquet/src/file/page_index/index.rs        | 4 ++++
 parquet/src/file/page_index/offset_index.rs | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index 6c81d4d1df03..afdc0635279b 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -41,6 +41,10 @@ pub struct PageIndex<T> {
     pub max: Option<T>,
     /// Null values in the page
     pub null_count: Option<i64>,
+
+    // FIXME(ets): histograms should be stored in NativeIndex, and these should then be
+    // slices of the histogram vector. That will make conversion back to thrift much easier.
+
     /// Repetition level histogram for the page
     ///
     /// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`.
diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs
index 31305ba57080..aefaab5f91fc 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -47,4 +47,6 @@ impl ParquetOffsetIndex {
     pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec<i64>> {
         self.unencoded_byte_array_data_bytes.as_ref()
     }
+
+    // FIXME(ets): need to add a `to_offset_index` method to convert back to thrift
 }

From 69dd652284470260ba55c2a9103829f36ea7ffef Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Mon, 15 Jul 2024 13:49:36 -0700
Subject: [PATCH 44/44] leave note to self

---
 parquet/src/file/page_index/index.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index afdc0635279b..072e311d7623 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -42,8 +42,9 @@ pub struct PageIndex<T> {
     /// Null values in the page
     pub null_count: Option<i64>,
 
-    // FIXME(ets): histograms should be stored in NativeIndex, and these should then be
+    // NOTE: histograms could be stored in NativeIndex, and these could then be
     // slices of the histogram vector. That will make conversion back to thrift much easier.
+    // Never mind... this would require propagating lifetimes up the metadata tree.
 
     /// Repetition level histogram for the page
     ///
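
For reviewers, a minimal sketch of how the size statistics added by this series might be consumed (illustrative only, not part of any patch). It uses the `repetition_level_histogram` and `definition_level_histogram` getters documented in PATCH 42, plus an assumed `unencoded_byte_array_data_bytes` getter paired with the `ColumnChunkMetaData` field accounted for in PATCH 41, read through the crate's existing `SerializedFileReader`:

```rust
use std::fs::File;

use parquet::file::reader::{FileReader, SerializedFileReader};

fn print_size_statistics(path: &str) -> Result<(), Box<dyn std::error::Error>> {
    let reader = SerializedFileReader::new(File::open(path)?)?;
    for row_group in reader.metadata().row_groups() {
        for column in row_group.columns() {
            // Bucket 0 of the repetition level histogram counts level-0 values,
            // i.e. row starts, so it equals the chunk's row count.
            if let Some(rep_hist) = column.repetition_level_histogram() {
                println!("rows: {}", rep_hist[0]);
            }
            // The last bucket counts values at max_definition_level,
            // i.e. the non-null values in the chunk.
            if let Some(def_hist) = column.definition_level_histogram() {
                println!("non-null values: {}", def_hist[def_hist.len() - 1]);
            }
            // Unencoded BYTE_ARRAY byte total, useful for sizing decode buffers.
            // Assumed getter; only the underlying field appears in this series.
            if let Some(bytes) = column.unencoded_byte_array_data_bytes() {
                println!("unencoded byte array data: {bytes} bytes");
            }
        }
    }
    Ok(())
}
```

The bucket arithmetic mirrors the writer side: `update_histogram` in PATCH 40 adds page buckets into chunk buckets, so a chunk-level histogram is the exact elementwise sum of its pages' histograms.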