From 393aea19e76addcc9fd6f0d94715430c29ca4481 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 11 Jul 2024 10:52:55 -0700 Subject: [PATCH 1/4] update to latest thrift (as of 11 Jul 2024) from parquet-format --- parquet/regen.sh | 2 +- parquet/src/format.rs | 397 +++++++++++++++++++++++++++++++++++------- 2 files changed, 335 insertions(+), 64 deletions(-) diff --git a/parquet/regen.sh b/parquet/regen.sh index d1b82108a018..39999c7872cd 100755 --- a/parquet/regen.sh +++ b/parquet/regen.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2 +REVISION=5b564f3c47679526cf72e54f207013f28f53acc4 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" diff --git a/parquet/src/format.rs b/parquet/src/format.rs index b210d6ec1b7e..c35f779057a6 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -117,12 +117,12 @@ impl ConvertedType { /// a list is converted into an optional field containing a repeated field for its /// values pub const LIST: ConvertedType = ConvertedType(3); - /// an enum is converted into a binary field + /// an enum is converted into a BYTE_ARRAY field pub const ENUM: ConvertedType = ConvertedType(4); /// A decimal value. /// - /// This may be used to annotate binary or fixed primitive types. The - /// underlying byte array stores the unscaled value encoded as two's + /// This may be used to annotate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY primitive + /// types. The underlying byte array stores the unscaled value encoded as two's /// complement using big-endian byte order (the most significant byte is the /// zeroth element). The value of the decimal is the value * 10^{-scale}. /// @@ -185,7 +185,7 @@ impl ConvertedType { pub const JSON: ConvertedType = ConvertedType(19); /// An embedded BSON document /// - /// A BSON document embedded within a single BINARY column. + /// A BSON document embedded within a single BYTE_ARRAY column. pub const BSON: ConvertedType = ConvertedType(20); /// An interval of time /// @@ -288,9 +288,9 @@ impl From<&ConvertedType> for i32 { pub struct FieldRepetitionType(pub i32); impl FieldRepetitionType { - /// This field is required (can not be null) and each record has exactly 1 value. + /// This field is required (can not be null) and each row has exactly 1 value. pub const REQUIRED: FieldRepetitionType = FieldRepetitionType(0); - /// The field is optional (can be null) and each record has 0 or 1 values. + /// The field is optional (can be null) and each row has 0 or 1 values. pub const OPTIONAL: FieldRepetitionType = FieldRepetitionType(1); /// The field is repeated and can contain 0 or more values pub const REPEATED: FieldRepetitionType = FieldRepetitionType(2); @@ -379,12 +379,15 @@ impl Encoding { pub const DELTA_BYTE_ARRAY: Encoding = Encoding(7); /// Dictionary encoding: the ids are encoded using the RLE encoding pub const RLE_DICTIONARY: Encoding = Encoding(8); - /// Encoding for floating-point data. + /// Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY). /// K byte-streams are created where K is the size in bytes of the data type. - /// The individual bytes of an FP value are scattered to the corresponding stream and + /// The individual bytes of a value are scattered to the corresponding stream and /// the streams are concatenated. /// This itself does not reduce the size of the data but can lead to better compression /// afterwards. + /// + /// Added in 2.8 for FLOAT and DOUBLE. + /// Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. pub const BYTE_STREAM_SPLIT: Encoding = Encoding(9); pub const ENUM_VALUES: &'static [Self] = &[ Self::PLAIN, @@ -634,6 +637,143 @@ impl From<&BoundaryOrder> for i32 { } } +// +// SizeStatistics +// + +/// A structure for capturing metadata for estimating the unencoded, +/// uncompressed size of data written. This is useful for readers to estimate +/// how much memory is needed to reconstruct data in their memory model and for +/// fine grained filter pushdown on nested structures (the histograms contained +/// in this structure can help determine the number of nulls at a particular +/// nesting level and maximum length of lists). +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct SizeStatistics { + /// The number of physical bytes stored for BYTE_ARRAY data values assuming + /// no encoding. This is exclusive of the bytes needed to store the length of + /// each byte array. In other words, this field is equivalent to the `(size + /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + /// written)`. To determine unencoded sizes of other types readers can use + /// schema information multiplied by the number of non-null and null values. + /// The number of null/non-null values can be inferred from the histograms + /// below. + /// + /// For example, if a column chunk is dictionary-encoded with dictionary + /// \["a", "bc", "cde"\], and a data page contains the indices \[0, 0, 1, 2\], + /// then this value for that data page should be 7 (1 + 1 + 2 + 3). + /// + /// This field should only be set for types that use BYTE_ARRAY as their + /// physical type. + pub unencoded_byte_array_data_bytes: Option, + /// When present, there is expected to be one element corresponding to each + /// repetition (i.e. size=max repetition_level+1) where each element + /// represents the number of times the repetition level was observed in the + /// data. + /// + /// This field may be omitted if max_repetition_level is 0 without loss + /// of information. + /// + pub repetition_level_histogram: Option>, + /// Same as repetition_level_histogram except for definition levels. + /// + /// This field may be omitted if max_definition_level is 0 or 1 without + /// loss of information. + /// + pub definition_level_histogram: Option>, +} + +impl SizeStatistics { + pub fn new(unencoded_byte_array_data_bytes: F1, repetition_level_histogram: F2, definition_level_histogram: F3) -> SizeStatistics where F1: Into>, F2: Into>>, F3: Into>> { + SizeStatistics { + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), + repetition_level_histogram: repetition_level_histogram.into(), + definition_level_histogram: definition_level_histogram.into(), + } + } +} + +impl crate::thrift::TSerializable for SizeStatistics { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option> = None; + let mut f_3: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i64()?; + f_1 = Some(val); + }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_0 = i_prot.read_i64()?; + val.push(list_elem_0); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, + 3 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_1 = i_prot.read_i64()?; + val.push(list_elem_1); + } + i_prot.read_list_end()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = SizeStatistics { + unencoded_byte_array_data_bytes: f_1, + repetition_level_histogram: f_2, + definition_level_histogram: f_3, + }; + Ok(ret) + } + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("SizeStatistics"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::I64, 1))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.repetition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histogram", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histogram", TType::List, 3))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + // // Statistics // @@ -1123,7 +1263,7 @@ impl crate::thrift::TSerializable for NullType { /// To maintain forward-compatibility in v1, implementations using this logical /// type must also set scale and precision on the annotated SchemaElement. /// -/// Allowed for physical types: INT32, INT64, FIXED, and BINARY +/// Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY. #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DecimalType { pub scale: i32, @@ -1620,7 +1760,7 @@ impl crate::thrift::TSerializable for IntType { /// Embedded JSON logical type annotation /// -/// Allowed for physical types: BINARY +/// Allowed for physical types: BYTE_ARRAY #[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct JsonType { } @@ -1660,7 +1800,7 @@ impl crate::thrift::TSerializable for JsonType { /// Embedded BSON logical type annotation /// -/// Allowed for physical types: BINARY +/// Allowed for physical types: BYTE_ARRAY #[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct BsonType { } @@ -2146,7 +2286,12 @@ impl crate::thrift::TSerializable for SchemaElement { /// Data page header #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DataPageHeader { - /// Number of values, including NULLs, in this data page. * + /// Number of values, including NULLs, in this data page. + /// + /// If a OffsetIndex is present, a page must begin at a row + /// boundary (repetition_level = 0). Otherwise, pages may begin + /// within a row (repetition_level > 0). + /// pub num_values: i32, /// Encoding used for this data page * pub encoding: Encoding, @@ -2154,7 +2299,7 @@ pub struct DataPageHeader { pub definition_level_encoding: Encoding, /// Encoding used for repetition levels * pub repetition_level_encoding: Encoding, - /// Optional statistics for the data in this page* + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -2390,21 +2535,24 @@ pub struct DataPageHeaderV2 { /// Number of NULL values, in this data page. /// Number of non-null = num_values - num_nulls which is also the number of values in the data section * pub num_nulls: i32, - /// Number of rows in this data page. which means pages change on record boundaries (r = 0) * + /// Number of rows in this data page. Every page must begin at a + /// row boundary (repetition_level = 0): rows must **not** be + /// split across page boundaries when using V2 data pages. + /// pub num_rows: i32, /// Encoding used for data in this page * pub encoding: Encoding, - /// length of the definition levels + /// Length of the definition levels pub definition_levels_byte_length: i32, - /// length of the repetition levels + /// Length of the repetition levels pub repetition_levels_byte_length: i32, - /// whether the values are compressed. + /// Whether the values are compressed. /// Which means the section of the page between /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) /// is compressed with the compression_codec. /// If missing it is considered compressed pub is_compressed: Option, - /// optional statistics for the data in this page * + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -3207,10 +3355,10 @@ impl crate::thrift::TSerializable for KeyValue { // SortingColumn // -/// Wrapper struct to specify sort order +/// Sort order within a RowGroup of a leaf column #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct SortingColumn { - /// The column index (in this row group) * + /// The ordinal position of the column (in this row group) * pub column_idx: i32, /// If true, indicates this column is sorted in descending order. * pub descending: bool, @@ -3417,10 +3565,15 @@ pub struct ColumnMetaData { /// Writers should write this field so readers can read the bloom filter /// in a single I/O. pub bloom_filter_length: Option, + /// Optional statistics to help estimate total memory when converted to in-memory + /// representations. The histograms contained in these statistics can + /// also be useful in some cases for more fine-grained nullability/list length + /// filter pushdown. + pub size_statistics: Option, } impl ColumnMetaData { - pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into> { + pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15, size_statistics: F16) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into>, F16: Into> { ColumnMetaData { type_, encodings, @@ -3437,6 +3590,7 @@ impl ColumnMetaData { encoding_stats: encoding_stats.into(), bloom_filter_offset: bloom_filter_offset.into(), bloom_filter_length: bloom_filter_length.into(), + size_statistics: size_statistics.into(), } } } @@ -3459,6 +3613,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { let mut f_13: Option> = None; let mut f_14: Option = None; let mut f_15: Option = None; + let mut f_16: Option = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -3474,8 +3629,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_0 = Encoding::read_from_in_protocol(i_prot)?; - val.push(list_elem_0); + let list_elem_2 = Encoding::read_from_in_protocol(i_prot)?; + val.push(list_elem_2); } i_prot.read_list_end()?; f_2 = Some(val); @@ -3484,8 +3639,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_1 = i_prot.read_string()?; - val.push(list_elem_1); + let list_elem_3 = i_prot.read_string()?; + val.push(list_elem_3); } i_prot.read_list_end()?; f_3 = Some(val); @@ -3510,8 +3665,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_2 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_2); + let list_elem_4 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_4); } i_prot.read_list_end()?; f_8 = Some(val); @@ -3536,8 +3691,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_3 = PageEncodingStats::read_from_in_protocol(i_prot)?; - val.push(list_elem_3); + let list_elem_5 = PageEncodingStats::read_from_in_protocol(i_prot)?; + val.push(list_elem_5); } i_prot.read_list_end()?; f_13 = Some(val); @@ -3550,6 +3705,10 @@ impl crate::thrift::TSerializable for ColumnMetaData { let val = i_prot.read_i32()?; f_15 = Some(val); }, + 16 => { + let val = SizeStatistics::read_from_in_protocol(i_prot)?; + f_16 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -3581,6 +3740,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { encoding_stats: f_13, bloom_filter_offset: f_14, bloom_filter_length: f_15, + size_statistics: f_16, }; Ok(ret) } @@ -3662,6 +3822,11 @@ impl crate::thrift::TSerializable for ColumnMetaData { o_prot.write_i32(fld_var)?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.size_statistics { + o_prot.write_field_begin(&TFieldIdentifier::new("size_statistics", TType::Struct, 16))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -3741,8 +3906,8 @@ impl crate::thrift::TSerializable for EncryptionWithColumnKey { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_4 = i_prot.read_string()?; - val.push(list_elem_4); + let list_elem_6 = i_prot.read_string()?; + val.push(list_elem_6); } i_prot.read_list_end()?; f_1 = Some(val); @@ -3881,11 +4046,19 @@ pub struct ColumnChunk { /// metadata. This path is relative to the current file. /// pub file_path: Option, - /// Byte offset in file_path to the ColumnMetaData * + /// Deprecated: Byte offset in file_path to the ColumnMetaData + /// + /// Past use of this field has been inconsistent, with some implementations + /// using it to point to the ColumnMetaData and some using it to point to + /// the first page in the column chunk. In many cases, the ColumnMetaData at this + /// location is wrong. This field is now deprecated and should not be used. + /// Writers should set this field to 0 if no ColumnMetaData has been written outside + /// the footer. pub file_offset: i64, - /// Column metadata for this chunk. This is the same content as what is at - /// file_path/file_offset. Having it here has it replicated in the file - /// metadata. + /// Column metadata for this chunk. Some writers may also replicate this at the + /// location pointed to by file_path/file_offset. + /// Note: while marked as optional, this field is in fact required by most major + /// Parquet implementations. As such, writers MUST populate this field. /// pub meta_data: Option, /// File offset of ColumnChunk's OffsetIndex * @@ -4107,8 +4280,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_5 = ColumnChunk::read_from_in_protocol(i_prot)?; - val.push(list_elem_5); + let list_elem_7 = ColumnChunk::read_from_in_protocol(i_prot)?; + val.push(list_elem_7); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4125,8 +4298,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_6 = SortingColumn::read_from_in_protocol(i_prot)?; - val.push(list_elem_6); + let list_elem_8 = SortingColumn::read_from_in_protocol(i_prot)?; + val.push(list_elem_8); } i_prot.read_list_end()?; f_4 = Some(val); @@ -4331,8 +4504,9 @@ pub struct PageLocation { /// Size of the page, including header. Sum of compressed_page_size and header /// length pub compressed_page_size: i32, - /// Index within the RowGroup of the first row of the page; this means pages - /// change on record boundaries (r = 0). + /// Index within the RowGroup of the first row of the page. When an + /// OffsetIndex is present, pages must begin on row boundaries + /// (repetition_level = 0). pub first_row_index: i64, } @@ -4409,17 +4583,28 @@ impl crate::thrift::TSerializable for PageLocation { // OffsetIndex // +/// Optional offsets for each data page in a ColumnChunk. +/// +/// Forms part of the page index, along with ColumnIndex. +/// +/// OffsetIndex may be present even if ColumnIndex is not. #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct OffsetIndex { /// PageLocations, ordered by increasing PageLocation.offset. It is required /// that page_locations\[i\].first_row_index < page_locations\[i+1\].first_row_index. pub page_locations: Vec, + /// Unencoded/uncompressed size for BYTE_ARRAY types. + /// + /// See documention for unencoded_byte_array_data_bytes in SizeStatistics for + /// more details on this field. + pub unencoded_byte_array_data_bytes: Option>, } impl OffsetIndex { - pub fn new(page_locations: Vec) -> OffsetIndex { + pub fn new(page_locations: Vec, unencoded_byte_array_data_bytes: F2) -> OffsetIndex where F2: Into>> { OffsetIndex { page_locations, + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), } } } @@ -4428,6 +4613,7 @@ impl crate::thrift::TSerializable for OffsetIndex { fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; + let mut f_2: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -4439,12 +4625,22 @@ impl crate::thrift::TSerializable for OffsetIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_7 = PageLocation::read_from_in_protocol(i_prot)?; - val.push(list_elem_7); + let list_elem_9 = PageLocation::read_from_in_protocol(i_prot)?; + val.push(list_elem_9); } i_prot.read_list_end()?; f_1 = Some(val); }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_10 = i_prot.read_i64()?; + val.push(list_elem_10); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4455,6 +4651,7 @@ impl crate::thrift::TSerializable for OffsetIndex { verify_required_field_exists("OffsetIndex.page_locations", &f_1)?; let ret = OffsetIndex { page_locations: f_1.expect("auto-generated code should have checked for presence of required fields"), + unencoded_byte_array_data_bytes: f_2, }; Ok(ret) } @@ -4468,6 +4665,15 @@ impl crate::thrift::TSerializable for OffsetIndex { } o_prot.write_list_end()?; o_prot.write_field_end()?; + if let Some(ref fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4477,8 +4683,14 @@ impl crate::thrift::TSerializable for OffsetIndex { // ColumnIndex // -/// Description for ColumnIndex. -/// Each ``\[i\] refers to the page at OffsetIndex.page_locations\[i\] +/// Optional statistics for each data page in a ColumnChunk. +/// +/// Forms part the page index, along with OffsetIndex. +/// +/// If this structure is present, OffsetIndex must also be present. +/// +/// For each field in this structure, \[i\] refers to the page at +/// OffsetIndex.page_locations\[i\] #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ColumnIndex { /// A list of Boolean values to determine the validity of the corresponding @@ -4504,16 +4716,33 @@ pub struct ColumnIndex { pub boundary_order: BoundaryOrder, /// A list containing the number of null values for each page * pub null_counts: Option>, + /// Contains repetition level histograms for each page + /// concatenated together. The repetition_level_histogram field on + /// SizeStatistics contains more details. + /// + /// When present the length should always be (number of pages * + /// (max_repetition_level + 1)) elements. + /// + /// Element 0 is the first element of the histogram for the first page. + /// Element (max_repetition_level + 1) is the first element of the histogram + /// for the second page. + /// + pub repetition_level_histograms: Option>, + /// Same as repetition_level_histograms except for definitions levels. + /// + pub definition_level_histograms: Option>, } impl ColumnIndex { - pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5) -> ColumnIndex where F5: Into>> { + pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5, repetition_level_histograms: F6, definition_level_histograms: F7) -> ColumnIndex where F5: Into>>, F6: Into>>, F7: Into>> { ColumnIndex { null_pages, min_values, max_values, boundary_order, null_counts: null_counts.into(), + repetition_level_histograms: repetition_level_histograms.into(), + definition_level_histograms: definition_level_histograms.into(), } } } @@ -4526,6 +4755,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let mut f_3: Option>> = None; let mut f_4: Option = None; let mut f_5: Option> = None; + let mut f_6: Option> = None; + let mut f_7: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -4537,8 +4768,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_8 = i_prot.read_bool()?; - val.push(list_elem_8); + let list_elem_11 = i_prot.read_bool()?; + val.push(list_elem_11); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4547,8 +4778,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_9 = i_prot.read_bytes()?; - val.push(list_elem_9); + let list_elem_12 = i_prot.read_bytes()?; + val.push(list_elem_12); } i_prot.read_list_end()?; f_2 = Some(val); @@ -4557,8 +4788,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_10 = i_prot.read_bytes()?; - val.push(list_elem_10); + let list_elem_13 = i_prot.read_bytes()?; + val.push(list_elem_13); } i_prot.read_list_end()?; f_3 = Some(val); @@ -4571,12 +4802,32 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_11 = i_prot.read_i64()?; - val.push(list_elem_11); + let list_elem_14 = i_prot.read_i64()?; + val.push(list_elem_14); } i_prot.read_list_end()?; f_5 = Some(val); }, + 6 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_15 = i_prot.read_i64()?; + val.push(list_elem_15); + } + i_prot.read_list_end()?; + f_6 = Some(val); + }, + 7 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_16 = i_prot.read_i64()?; + val.push(list_elem_16); + } + i_prot.read_list_end()?; + f_7 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4594,6 +4845,8 @@ impl crate::thrift::TSerializable for ColumnIndex { max_values: f_3.expect("auto-generated code should have checked for presence of required fields"), boundary_order: f_4.expect("auto-generated code should have checked for presence of required fields"), null_counts: f_5, + repetition_level_histograms: f_6, + definition_level_histograms: f_7, }; Ok(ret) } @@ -4633,6 +4886,24 @@ impl crate::thrift::TSerializable for ColumnIndex { o_prot.write_list_end()?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.repetition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histograms", TType::List, 6))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histograms", TType::List, 7))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4992,8 +5263,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_12 = SchemaElement::read_from_in_protocol(i_prot)?; - val.push(list_elem_12); + let list_elem_17 = SchemaElement::read_from_in_protocol(i_prot)?; + val.push(list_elem_17); } i_prot.read_list_end()?; f_2 = Some(val); @@ -5006,8 +5277,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_13 = RowGroup::read_from_in_protocol(i_prot)?; - val.push(list_elem_13); + let list_elem_18 = RowGroup::read_from_in_protocol(i_prot)?; + val.push(list_elem_18); } i_prot.read_list_end()?; f_4 = Some(val); @@ -5016,8 +5287,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_14 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_14); + let list_elem_19 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_19); } i_prot.read_list_end()?; f_5 = Some(val); @@ -5030,8 +5301,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_15 = ColumnOrder::read_from_in_protocol(i_prot)?; - val.push(list_elem_15); + let list_elem_20 = ColumnOrder::read_from_in_protocol(i_prot)?; + val.push(list_elem_20); } i_prot.read_list_end()?; f_7 = Some(val); From 1c12fb8c216b8eaf2c44f76e0fc8579d742d078c Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 11 Jul 2024 11:02:22 -0700 Subject: [PATCH 2/4] pass None for optional size statistics --- parquet/src/file/metadata/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 40922d52bfd4..278d1e464e94 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -790,6 +790,7 @@ impl ColumnChunkMetaData { .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, bloom_filter_length: self.bloom_filter_length, + size_statistics: None, } } @@ -1004,6 +1005,8 @@ impl ColumnIndexBuilder { self.max_values, self.boundary_order, self.null_counts, + None, + None, ) } } @@ -1052,7 +1055,7 @@ impl OffsetIndexBuilder { .zip(self.first_row_index_array.iter()) .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index)) .collect::>(); - OffsetIndex::new(locations) + OffsetIndex::new(locations, None) } } From 53cd5fad05608f201159b17bbe75762722a74536 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 11 Jul 2024 11:05:30 -0700 Subject: [PATCH 3/4] escape HTML tags --- parquet/src/format.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/format.rs b/parquet/src/format.rs index c35f779057a6..5074901aebf8 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -4689,7 +4689,7 @@ impl crate::thrift::TSerializable for OffsetIndex { /// /// If this structure is present, OffsetIndex must also be present. /// -/// For each field in this structure, \[i\] refers to the page at +/// For each field in this structure, ``\[i\] refers to the page at /// OffsetIndex.page_locations\[i\] #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ColumnIndex { From 98025ccf1fda4cdf68cd9b1231f6a309f5c96814 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 11 Jul 2024 14:25:44 -0700 Subject: [PATCH 4/4] don't need to escape brackets in arrays --- parquet/src/format.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 5074901aebf8..6c93097b7359 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -659,7 +659,7 @@ pub struct SizeStatistics { /// below. /// /// For example, if a column chunk is dictionary-encoded with dictionary - /// \["a", "bc", "cde"\], and a data page contains the indices \[0, 0, 1, 2\], + /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], /// then this value for that data page should be 7 (1 + 1 + 2 + 3). /// /// This field should only be set for types that use BYTE_ARRAY as their