From 0175d530b3a23cf2c6934ee676ced1b683537733 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 31 Jul 2024 09:09:16 -0700
Subject: [PATCH 1/3] add rough equivalence test

---
 parquet/src/file/writer.rs | 52 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 2b22dd743f48..49bceb7ced70 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -2195,6 +2195,31 @@ mod tests {
         metadata.finish()
     }
 
+    fn check_columns_are_equivalent(left: &ColumnChunkMetaData, right: &ColumnChunkMetaData) {
+        assert_eq!(left.column_descr(), right.column_descr());
+        assert_eq!(left.encodings(), right.encodings());
+        assert_eq!(left.num_values(), right.num_values());
+        assert_eq!(left.compressed_size(), right.compressed_size());
+        assert_eq!(left.data_page_offset(), right.data_page_offset());
+        assert_eq!(left.statistics(), right.statistics());
+        assert_eq!(left.offset_index_length(), right.offset_index_length());
+        assert_eq!(left.column_index_length(), right.column_index_length());
+    }
+
+    fn check_row_groups_are_equivalent(left: &RowGroupMetaData, right: &RowGroupMetaData) {
+        assert_eq!(left.num_rows(), right.num_rows());
+        assert_eq!(left.file_offset(), right.file_offset());
+        assert_eq!(left.total_byte_size(), right.total_byte_size());
+        assert_eq!(left.schema_descr(), right.schema_descr());
+        assert_eq!(left.num_columns(), right.num_columns());
+        left.columns()
+            .iter()
+            .zip(right.columns().iter())
+            .for_each(|(lc, rc)| {
+                check_columns_are_equivalent(lc, rc);
+            });
+    }
+
     #[tokio::test]
     #[cfg(feature = "async")]
     async fn test_encode_parquet_metadata_with_page_index() {
@@ -2212,7 +2237,32 @@ mod tests {
 
         let decoded_metadata = load_metadata_from_bytes(data.len(), data).await;
 
-        assert_eq!(metadata.metadata, decoded_metadata);
+        // Because the page index offsets will differ, compare invariant parts of the metadata
+        assert_eq!(
+            metadata.metadata.file_metadata(),
+            decoded_metadata.file_metadata()
+        );
+        assert_eq!(
+            metadata.metadata.column_index(),
+            decoded_metadata.column_index()
+        );
+        assert_eq!(
+            metadata.metadata.offset_index(),
+            decoded_metadata.offset_index()
+        );
+        assert_eq!(
+            metadata.metadata.num_row_groups(),
+            decoded_metadata.num_row_groups()
+        );
+
+        metadata
+            .metadata
+            .row_groups()
+            .iter()
+            .zip(decoded_metadata.row_groups().iter())
+            .for_each(|(left, right)| {
+                check_row_groups_are_equivalent(left, right);
+            });
     }
 
     #[test]

From f188bf825c4160279539a1a2bad7e1e90e54c427 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 31 Jul 2024 09:17:34 -0700
Subject: [PATCH 2/3] one more check

---
 parquet/src/file/writer.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 49bceb7ced70..4203e36780d2 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -2204,6 +2204,10 @@ mod tests {
         assert_eq!(left.statistics(), right.statistics());
         assert_eq!(left.offset_index_length(), right.offset_index_length());
         assert_eq!(left.column_index_length(), right.column_index_length());
+        assert_eq!(
+            left.unencoded_byte_array_data_bytes(),
+            right.unencoded_byte_array_data_bytes()
+        );
     }
 
     fn check_row_groups_are_equivalent(left: &RowGroupMetaData, right: &RowGroupMetaData) {

From 57b85d75cf1ec4f9af481b2d3dd1697471de1078 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 31 Jul 2024 09:30:42 -0700
Subject: [PATCH 3/3] make clippy happy

---
 parquet/src/file/writer.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 4203e36780d2..3eb2f8882708 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -2055,6 +2055,7 @@ mod tests {
     }
 
     struct TestMetadata {
+        #[allow(dead_code)]
         file_size: usize,
         metadata: ParquetMetaData,
     }