From 6859c877cd44cff63c46a1c2e14592538bd09934 Mon Sep 17 00:00:00 2001 From: Ze'ev Maor Date: Sat, 2 Nov 2024 12:30:30 +0200 Subject: [PATCH] Handle primitive REPEATED field not contained in LIST annotated group (#6649) * Handle primitive REPEATED field not contained in LIST annotated group * cargo fmt * Add UT * cargo fmt * comment * clippy * clippy * update parquet-testing module * cargo fmt --------- Co-authored-by: Ze'ev Maor --- parquet-testing | 2 +- parquet/src/record/reader.rs | 137 ++++++++++++++++++++++++++++++++++- 2 files changed, 137 insertions(+), 2 deletions(-) diff --git a/parquet-testing b/parquet-testing index 50af3d8ce206..550368ca77b9 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 50af3d8ce206990d81014b1862e5ce7380dc3e08 +Subproject commit 550368ca77b97231efead39251a96bd6f8f08c6e diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 1f9128a8b4f9..fd6ca7cdd57a 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -138,7 +138,17 @@ impl TreeBuilder { .column_descr_ptr(); let col_reader = row_group_reader.get_column_reader(orig_index)?; let column = TripletIter::new(col_descr, col_reader, self.batch_size); - Reader::PrimitiveReader(field, Box::new(column)) + let reader = Reader::PrimitiveReader(field.clone(), Box::new(column)); + if repetition == Repetition::REPEATED { + Reader::RepeatedReader( + field, + curr_def_level - 1, + curr_rep_level - 1, + Box::new(reader), + ) + } else { + reader + } } else { match field.get_basic_info().converted_type() { // List types @@ -1688,6 +1698,131 @@ mod tests { assert_eq!(rows, expected_rows); } + #[test] + fn test_tree_reader_handle_primitive_repeated_fields_with_no_annotation() { + // In this test the REPEATED fields are primitives + let rows = test_file_reader_rows("repeated_primitive_no_list.parquet", None).unwrap(); + let expected_rows = vec![ + row![ + ( + "Int32_list".to_string(), + Field::ListInternal(make_list([0, 1, 2, 3].map(Field::Int).to_vec())) + ), + ( + "String_list".to_string(), + Field::ListInternal(make_list( + ["foo", "zero", "one", "two"] + .map(|s| Field::Str(s.to_string())) + .to_vec() + )) + ), + ( + "group_of_lists".to_string(), + group![ + ( + "Int32_list_in_group".to_string(), + Field::ListInternal(make_list([0, 1, 2, 3].map(Field::Int).to_vec())) + ), + ( + "String_list_in_group".to_string(), + Field::ListInternal(make_list( + ["foo", "zero", "one", "two"] + .map(|s| Field::Str(s.to_string())) + .to_vec() + )) + ) + ] + ) + ], + row![ + ( + "Int32_list".to_string(), + Field::ListInternal(make_list(vec![])) + ), + ( + "String_list".to_string(), + Field::ListInternal(make_list( + ["three"].map(|s| Field::Str(s.to_string())).to_vec() + )) + ), + ( + "group_of_lists".to_string(), + group![ + ( + "Int32_list_in_group".to_string(), + Field::ListInternal(make_list(vec![])) + ), + ( + "String_list_in_group".to_string(), + Field::ListInternal(make_list( + ["three"].map(|s| Field::Str(s.to_string())).to_vec() + )) + ) + ] + ) + ], + row![ + ( + "Int32_list".to_string(), + Field::ListInternal(make_list(vec![Field::Int(4)])) + ), + ( + "String_list".to_string(), + Field::ListInternal(make_list( + ["four"].map(|s| Field::Str(s.to_string())).to_vec() + )) + ), + ( + "group_of_lists".to_string(), + group![ + ( + "Int32_list_in_group".to_string(), + Field::ListInternal(make_list(vec![Field::Int(4)])) + ), + ( + "String_list_in_group".to_string(), + Field::ListInternal(make_list( + ["four"].map(|s| Field::Str(s.to_string())).to_vec() + )) + ) + ] + ) + ], + row![ + ( + "Int32_list".to_string(), + Field::ListInternal(make_list([5, 6, 7, 8].map(Field::Int).to_vec())) + ), + ( + "String_list".to_string(), + Field::ListInternal(make_list( + ["five", "six", "seven", "eight"] + .map(|s| Field::Str(s.to_string())) + .to_vec() + )) + ), + ( + "group_of_lists".to_string(), + group![ + ( + "Int32_list_in_group".to_string(), + Field::ListInternal(make_list([5, 6, 7, 8].map(Field::Int).to_vec())) + ), + ( + "String_list_in_group".to_string(), + Field::ListInternal(make_list( + ["five", "six", "seven", "eight"] + .map(|s| Field::Str(s.to_string())) + .to_vec() + )) + ) + ] + ) + ], + ]; + assert_eq!(rows, expected_rows); + } + fn test_file_reader_rows(file_name: &str, schema: Option) -> Result> { let file = get_test_file(file_name); let file_reader: Box = Box::new(SerializedFileReader::new(file)?);