Skip to content

Commit

Permalink
Handle primitive REPEATED field not contained in LIST annotated group (#6649)
Browse files Browse the repository at this point in the history

* Handle primitive REPEATED field not contained in LIST annotated group

* cargo fmt

* Add UT

* cargo fmt

* comment

* clippy

* clippy

* update parquet-testing module

* cargo fmt

---------

Co-authored-by: Ze'ev Maor <[email protected]>
  • Loading branch information
zeevm and Ze'ev Maor authored Nov 2, 2024
1 parent 37cd34d commit 6859c87
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 2 deletions.
137 changes: 136 additions & 1 deletion parquet/src/record/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,17 @@ impl TreeBuilder {
.column_descr_ptr();
let col_reader = row_group_reader.get_column_reader(orig_index)?;
let column = TripletIter::new(col_descr, col_reader, self.batch_size);
Reader::PrimitiveReader(field, Box::new(column))
let reader = Reader::PrimitiveReader(field.clone(), Box::new(column));
if repetition == Repetition::REPEATED {
Reader::RepeatedReader(
field,
curr_def_level - 1,
curr_rep_level - 1,
Box::new(reader),
)
} else {
reader
}
} else {
match field.get_basic_info().converted_type() {
// List types
Expand Down Expand Up @@ -1688,6 +1698,131 @@ mod tests {
assert_eq!(rows, expected_rows);
}

#[test]
fn test_tree_reader_handle_primitive_repeated_fields_with_no_annotation() {
    // The fixture stores REPEATED primitive columns directly, both at the top
    // level and nested inside `group_of_lists`; each is expected to be read
    // back as a list field.

    // Wraps integers into the list field the reader is expected to produce.
    fn int_list(values: &[i32]) -> Field {
        Field::ListInternal(make_list(
            values.iter().copied().map(Field::Int).collect(),
        ))
    }

    // Wraps string slices into the list field the reader is expected to produce.
    fn str_list(values: &[&str]) -> Field {
        Field::ListInternal(make_list(
            values.iter().map(|s| Field::Str(s.to_string())).collect(),
        ))
    }

    // Every expected record repeats the same two lists at the top level and
    // inside the nested group, so one builder describes the whole row.
    fn expected_row(ints: &[i32], strs: &[&str]) -> Row {
        row![
            ("Int32_list".to_string(), int_list(ints)),
            ("String_list".to_string(), str_list(strs)),
            (
                "group_of_lists".to_string(),
                group![
                    ("Int32_list_in_group".to_string(), int_list(ints)),
                    ("String_list_in_group".to_string(), str_list(strs))
                ]
            )
        ]
    }

    let expected_rows = vec![
        expected_row(&[0, 1, 2, 3], &["foo", "zero", "one", "two"]),
        expected_row(&[], &["three"]),
        expected_row(&[4], &["four"]),
        expected_row(&[5, 6, 7, 8], &["five", "six", "seven", "eight"]),
    ];

    let rows = test_file_reader_rows("repeated_primitive_no_list.parquet", None).unwrap();
    assert_eq!(rows, expected_rows);
}

fn test_file_reader_rows(file_name: &str, schema: Option<Type>) -> Result<Vec<Row>> {
let file = get_test_file(file_name);
let file_reader: Box<dyn FileReader> = Box::new(SerializedFileReader::new(file)?);
Expand Down

0 comments on commit 6859c87

Please sign in to comment.