From 1bc2d114cdbcb57bf631992973462dbdcfadfd78 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 7 Sep 2024 00:23:20 +0800 Subject: [PATCH] GH-43994: [C++][Parquet] Fix schema conversion from two-level encoding nested list --- cpp/src/parquet/arrow/arrow_schema_test.cc | 20 ++++++++++++++++++++ cpp/src/parquet/arrow/schema.cc | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 31ead461aa6e2..debb80ebd30f1 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -601,6 +601,26 @@ TEST_F(TestConvertParquetSchema, ParquetLists) { arrow_fields.push_back(::arrow::field("name", arrow_list, false)); } + // Two-level encoding List<List<Integer>>: + // optional group nested_list (LIST) { + // repeated group array (LIST) { + // repeated int32 array; + // } + // } + { + auto inner_element = + PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32); + auto outer_element = GroupNode::Make("array", Repetition::REPEATED, {inner_element}, + ConvertedType::LIST); + parquet_fields.push_back(GroupNode::Make("nested_list", Repetition::OPTIONAL, + {outer_element}, ConvertedType::LIST)); + auto arrow_inner_element = ::arrow::field("array", INT32, /*nullable=*/false); + auto arrow_outer_element = + ::arrow::field("array", ::arrow::list(arrow_inner_element), /*nullable=*/false); + auto arrow_list = ::arrow::list(arrow_outer_element); + arrow_fields.push_back(::arrow::field("nested_list", arrow_list, true)); + } + auto arrow_schema = ::arrow::schema(arrow_fields); ASSERT_OK(ConvertSchema(parquet_fields)); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 1623d80dcb0e4..5b3ced6c56c9f 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -681,6 +681,10 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // List of primitive type RETURN_NOT_OK( 
NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field)); + } else if (list_group.field_count() == 1 && list_group.field(0)->is_repeated()) { + // Special case for nested list in two-level list encoding + RETURN_NOT_OK( + NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field)); } else { RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field)); }