From 075721bde4a63baa11075871b52948285ce004f4 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 7 Sep 2024 00:23:20 +0800 Subject: [PATCH 1/6] GH-43994: [C++][Parquet] Fix schema conversion from two-level encoding nested list --- cpp/src/parquet/arrow/arrow_schema_test.cc | 20 ++++++++++++++++++++ cpp/src/parquet/arrow/schema.cc | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index d261482d89a5d..0202caa1dabb7 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -601,6 +601,26 @@ TEST_F(TestConvertParquetSchema, ParquetLists) { arrow_fields.push_back(::arrow::field("name", arrow_list, false)); } + // Two-level encoding List>: + // optional group nested_list (LIST) { + // repeated group array (LIST) { + // repeated int32 array; + // } + // } + { + auto inner_element = + PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32); + auto outer_element = GroupNode::Make("array", Repetition::REPEATED, {inner_element}, + ConvertedType::LIST); + parquet_fields.push_back(GroupNode::Make("nested_list", Repetition::OPTIONAL, + {outer_element}, ConvertedType::LIST)); + auto arrow_inner_element = ::arrow::field("array", INT32, /*nullable=*/false); + auto arrow_outer_element = + ::arrow::field("array", ::arrow::list(arrow_inner_element), /*nullable=*/false); + auto arrow_list = ::arrow::list(arrow_outer_element); + arrow_fields.push_back(::arrow::field("nested_list", arrow_list, true)); + } + auto arrow_schema = ::arrow::schema(arrow_fields); ASSERT_OK(ConvertSchema(parquet_fields)); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 0d009c8d4f1e1..e17561782fcc2 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -681,6 +681,10 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // List of primitive type RETURN_NOT_OK( NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field)); + } else if (list_group.field_count() == 1 && list_group.field(0)->is_repeated()) { + // Special case for nested list in two-level list encoding + RETURN_NOT_OK( + NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field)); } else { RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field)); } From 0baa2b279383019bfeb0185a01fc6116d3bc1526 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 22:20:16 +0800 Subject: [PATCH 2/6] add a roundtrip test --- .../parquet/arrow/arrow_reader_writer_test.cc | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 78d272ff2458c..8216b6b4d1350 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4085,6 +4085,62 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) { TryReadDataFile(path); } +TEST(TestArrowReaderAdHoc, LegacyTwoLevelList) { + // Create schema with a nested list of two-level encoding: + constexpr std::string_view kExpectedSchema = + "required group field_id=-1 schema {\n" + " optional group field_id=-1 nested_list (List) {\n" + " repeated group field_id=-1 array (List) {\n" + " repeated int32 field_id=-1 array;\n" + " }\n" + " }\n" + "}\n"; + auto inner_element = PrimitiveNode::Make("array", Repetition::REPEATED, Type::INT32); + auto outer_element = GroupNode::Make("array", Repetition::REPEATED, {inner_element}, + ConvertedType::LIST); + auto nested_list = GroupNode::Make("nested_list", Repetition::OPTIONAL, {outer_element}, + ConvertedType::LIST); + auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {nested_list}); + + // Create a Parquet writer to write values of nested list + auto sink = CreateOutputStream(); + auto file_writer = ParquetFileWriter::Open( + sink, std::dynamic_pointer_cast(schema_node)); + auto row_group_writer = file_writer->AppendRowGroup(); + auto int_writer = dynamic_cast(row_group_writer->NextColumn()); + ASSERT_TRUE(int_writer != nullptr); + + // Directly write a single row of nested list: [[1, 2],[3, 4]] + constexpr int64_t kNumValues = 4; + std::array rep_levels = {0, 2, 1, 2}; + std::array def_levels = {3, 3, 3, 3}; + std::array values = {1, 2, 3, 4}; + int_writer->WriteBatch(kNumValues, def_levels.data(), rep_levels.data(), values.data()); + file_writer->Close(); + + // Read schema and verify it applies two-level encoding of list type + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source); + ASSERT_EQ(kExpectedSchema, file_reader->metadata()->schema()->ToString()); + + // Read and verify data + std::unique_ptr reader; + ASSERT_OK(FileReader::Make(default_memory_pool(), std::move(file_reader), &reader)); + std::shared_ptr table; + ASSERT_OK(reader->ReadTable(&table)); + + auto arrow_inner_element = + ::arrow::field("array", ::arrow::int32(), /*nullable=*/false); + auto arrow_outer_element = + ::arrow::field("array", ::arrow::list(arrow_inner_element), /*nullable=*/false); + auto arrow_list = ::arrow::list(arrow_outer_element); + auto arrow_schema = + ::arrow::schema({::arrow::field("nested_list", arrow_list, /*nullable=*/true)}); + auto expected_table = ::arrow::TableFromJSON(arrow_schema, {R"([[[[1,2],[3,4]]]])"}); + ::arrow::AssertTablesEqual(*expected_table, *table); +} + class TestArrowReaderAdHocSparkAndHvr : public ::testing::TestWithParam< std::tuple>> {}; From 7d9bbb494e8ee01341c34163c203ec2d5a4d6394 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 24 Oct 2024 14:38:08 +0800 Subject: [PATCH 3/6] add interop test --- .../parquet/arrow/arrow_reader_writer_test.cc | 123 ++++++++++-------- cpp/submodules/parquet-testing | 2 +- 2 files changed, 72 insertions(+), 53 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 8216b6b4d1350..3a3c1e815e779 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4086,59 +4086,78 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) { } TEST(TestArrowReaderAdHoc, LegacyTwoLevelList) { - // Create schema with a nested list of two-level encoding: - constexpr std::string_view kExpectedSchema = - "required group field_id=-1 schema {\n" - " optional group field_id=-1 nested_list (List) {\n" - " repeated group field_id=-1 array (List) {\n" - " repeated int32 field_id=-1 array;\n" - " }\n" - " }\n" - "}\n"; - auto inner_element = PrimitiveNode::Make("array", Repetition::REPEATED, Type::INT32); - auto outer_element = GroupNode::Make("array", Repetition::REPEATED, {inner_element}, - ConvertedType::LIST); - auto nested_list = GroupNode::Make("nested_list", Repetition::OPTIONAL, {outer_element}, - ConvertedType::LIST); - auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {nested_list}); - - // Create a Parquet writer to write values of nested list - auto sink = CreateOutputStream(); - auto file_writer = ParquetFileWriter::Open( - sink, std::dynamic_pointer_cast(schema_node)); - auto row_group_writer = file_writer->AppendRowGroup(); - auto int_writer = dynamic_cast(row_group_writer->NextColumn()); - ASSERT_TRUE(int_writer != nullptr); - - // Directly write a single row of nested list: [[1, 2],[3, 4]] - constexpr int64_t kNumValues = 4; - std::array rep_levels = {0, 2, 1, 2}; - std::array def_levels = {3, 3, 3, 3}; - std::array values = {1, 2, 3, 4}; - int_writer->WriteBatch(kNumValues, def_levels.data(), rep_levels.data(), values.data()); - file_writer->Close(); - - // Read schema and verify it applies two-level encoding of list type - ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); - auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source); - ASSERT_EQ(kExpectedSchema, file_reader->metadata()->schema()->ToString()); + auto VerifyData = [](std::unique_ptr file_reader) { + // Expected Parquet schema of legacy two-level encoding + constexpr std::string_view kExpectedLegacyList = + "required group field_id=-1 a (List) {\n" + " repeated group field_id=-1 array (List) {\n" + " repeated int32 field_id=-1 array;\n" + " }\n" + "}\n"; + + // Expected Arrow schema and data + auto arrow_inner_list = + field("array", list(field("array", ::arrow::int32(), /*nullable=*/false)), + /*nullable=*/false); + auto arrow_outer_list = list(arrow_inner_list); + auto arrow_schema = + ::arrow::schema({field("a", arrow_outer_list, /*nullable=*/false)}); + auto expected_table = TableFromJSON(arrow_schema, {R"([[[[1,2],[3,4]]]])"}); + + // Verify Parquet schema + auto root_group = file_reader->metadata()->schema()->group_node(); + ASSERT_EQ(1, root_group->field_count()); + std::stringstream nodeStr; + PrintSchema(root_group->field(0).get(), nodeStr); + ASSERT_EQ(kExpectedLegacyList, nodeStr.str()); + + // Verify Arrow schema and data + std::unique_ptr reader; + ASSERT_OK_NO_THROW( + FileReader::Make(default_memory_pool(), std::move(file_reader), &reader)); + std::shared_ptr
table; + ASSERT_OK(reader->ReadTable(&table)); + AssertTablesEqual(*expected_table, *table); + }; - // Read and verify data - std::unique_ptr reader; - ASSERT_OK(FileReader::Make(default_memory_pool(), std::move(file_reader), &reader)); - std::shared_ptr
table; - ASSERT_OK(reader->ReadTable(&table)); - - auto arrow_inner_element = - ::arrow::field("array", ::arrow::int32(), /*nullable=*/false); - auto arrow_outer_element = - ::arrow::field("array", ::arrow::list(arrow_inner_element), /*nullable=*/false); - auto arrow_list = ::arrow::list(arrow_outer_element); - auto arrow_schema = - ::arrow::schema({::arrow::field("nested_list", arrow_list, /*nullable=*/true)}); - auto expected_table = ::arrow::TableFromJSON(arrow_schema, {R"([[[[1,2],[3,4]]]])"}); - ::arrow::AssertTablesEqual(*expected_table, *table); + // Round-trip test for Parquet C++ reader and writer + { + // Create Parquet schema of legacy two-level encoding + auto inner_list = GroupNode::Make("array", Repetition::REPEATED, + {schema::Int32("array", Repetition::REPEATED)}, + LogicalType::List()); + auto outer_list = + GroupNode::Make("a", Repetition::REQUIRED, {inner_list}, LogicalType::List()); + auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {outer_list}); + + // Create a Parquet writer to write values of nested list + auto sink = CreateOutputStream(); + auto file_writer = + ParquetFileWriter::Open(sink, std::dynamic_pointer_cast(schema_node)); + auto row_group_writer = file_writer->AppendRowGroup(); + auto int_writer = dynamic_cast(row_group_writer->NextColumn()); + ASSERT_TRUE(int_writer != nullptr); + + // Directly write a single row of nested list: [[1, 2],[3, 4]] + constexpr int64_t kNumValues = 4; + constexpr std::array kRepLevels = {0, 2, 1, 2}; + constexpr std::array kDefLevels = {2, 2, 2, 2}; + constexpr std::array kValues = {1, 2, 3, 4}; + int_writer->WriteBatch(kNumValues, kDefLevels.data(), kRepLevels.data(), + kValues.data()); + file_writer->Close(); + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + + // Read schema and verify it applies two-level encoding of list type + ASSERT_NO_FATAL_FAILURE( + VerifyData(ParquetFileReader::Open(std::make_shared(buffer)))); + } + + // Interoperability test for Parquet file generated by parquet-java + { + auto path = std::string(test::get_data_dir()) + "/old_list_structure.parquet"; + ASSERT_NO_FATAL_FAILURE(VerifyData(ParquetFileReader::OpenFile(path))); + } } class TestArrowReaderAdHocSparkAndHvr diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index cb7a9674142c1..a7f1d288e693d 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199 +Subproject commit a7f1d288e693dbb08e3199851c4eb2140ff8dff2 From 3262601f30a8c35d1bacba9b8d9ecd63e42770cc Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 30 Oct 2024 17:58:49 +0800 Subject: [PATCH 4/6] reorganize logic and add comment --- cpp/src/parquet/arrow/schema.cc | 98 +++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index e17561782fcc2..c56ed4df1084a 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -513,11 +513,13 @@ Status PopulateLeaf(int column_index, const std::shared_ptr& field, } // Special case mentioned in the format spec: -// If the name is array or ends in _tuple, this should be a list of struct -// even for single child elements. -bool HasStructListName(const GroupNode& node) { +// If the name is array or uses the parent's name with `_tuple` appended, +// this should be: +// - a list of list or map type if the repeated group node is LIST- or MAP-annotated. +// - otherwise, a list of struct even for single child elements. +bool HasListElementName(const GroupNode& node, const GroupNode& parent) { ::std::string_view name{node.name()}; - return name == "array" || EndsWith(name, "_tuple"); + return name == "array" || name == (parent.name() + "_tuple"); } Status GroupToStruct(const GroupNode& node, LevelInfo current_levels, @@ -598,9 +600,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels, ctx->LinkParent(value_field, key_value_field); // required/optional group name=whatever { - // repeated group name=key_values{ + // repeated group name=key_values { // required TYPE key; - // required/optional TYPE value; + // required/optional TYPE value; // } // } // @@ -651,42 +653,54 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated(); if (list_node.is_group()) { - // Resolve 3-level encoding - // - // required/optional group name=whatever { - // repeated group name=list { - // required/optional TYPE item; - // } - // } - // - // yields list ?nullable - // - // We distinguish the special case that we have - // - // required/optional group name=whatever { - // repeated group name=array or $SOMETHING_tuple { - // required/optional TYPE item; - // } - // } - // - // In this latter case, the inner type of the list should be a struct - // rather than a primitive value - // - // yields list not null> ?nullable const auto& list_group = static_cast(list_node); - // Special case mentioned in the format spec: - // If the name is array or ends in _tuple, this should be a list of struct - // even for single child elements. - if (list_group.field_count() == 1 && !HasStructListName(list_group)) { - // List of primitive type - RETURN_NOT_OK( - NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field)); - } else if (list_group.field_count() == 1 && list_group.field(0)->is_repeated()) { - // Special case for nested list in two-level list encoding - RETURN_NOT_OK( - NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field)); - } else { + if (list_group.field_count() > 1) { + // The inner type of the list should be a struct when there are multiple fields + // in the repeated group RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field)); + } else if (list_group.field_count() == 1) { + const auto& repeated_field = list_group.field(0); + if (repeated_field->is_repeated()) { + // Special case where the inner type might be a list with two-level encoding + // like below: + // + // required/optional group name=SOMETHING (LIST) { + // repeated group array (LIST) { + // repeated TYPE item; + // } + // } + // + // yields list not null> ?nullable + RETURN_NOT_OK( + NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); + } else if (HasListElementName(list_group, group)) { + // We distinguish the special case that we have + // + // required/optional group name=SOMETHING { + // repeated group name=array or $SOMETHING_tuple { + // required/optional TYPE item; + // } + // } + // + // The inner type of the list should be a struct rather than a primitive value + // + // yields list not null> ?nullable + RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field)); + } else { + // Resolve 3-level encoding + // + // required/optional group name=whatever { + // repeated group name=list { + // required/optional TYPE item; + // } + // } + // + // yields list ?nullable + RETURN_NOT_OK( + NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); + } + } else { + return Status::Invalid("Group must have at least one child."); } } else { // Two-level list encoding @@ -694,6 +708,10 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // required/optional group LIST { // repeated TYPE; // } + // + // TYPE is a primitive type + // + // yields list ?nullable const auto& primitive_node = static_cast(list_node); int column_index = ctx->schema->GetColumnIndex(primitive_node); ASSIGN_OR_RAISE(std::shared_ptr type, From 0f09216247cda9b0ff2fd6877af01a3468fe24bd Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 31 Oct 2024 13:50:16 +0800 Subject: [PATCH 5/6] add cases for list and list>> --- cpp/src/parquet/arrow/arrow_schema_test.cc | 109 ++++++++++++++++++--- cpp/src/parquet/arrow/schema.cc | 20 +++- 2 files changed, 114 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 0202caa1dabb7..0d439e0119ba9 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -18,6 +18,7 @@ #include #include +#include "gmock/gmock-matchers.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -602,23 +603,55 @@ TEST_F(TestConvertParquetSchema, ParquetLists) { } // Two-level encoding List>: - // optional group nested_list (LIST) { + // optional group my_list (LIST) { // repeated group array (LIST) { // repeated int32 array; // } // } { - auto inner_element = + auto inner_array = PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32); - auto outer_element = GroupNode::Make("array", Repetition::REPEATED, {inner_element}, - ConvertedType::LIST); - parquet_fields.push_back(GroupNode::Make("nested_list", Repetition::OPTIONAL, - {outer_element}, ConvertedType::LIST)); - auto arrow_inner_element = ::arrow::field("array", INT32, /*nullable=*/false); - auto arrow_outer_element = - ::arrow::field("array", ::arrow::list(arrow_inner_element), /*nullable=*/false); - auto arrow_list = ::arrow::list(arrow_outer_element); - arrow_fields.push_back(::arrow::field("nested_list", arrow_list, true)); + auto outer_array = GroupNode::Make("array", Repetition::REPEATED, {inner_array}, + ConvertedType::LIST); + parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL, + {outer_array}, ConvertedType::LIST)); + auto arrow_inner_array = ::arrow::field("array", INT32, /*nullable=*/false); + auto arrow_outer_array = + ::arrow::field("array", ::arrow::list(arrow_inner_array), /*nullable=*/false); + auto arrow_list = ::arrow::list(arrow_outer_array); + arrow_fields.push_back(::arrow::field("my_list", arrow_list, true)); + } + + // List> in three-level list encoding: + // optional group my_list (LIST) { + // repeated group list { + // required group element (MAP) { + // repeated group key_value { + // required binary key (STRING); + // optional binary value (STRING); + // } + // } + // } + // } + { + auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY, + ConvertedType::UTF8); + auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value}); + auto element = + GroupNode::Make("element", Repetition::REQUIRED, {key_value}, ConvertedType::MAP); + auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); + parquet_fields.push_back( + GroupNode::Make("my_list", Repetition::OPTIONAL, {list}, ConvertedType::LIST)); + + auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false); + auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true); + auto arrow_element = ::arrow::field( + "element", std::make_shared<::arrow::MapType>(arrow_key, arrow_value), + /*nullable=*/false); + auto arrow_list = ::arrow::list(arrow_element); + arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true)); } auto arrow_schema = ::arrow::schema(arrow_fields); @@ -747,6 +780,60 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); } +TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) { + // List> in two-level list encoding: + // + // optional group my_list (LIST) { + // repeated group array (MAP) { + // repeated group key_value { + // required binary key (STRING); + // optional binary value (STRING); + // } + // } + // } + { + auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY, + ConvertedType::UTF8); + auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value}); + auto array = + GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP); + std::vector parquet_fields; + parquet_fields.push_back( + GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST)); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Group with one repeated child must be LIST-annotated."), + ConvertSchema(parquet_fields)); + } + + // List>: outer list is two-level encoding, inner list is three-level + // + // optional group my_list (LIST) { + // repeated group array (LIST) { + // repeated group list { + // required binary element (STRING); + // } + // } + // } + { + auto element = PrimitiveNode::Make("element", Repetition::REQUIRED, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); + auto array = + GroupNode::Make("array", Repetition::REPEATED, {list}, ConvertedType::LIST); + std::vector parquet_fields; + parquet_fields.push_back( + GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST)); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("LIST-annotated groups must not be repeated."), + ConvertSchema(parquet_fields)); + } +} + Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>& arrow_schema, std::shared_ptr& metadata) { ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index c56ed4df1084a..41e2350b08d4a 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -633,9 +633,12 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, if (group.field_count() != 1) { return Status::Invalid("LIST-annotated groups must have a single child."); } - if (group.is_repeated()) { - return Status::Invalid("LIST-annotated groups must not be repeated."); - } + + // The Parquet spec requires that LIST-annotated group cannot be repeated when + // it applies normal three-level encoding. We need to figure out legacy list + // structures and do not enforce this rule for them. + bool is_legacy_list_structure = true; + current_levels.Increment(group); out->children.resize(group.field_count()); @@ -671,8 +674,11 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // } // // yields list not null> ?nullable + if (!list_group.logical_type()->is_list()) { + return Status::Invalid("Group with one repeated child must be LIST-annotated."); + } RETURN_NOT_OK( - NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); + ListToSchemaField(list_group, current_levels, ctx, out, child_field)); } else if (HasListElementName(list_group, group)) { // We distinguish the special case that we have // @@ -696,6 +702,7 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // } // // yields list ?nullable + is_legacy_list_structure = false; RETURN_NOT_OK( NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); } @@ -721,6 +728,11 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, RETURN_NOT_OK( PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field)); } + + if (!is_legacy_list_structure && group.is_repeated()) { + return Status::Invalid("LIST-annotated groups must not be repeated."); + } + out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field), group.is_optional(), FieldIdMetadata(group.field_id())); out->level_info = current_levels; From 487cc0b0474be027033c0d8466460a2d36a0ba32 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 31 Oct 2024 23:39:25 +0800 Subject: [PATCH 6/6] fix tests --- cpp/src/parquet/arrow/arrow_schema_test.cc | 4 +++- cpp/src/parquet/arrow/schema.cc | 21 +++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 0d439e0119ba9..a6e04e54259c9 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -1953,7 +1953,9 @@ TEST_F(TestLevels, ListErrors) { { ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make( "child_list", Repetition::REPEATED, - {PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)}, + {GroupNode::Make("list", Repetition::REPEATED, + {PrimitiveNode::Make("element", Repetition::REQUIRED, + ParquetType::BOOLEAN)})}, LogicalType::List())); ASSERT_RAISES(Invalid, error); std::string expected("LIST-annotated groups must not be repeated."); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 41e2350b08d4a..0ee595508fec4 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -633,11 +633,9 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, if (group.field_count() != 1) { return Status::Invalid("LIST-annotated groups must have a single child."); } - - // The Parquet spec requires that LIST-annotated group cannot be repeated when - // it applies normal three-level encoding. We need to figure out legacy list - // structures and do not enforce this rule for them. - bool is_legacy_list_structure = true; + if (group.is_repeated()) { + return Status::Invalid("LIST-annotated groups must not be repeated."); + } current_levels.Increment(group); @@ -677,8 +675,13 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, if (!list_group.logical_type()->is_list()) { return Status::Invalid("Group with one repeated child must be LIST-annotated."); } + // LIST-annotated group with three-level encoding cannot be repeated. + if (repeated_field->is_group() && + !static_cast(*repeated_field).field(0)->is_repeated()) { + return Status::Invalid("LIST-annotated groups must not be repeated."); + } RETURN_NOT_OK( - ListToSchemaField(list_group, current_levels, ctx, out, child_field)); + NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); } else if (HasListElementName(list_group, group)) { // We distinguish the special case that we have // @@ -702,7 +705,6 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // } // // yields list ?nullable - is_legacy_list_structure = false; RETURN_NOT_OK( NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); } @@ -728,11 +730,6 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, RETURN_NOT_OK( PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field)); } - - if (!is_legacy_list_structure && group.is_repeated()) { - return Status::Invalid("LIST-annotated groups must not be repeated."); - } - out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field), group.is_optional(), FieldIdMetadata(group.field_id())); out->level_info = current_levels;