From f70ae7961fb3fc4aca9528f5f66a6071500567de Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 24 Oct 2024 14:38:08 +0800 Subject: [PATCH] add interop test --- .../parquet/arrow/arrow_reader_writer_test.cc | 123 ++++++++++-------- cpp/submodules/parquet-testing | 2 +- 2 files changed, 72 insertions(+), 53 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 1086c23539004..43cc57bd9ac21 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4094,59 +4094,78 @@ TEST(TestArrowReaderAdHoc, OldDataPageV2) { } TEST(TestArrowReaderAdHoc, LegacyTwoLevelList) { - // Create schema with a nested list of two-level encoding: - constexpr std::string_view kExpectedSchema = - "required group field_id=-1 schema {\n" - " optional group field_id=-1 nested_list (List) {\n" - " repeated group field_id=-1 array (List) {\n" - " repeated int32 field_id=-1 array;\n" - " }\n" - " }\n" - "}\n"; - auto inner_element = PrimitiveNode::Make("array", Repetition::REPEATED, Type::INT32); - auto outer_element = GroupNode::Make("array", Repetition::REPEATED, {inner_element}, - ConvertedType::LIST); - auto nested_list = GroupNode::Make("nested_list", Repetition::OPTIONAL, {outer_element}, - ConvertedType::LIST); - auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {nested_list}); - - // Create a Parquet writer to write values of nested list - auto sink = CreateOutputStream(); - auto file_writer = ParquetFileWriter::Open( - sink, std::dynamic_pointer_cast(schema_node)); - auto row_group_writer = file_writer->AppendRowGroup(); - auto int_writer = dynamic_cast(row_group_writer->NextColumn()); - ASSERT_TRUE(int_writer != nullptr); - - // Directly write a single row of nested list: [[1, 2],[3, 4]] - constexpr int64_t kNumValues = 4; - std::array rep_levels = {0, 2, 1, 2}; - std::array def_levels = {3, 3, 3, 3}; - std::array values = {1, 2, 3, 4}; - int_writer->WriteBatch(kNumValues, def_levels.data(), rep_levels.data(), values.data()); - file_writer->Close(); - - // Read schema and verify it applies two-level encoding of list type - ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); - auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source); - ASSERT_EQ(kExpectedSchema, file_reader->metadata()->schema()->ToString()); + auto VerifyData = [](std::unique_ptr file_reader) { + // Expected Parquet schema of legacy two-level encoding + constexpr std::string_view kExpectedLegacyList = + "required group field_id=-1 a (List) {\n" + " repeated group field_id=-1 array (List) {\n" + " repeated int32 field_id=-1 array;\n" + " }\n" + "}\n"; + + // Expected Arrow schema and data + auto arrow_inner_list = + field("array", list(field("array", ::arrow::int32(), /*nullable=*/false)), + /*nullable=*/false); + auto arrow_outer_list = list(arrow_inner_list); + auto arrow_schema = + ::arrow::schema({field("a", arrow_outer_list, /*nullable=*/false)}); + auto expected_table = TableFromJSON(arrow_schema, {R"([[[[1,2],[3,4]]]])"}); + + // Verify Parquet schema + auto root_group = file_reader->metadata()->schema()->group_node(); + ASSERT_EQ(1, root_group->field_count()); + std::stringstream nodeStr; + PrintSchema(root_group->field(0).get(), nodeStr); + ASSERT_EQ(kExpectedLegacyList, nodeStr.str()); + + // Verify Arrow schema and data + std::unique_ptr reader; + ASSERT_OK_NO_THROW( + FileReader::Make(default_memory_pool(), std::move(file_reader), &reader)); + std::shared_ptr table; + ASSERT_OK(reader->ReadTable(&table)); + AssertTablesEqual(*expected_table, *table); + }; - // Read and verify data - std::unique_ptr reader; - ASSERT_OK(FileReader::Make(default_memory_pool(), std::move(file_reader), &reader)); - std::shared_ptr
table; - ASSERT_OK(reader->ReadTable(&table)); - - auto arrow_inner_element = - ::arrow::field("array", ::arrow::int32(), /*nullable=*/false); - auto arrow_outer_element = - ::arrow::field("array", ::arrow::list(arrow_inner_element), /*nullable=*/false); - auto arrow_list = ::arrow::list(arrow_outer_element); - auto arrow_schema = - ::arrow::schema({::arrow::field("nested_list", arrow_list, /*nullable=*/true)}); - auto expected_table = ::arrow::TableFromJSON(arrow_schema, {R"([[[[1,2],[3,4]]]])"}); - ::arrow::AssertTablesEqual(*expected_table, *table); + // Round-trip test for Parquet C++ reader and writer + { + // Create Parquet schema of legacy two-level encoding + auto inner_list = GroupNode::Make("array", Repetition::REPEATED, + {schema::Int32("array", Repetition::REPEATED)}, + LogicalType::List()); + auto outer_list = + GroupNode::Make("a", Repetition::REQUIRED, {inner_list}, LogicalType::List()); + auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {outer_list}); + + // Create a Parquet writer to write values of nested list + auto sink = CreateOutputStream(); + auto file_writer = + ParquetFileWriter::Open(sink, std::dynamic_pointer_cast(schema_node)); + auto row_group_writer = file_writer->AppendRowGroup(); + auto int_writer = dynamic_cast(row_group_writer->NextColumn()); + ASSERT_TRUE(int_writer != nullptr); + + // Directly write a single row of nested list: [[1, 2],[3, 4]] + constexpr int64_t kNumValues = 4; + constexpr std::array kRepLevels = {0, 2, 1, 2}; + constexpr std::array kDefLevels = {2, 2, 2, 2}; + constexpr std::array kValues = {1, 2, 3, 4}; + int_writer->WriteBatch(kNumValues, kDefLevels.data(), kRepLevels.data(), + kValues.data()); + file_writer->Close(); + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + + // Read schema and verify it applies two-level encoding of list type + ASSERT_NO_FATAL_FAILURE( + VerifyData(ParquetFileReader::Open(std::make_shared(buffer)))); + } + + // Interoperability test for Parquet file generated by parquet-java + { + auto path = std::string(test::get_data_dir()) + "/old_list_structure.parquet"; + ASSERT_NO_FATAL_FAILURE(VerifyData(ParquetFileReader::OpenFile(path))); + } } class TestArrowReaderAdHocSparkAndHvr diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index cb7a9674142c1..a7f1d288e693d 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit cb7a9674142c137367bf75a01b79c6e214a73199 +Subproject commit a7f1d288e693dbb08e3199851c4eb2140ff8dff2