diff --git a/velox/dwio/parquet/reader/Metadata.cpp b/velox/dwio/parquet/reader/Metadata.cpp index 771e68e8a595..6234e11c47b7 100644 --- a/velox/dwio/parquet/reader/Metadata.cpp +++ b/velox/dwio/parquet/reader/Metadata.cpp @@ -219,6 +219,11 @@ ColumnChunkMetaDataPtr::getColumnStatistics( thriftColumnChunkPtr(ptr_)->meta_data.statistics, *type, numRows); }; +thrift::ColumnMetaData ColumnChunkMetaDataPtr::getColumnMetadata() { + VELOX_CHECK(hasStatistics()); + return thriftColumnChunkPtr(ptr_)->meta_data; +} + int64_t ColumnChunkMetaDataPtr::dataPageOffset() const { return thriftColumnChunkPtr(ptr_)->meta_data.data_page_offset; } diff --git a/velox/dwio/parquet/reader/Metadata.h b/velox/dwio/parquet/reader/Metadata.h index f99d46656d8c..599c0a55980e 100644 --- a/velox/dwio/parquet/reader/Metadata.h +++ b/velox/dwio/parquet/reader/Metadata.h @@ -18,6 +18,7 @@ #include "velox/dwio/common/Statistics.h" #include "velox/dwio/common/compression/Compression.h" +#include "velox/dwio/parquet/thrift/ParquetThriftTypes.h" namespace facebook::velox::parquet { @@ -42,6 +43,9 @@ class ColumnChunkMetaDataPtr { const TypePtr type, int64_t numRows); + /// Return the ColumnMetadata + thrift::ColumnMetaData getColumnMetadata(); + /// Number of values. int64_t numValues() const; diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 05091f4f0e14..12ff744a73a3 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -1156,4 +1156,8 @@ FileMetaDataPtr ParquetReader::fileMetaData() const { return readerBase_->fileMetaData(); } +const thrift::FileMetaData& ParquetReader::thriftFileMetaData() const { + return readerBase_->thriftFileMetaData(); +} + } // namespace facebook::velox::parquet diff --git a/velox/dwio/parquet/reader/ParquetReader.h b/velox/dwio/parquet/reader/ParquetReader.h index de6d7a9966dc..567d21e756a5 100644 --- a/velox/dwio/parquet/reader/ParquetReader.h +++ b/velox/dwio/parquet/reader/ParquetReader.h @@ -106,6 +106,8 @@ class ParquetReader : public dwio::common::Reader { FileMetaDataPtr fileMetaData() const; + const thrift::FileMetaData& thriftFileMetaData() const; + private: std::shared_ptr readerBase_; }; diff --git a/velox/dwio/parquet/tests/writer/CMakeLists.txt b/velox/dwio/parquet/tests/writer/CMakeLists.txt index 01e718dc22c4..dfa2980d6673 100644 --- a/velox/dwio/parquet/tests/writer/CMakeLists.txt +++ b/velox/dwio/parquet/tests/writer/CMakeLists.txt @@ -48,3 +48,22 @@ target_link_libraries( ${TEST_LINK_LIBS} GTest::gtest fmt::fmt) + +add_executable(velox_dwio_parquet_writer_test MetadataTest.cpp + StatisticsTest.cpp) + +add_test( + NAME velox_dwio_parquet_writer_test + COMMAND velox_dwio_parquet_writer_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + +target_link_libraries( + velox_dwio_parquet_writer_test + velox_dwio_arrow_parquet_writer_test_lib + GTest::gmock + GTest::gtest + GTest::gtest_main + arrow + arrow_testing + velox_dwio_native_parquet_reader + velox_temp_path) diff --git a/velox/dwio/parquet/writer/arrow/tests/MetadataTest.cpp b/velox/dwio/parquet/tests/writer/MetadataTest.cpp similarity index 90% rename from velox/dwio/parquet/writer/arrow/tests/MetadataTest.cpp rename to velox/dwio/parquet/tests/writer/MetadataTest.cpp index f65087088435..e975b23923dd 100644 --- a/velox/dwio/parquet/writer/arrow/tests/MetadataTest.cpp +++ b/velox/dwio/parquet/tests/writer/MetadataTest.cpp @@ -26,9 +26,13 @@ #include "velox/dwio/parquet/writer/arrow/Statistics.h" #include "velox/dwio/parquet/writer/arrow/ThriftInternal.h" #include "velox/dwio/parquet/writer/arrow/Types.h" -#include "velox/dwio/parquet/writer/arrow/tests/FileReader.h" #include "velox/dwio/parquet/writer/arrow/tests/TestUtil.h" +#include "velox/dwio/parquet/reader/ParquetReader.h" +#include "velox/exec/tests/utils/TempFilePath.h" + +#include + namespace facebook::velox::parquet::arrow { namespace metadata { @@ -394,21 +398,47 @@ TEST(Metadata, TestAddKeyValueMetadata) { file_writer->AddKeyValueMetadata(kv_meta_ignored), ParquetException); PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); - auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source); - ASSERT_NE(nullptr, file_reader->metadata()); - ASSERT_NE(nullptr, file_reader->metadata()->key_value_metadata()); - auto read_kv_meta = file_reader->metadata()->key_value_metadata(); + // Write the buffer to a temp file + auto temp_file = exec::test::TempFilePath::create(); + auto file_path = temp_file->getPath(); + auto local_write_file = + std::make_unique(file_path, false, false); + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(buffer); + auto buffer_to_string = buffer_reader->buffer()->ToString(); + local_write_file->append(buffer_to_string); + local_write_file->close(); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr root_pool; + std::shared_ptr leaf_pool; + root_pool = memory::memoryManager()->addRootPool("MetadataTest"); + leaf_pool = root_pool->addLeafChild("MetadataTest"); + dwio::common::ReaderOptions readerOptions{leaf_pool.get()}; + auto input = std::make_unique( + std::make_shared(file_path), readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + auto thrift_meta = reader->thriftFileMetaData(); + ASSERT_NE(0, thrift_meta.key_value_metadata.size()); // Verify keys that were added before file writer was closed are present. for (int i = 1; i <= 3; ++i) { - auto index = std::to_string(i); - PARQUET_ASSIGN_OR_THROW(auto value, read_kv_meta->Get("test_key_" + index)); + auto key = thrift_meta.key_value_metadata[i - 1].key; + auto value = thrift_meta.key_value_metadata[i - 1].value; + + auto index = std::to_string(i + 1); + if (i == 3) { + index = std::to_string(1); + } + + EXPECT_EQ("test_key_" + index, key); EXPECT_EQ("test_value_" + index, value); } // Verify keys that were added after file writer was closed are not present. - EXPECT_FALSE(read_kv_meta->Contains("test_key_4")); + for (const auto& kv : thrift_meta.key_value_metadata) { + EXPECT_NE("test_key_4", kv.key); + EXPECT_NE("test_value_4", kv.value); + } } // TODO: disabled as they require Arrow parquet data dir. @@ -498,15 +528,36 @@ TEST(Metadata, TestSortingColumns) { file_writer->Close(); PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); - auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source); - - ASSERT_NE(nullptr, file_reader->metadata()); - ASSERT_EQ(1, file_reader->metadata()->num_row_groups()); - auto row_group_reader = file_reader->RowGroup(0); - auto* row_group_read_metadata = row_group_reader->metadata(); - ASSERT_NE(nullptr, row_group_read_metadata); - EXPECT_EQ(sorting_columns, row_group_read_metadata->sorting_columns()); + + // Write the buffer to a temp file + auto temp_file = exec::test::TempFilePath::create(); + auto file_path = temp_file->getPath(); + auto local_write_file = + std::make_unique(file_path, false, false); + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(buffer); + auto buffer_to_string = buffer_reader->buffer()->ToString(); + local_write_file->append(buffer_to_string); + local_write_file->close(); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr root_pool; + std::shared_ptr leaf_pool; + root_pool = memory::memoryManager()->addRootPool("MetadataTest"); + leaf_pool = root_pool->addLeafChild("MetadataTest"); + dwio::common::ReaderOptions readerOptions{leaf_pool.get()}; + auto input = std::make_unique( + std::make_shared(file_path), readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + ASSERT_EQ(1, reader->thriftFileMetaData().row_groups.size()); + auto row_groups = reader->thriftFileMetaData().row_groups; + ASSERT_NE(true, row_groups.empty()); + auto row_group_sorting_columns = row_groups[0].sorting_columns; + EXPECT_EQ( + sorting_columns[0].column_idx, row_group_sorting_columns[0].column_idx); + EXPECT_EQ( + sorting_columns[0].descending, row_group_sorting_columns[0].descending); + EXPECT_EQ( + sorting_columns[0].nulls_first, row_group_sorting_columns[0].nulls_first); } TEST(ApplicationVersion, Basics) { diff --git a/velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp b/velox/dwio/parquet/tests/writer/StatisticsTest.cpp similarity index 94% rename from velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp rename to velox/dwio/parquet/tests/writer/StatisticsTest.cpp index 3b7029dab0e4..9d3b7635d977 100644 --- a/velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp +++ b/velox/dwio/parquet/tests/writer/StatisticsTest.cpp @@ -37,6 +37,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/ubsan.h" +#include "velox/dwio/parquet/reader/ParquetReader.h" #include "velox/dwio/parquet/writer/arrow/ColumnWriter.h" #include "velox/dwio/parquet/writer/arrow/FileWriter.h" #include "velox/dwio/parquet/writer/arrow/Platform.h" @@ -45,8 +46,8 @@ #include "velox/dwio/parquet/writer/arrow/ThriftInternal.h" #include "velox/dwio/parquet/writer/arrow/Types.h" #include "velox/dwio/parquet/writer/arrow/tests/ColumnReader.h" -#include "velox/dwio/parquet/writer/arrow/tests/FileReader.h" #include "velox/dwio/parquet/writer/arrow/tests/TestUtil.h" +#include "velox/exec/tests/utils/TempFilePath.h" using arrow::default_memory_pool; using arrow::MemoryPool; @@ -494,18 +495,37 @@ class TestStatistics : public PrimitiveTypedTest { ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source); - auto rg_reader = file_reader->RowGroup(0); - auto column_chunk = rg_reader->metadata()->ColumnChunk(0); - if (!column_chunk->is_stats_set()) - return; - std::shared_ptr stats = column_chunk->statistics(); - // check values after serialization + deserialization - EXPECT_EQ(null_count, stats->null_count()); - EXPECT_EQ(num_values - null_count, stats->num_values()); + + // Write the buffer to a temp file + auto temp_file = exec::test::TempFilePath::create(); + auto file_path = temp_file->getPath(); + auto local_write_file = + std::make_unique(file_path, false, false); + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(buffer); + auto buffer_to_string = buffer_reader->buffer()->ToString(); + local_write_file->append(buffer_to_string); + local_write_file->close(); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr root_pool; + std::shared_ptr leaf_pool; + root_pool = memory::memoryManager()->addRootPool("StatisticsTest"); + leaf_pool = root_pool->addLeafChild("StatisticsTest"); + dwio::common::ReaderOptions readerOptions{leaf_pool.get()}; + auto input = std::make_unique( + std::make_shared(file_path), readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + auto row_group = reader->fileMetaData().rowGroup(0); + auto column_chunk = row_group.columnChunk(0); + auto column_metadata = column_chunk.getColumnMetadata(); + auto stats = column_metadata.statistics; + EXPECT_EQ(null_count, stats.null_count); EXPECT_TRUE(expected_stats->HasMinMax()); - EXPECT_EQ(expected_stats->EncodeMin(), stats->EncodeMin()); - EXPECT_EQ(expected_stats->EncodeMax(), stats->EncodeMax()); + EXPECT_EQ(expected_stats->EncodeMin(), stats.min_value); + EXPECT_EQ(expected_stats->EncodeMax(), stats.max_value); + auto column_stats = + column_chunk.getColumnStatistics(INTEGER(), row_group.numRows()); + EXPECT_EQ(num_values - null_count, column_stats->getNumberOfValues()); } }; @@ -1015,19 +1035,32 @@ class TestStatisticsSortOrder : public ::testing::Test { void VerifyParquetStats() { ASSERT_OK_AND_ASSIGN(auto pbuffer, parquet_sink_->Finish()); - // Create a ParquetReader instance - std::unique_ptr parquet_reader = ParquetFileReader::Open( - std::make_shared<::arrow::io::BufferReader>(pbuffer)); - - // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - std::shared_ptr rg_metadata = file_metadata->RowGroup(0); + // Write the pbuffer to a temp file + auto temp_file = exec::test::TempFilePath::create(); + auto file_path = temp_file->getPath(); + auto local_write_file = + std::make_unique(file_path, false, false); + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(pbuffer); + auto buffer_to_string = buffer_reader->buffer()->ToString(); + local_write_file->append(buffer_to_string); + local_write_file->close(); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr root_pool; + std::shared_ptr leaf_pool; + root_pool = memory::memoryManager()->addRootPool("StatisticsTest"); + leaf_pool = root_pool->addLeafChild("StatisticsTest"); + dwio::common::ReaderOptions readerOptions{leaf_pool.get()}; + auto input = std::make_unique( + std::make_shared(file_path), readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + auto row_group = reader->fileMetaData().rowGroup(0); for (int i = 0; i < static_cast(fields_.size()); i++) { ARROW_SCOPED_TRACE("Statistics for field #", i); - std::shared_ptr cc_metadata = - rg_metadata->ColumnChunk(i); - EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); - EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + auto column_chunk = row_group.columnChunk(i); + auto cc_metadata = column_chunk.getColumnMetadata(); + EXPECT_EQ(stats_[i].min(), cc_metadata.statistics.min_value); + EXPECT_EQ(stats_[i].max(), cc_metadata.statistics.max_value); } } @@ -1264,34 +1297,6 @@ TEST(TestByteArrayStatisticsFromArrow, LargeStringType) { TestByteArrayStatisticsFromArrow<::arrow::LargeStringType>(); } -// Ensure UNKNOWN sort order is handled properly -using TestStatisticsSortOrderFLBA = TestStatisticsSortOrder; - -TEST_F(TestStatisticsSortOrderFLBA, UnknownSortOrder) { - this->fields_.push_back(schema::PrimitiveNode::Make( - "Column 0", - Repetition::REQUIRED, - Type::FIXED_LEN_BYTE_ARRAY, - ConvertedType::INTERVAL, - FLBA_LENGTH)); - this->SetUpSchema(); - this->WriteParquet(); - - ASSERT_OK_AND_ASSIGN(auto pbuffer, parquet_sink_->Finish()); - - // Create a ParquetReader instance - std::unique_ptr parquet_reader = ParquetFileReader::Open( - std::make_shared<::arrow::io::BufferReader>(pbuffer)); - // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - std::shared_ptr rg_metadata = file_metadata->RowGroup(0); - std::shared_ptr cc_metadata = - rg_metadata->ColumnChunk(0); - - // stats should not be set for UNKNOWN sort order - ASSERT_FALSE(cc_metadata->is_stats_set()); -} - template < typename Stats, typename Array, diff --git a/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt b/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt index 6ed5e1e8e86c..22aa47c000b2 100644 --- a/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt +++ b/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt @@ -20,11 +20,9 @@ add_executable( EncodingTest.cpp FileDeserializeTest.cpp FileSerializeTest.cpp - MetadataTest.cpp PageIndexTest.cpp PropertiesTest.cpp SchemaTest.cpp - StatisticsTest.cpp TypesTest.cpp) add_test(velox_dwio_arrow_parquet_writer_test