From 4467d41256937dbb527d57c46179c61b0568241d Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 16 Dec 2024 11:07:33 -0500 Subject: [PATCH] GH-45015: [C++][Parquet] Allow configuring the default footer read size (#45016) ### Rationale for this change Reading the footer for a parquet file whose file metadata is >64KB can require multiple round trips to a high latency file system like S3. Allowing this default read size to be configurable allows for reducing the round trips if it's known up front that the file might potentially have a large amount of metadata. ### What changes are included in this PR? A `footer_read_size_` property is added to parquet `ReaderProperties` along with a getter/setter. This is then utilized in the file reader's `GetFooterReadSize` method. * GitHub Issue: #45015 Lead-authored-by: Matt Topol Co-authored-by: mwish Signed-off-by: Matt Topol --- cpp/src/parquet/file_reader.cc | 5 ++--- cpp/src/parquet/properties.h | 10 ++++++++++ cpp/src/parquet/properties_test.cc | 1 + 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 3cc42ae370217..1c9b2323de500 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -83,8 +83,6 @@ bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) { } } // namespace -// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file -static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; // For PARQUET-816 @@ -482,7 +480,8 @@ class SerializedFile : public ParquetFileReader::Contents { "Parquet file size is ", source_size_, " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)"); } - return std::min(source_size_, kDefaultFooterReadSize); + + return std::min(static_cast<size_t>(source_size_), properties_.footer_read_size()); } // Validate the magic bytes and get the length of the full footer. 
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 7f2e371df66d7..a8e4430a03d82 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -56,6 +56,9 @@ constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000; // kDefaultStringSizeLimit. constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000; +// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file +constexpr int64_t kDefaultFooterReadSize = 64 * 1024; + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool()) @@ -120,6 +123,12 @@ class PARQUET_EXPORT ReaderProperties { page_checksum_verification_ = check_crc; } + // Set the default read size to read the footer from a file. For high latency + // file systems and files with large metadata (>64KB) this can increase performance + // by reducing the number of round-trips to retrieve the entire file metadata. + void set_footer_read_size(size_t size) { footer_read_size_ = size; } + size_t footer_read_size() const { return footer_read_size_; } + private: MemoryPool* pool_; int64_t buffer_size_ = kDefaultBufferSize; @@ -129,6 +138,7 @@ class PARQUET_EXPORT ReaderProperties { bool page_checksum_verification_ = false; // Used with a RecordReader. bool read_dense_for_nullable_ = false; + size_t footer_read_size_ = kDefaultFooterReadSize; std::shared_ptr<FileDecryptionProperties> file_decryption_properties_; }; diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc index b2c574413abf7..35fc11565914e 100644 --- a/cpp/src/parquet/properties_test.cc +++ b/cpp/src/parquet/properties_test.cc @@ -35,6 +35,7 @@ TEST(TestReaderProperties, Basics) { ReaderProperties props; ASSERT_EQ(props.buffer_size(), kDefaultBufferSize); + ASSERT_EQ(props.footer_read_size(), kDefaultFooterReadSize); ASSERT_FALSE(props.is_buffered_stream_enabled()); ASSERT_FALSE(props.page_checksum_verification()); }