From f3d46398d3c81d9575ffd77ce3b86d4b993a4888 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 22 May 2024 15:06:04 +0200 Subject: [PATCH] GH-41760: [C++][Parquet] Add file metadata read/write benchmark (#41761) Following the discussions on the Parquet ML (see [this thread](https://lists.apache.org/thread/5jyhzkwyrjk9z52g0b49g31ygnz73gxo) and [this thread](https://lists.apache.org/thread/vs3w2z5bk6s3c975rrkqdttr1dpsdn7h)), and the various complaints about poor Parquet metadata performance on wide schemas, this adds a benchmark to measure the overhead of Parquet file metadata parsing or serialization for different numbers of row groups and columns. Sample output: ``` ----------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ----------------------------------------------------------------------------------------------------------------------- WriteFileMetadataAndData/num_columns:1/num_row_groups:1 11743 ns 11741 ns 59930 data_size=54 file_size=290 items_per_second=85.1726k/s WriteFileMetadataAndData/num_columns:1/num_row_groups:100 843137 ns 842920 ns 832 data_size=5.4k file_size=20.486k items_per_second=1.18635k/s WriteFileMetadataAndData/num_columns:1/num_row_groups:1000 8232304 ns 8230294 ns 85 data_size=54k file_size=207.687k items_per_second=121.502/s WriteFileMetadataAndData/num_columns:10/num_row_groups:1 101214 ns 101190 ns 6910 data_size=540 file_size=2.11k items_per_second=9.8824k/s WriteFileMetadataAndData/num_columns:10/num_row_groups:100 8026185 ns 8024361 ns 87 data_size=54k file_size=193.673k items_per_second=124.621/s WriteFileMetadataAndData/num_columns:10/num_row_groups:1000 81370293 ns 81343455 ns 8 data_size=540k file_size=1.94392M items_per_second=12.2936/s WriteFileMetadataAndData/num_columns:100/num_row_groups:1 955862 ns 955528 ns 733 data_size=5.4k file_size=20.694k items_per_second=1.04654k/s 
WriteFileMetadataAndData/num_columns:100/num_row_groups:100 80115516 ns 80086117 ns 9 data_size=540k file_size=1.94729M items_per_second=12.4866/s WriteFileMetadataAndData/num_columns:100/num_row_groups:1000 856428565 ns 856065370 ns 1 data_size=5.4M file_size=19.7673M items_per_second=1.16814/s WriteFileMetadataAndData/num_columns:1000/num_row_groups:1 9330003 ns 9327439 ns 75 data_size=54k file_size=211.499k items_per_second=107.211/s WriteFileMetadataAndData/num_columns:1000/num_row_groups:100 834609159 ns 834354590 ns 1 data_size=5.4M file_size=19.9623M items_per_second=1.19853/s ReadFileMetadata/num_columns:1/num_row_groups:1 3824 ns 3824 ns 182381 data_size=54 file_size=290 items_per_second=261.518k/s ReadFileMetadata/num_columns:1/num_row_groups:100 88519 ns 88504 ns 7879 data_size=5.4k file_size=20.486k items_per_second=11.299k/s ReadFileMetadata/num_columns:1/num_row_groups:1000 849558 ns 849391 ns 825 data_size=54k file_size=207.687k items_per_second=1.17731k/s ReadFileMetadata/num_columns:10/num_row_groups:1 19918 ns 19915 ns 35449 data_size=540 file_size=2.11k items_per_second=50.2138k/s ReadFileMetadata/num_columns:10/num_row_groups:100 715822 ns 715667 ns 975 data_size=54k file_size=193.673k items_per_second=1.3973k/s ReadFileMetadata/num_columns:10/num_row_groups:1000 7017008 ns 7015432 ns 100 data_size=540k file_size=1.94392M items_per_second=142.543/s ReadFileMetadata/num_columns:100/num_row_groups:1 175988 ns 175944 ns 3958 data_size=5.4k file_size=20.694k items_per_second=5.68363k/s ReadFileMetadata/num_columns:100/num_row_groups:100 6814382 ns 6812781 ns 103 data_size=540k file_size=1.94729M items_per_second=146.783/s ReadFileMetadata/num_columns:100/num_row_groups:1000 77858645 ns 77822157 ns 9 data_size=5.4M file_size=19.7673M items_per_second=12.8498/s ReadFileMetadata/num_columns:1000/num_row_groups:1 1670001 ns 1669563 ns 419 data_size=54k file_size=211.499k items_per_second=598.959/s ReadFileMetadata/num_columns:1000/num_row_groups:100 
77339599 ns 77292924 ns 9 data_size=5.4M file_size=19.9623M items_per_second=12.9378/s ``` * GitHub Issue: #41760 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/metadata_benchmark.cc | 156 ++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 cpp/src/parquet/metadata_benchmark.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 93f2e72d8d661..5ac5085a694c8 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -432,6 +432,7 @@ add_parquet_benchmark(column_reader_benchmark) add_parquet_benchmark(column_io_benchmark) add_parquet_benchmark(encoding_benchmark) add_parquet_benchmark(level_conversion_benchmark) +add_parquet_benchmark(metadata_benchmark) add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc benchmark_util.cc) add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow") diff --git a/cpp/src/parquet/metadata_benchmark.cc b/cpp/src/parquet/metadata_benchmark.cc new file mode 100644 index 0000000000000..97a99be798cbb --- /dev/null +++ b/cpp/src/parquet/metadata_benchmark.cc @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/buffer.h" +#include "arrow/io/memory.h" +#include "arrow/util/logging.h" + +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/metadata.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +namespace parquet { + +using ::arrow::Buffer; +using ::arrow::io::BufferOutputStream; +using ::arrow::io::BufferReader; +using schema::GroupNode; +using schema::NodePtr; +using schema::NodeVector; +/* Benchmark fixture: holds a flat schema of num_columns required INT32 columns plus the writer properties, and can write or read back an in-memory Parquet file with num_row_groups row groups. The State-taking constructor reads num_columns from range(0) and num_row_groups from range(1). */ +class MetadataBenchmark { + public: + explicit MetadataBenchmark(benchmark::State* state) + : MetadataBenchmark(static_cast(state->range(0)), + static_cast(state->range(1))) {} + /* Builds the schema (fields named "col0", "col1", ... under a root group named "schema") and the writer properties: format version PARQUET_2_6, dictionary disabled, V2 data pages. */ + MetadataBenchmark(int num_columns, int num_row_groups) + : num_columns_(num_columns), num_row_groups_(num_row_groups) { + NodeVector fields; + for (int i = 0; i < num_columns_; ++i) { + std::stringstream ss; + ss << "col" << i; + fields.push_back(parquet::schema::Int32(ss.str(), Repetition::REQUIRED)); + } + schema_root_ = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + WriterProperties::Builder prop_builder; + writer_properties_ = prop_builder.version(ParquetVersion::PARQUET_2_6) + ->disable_dictionary() + ->data_page_version(ParquetDataPageVersion::V2) + ->build(); + } + /* Writes a complete file into an in-memory buffer and returns that buffer. Every column chunk receives a single value (42). Sets the "file_size" counter on `state` to the size of the finished buffer and the "data_size" counter to the sum of total_compressed_bytes_written() over all row groups. */ + std::shared_ptr WriteFile(benchmark::State* state) { + PARQUET_ASSIGN_OR_THROW(auto sink, BufferOutputStream::Create()); + + auto writer = ParquetFileWriter::Open(sink, schema_root_, writer_properties_); + std::vector int32_values(1, 42); + int64_t data_size = 0; + for (int rg = 0; rg < num_row_groups_; ++rg) { + auto row_group_writer = writer->AppendRowGroup(); + for (int col = 0; col < num_columns_; ++col) { + auto col_writer = row_group_writer->NextColumn(); + ARROW_CHECK_EQ(col_writer->type(), Type::INT32); + auto typed_col_writer = 
static_cast(col_writer); + typed_col_writer->WriteBatch( + /*num_values=*/static_cast(int32_values.size()), + /*def_levels=*/nullptr, /*rep_levels=*/nullptr, int32_values.data()); + typed_col_writer->Close(); + } + row_group_writer->Close(); + data_size += row_group_writer->total_compressed_bytes_written(); + } + writer->Close(); + PARQUET_ASSIGN_OR_THROW(auto buf, sink->Finish()); + state->counters["file_size"] = static_cast(buf->size()); + // Note that "data_size" includes the Thrift page headers + state->counters["data_size"] = static_cast(data_size); + return buf; + } + /* Opens `contents` as a Parquet file with default reader properties, checks that the metadata reports the expected number of columns, row groups and rows, then closes the reader. */ + void ReadFile(std::shared_ptr contents) { + auto source = std::make_shared(contents); + ReaderProperties props; + auto reader = ParquetFileReader::Open(source, props); + auto metadata = reader->metadata(); + ARROW_CHECK_EQ(metadata->num_columns(), num_columns_); + ARROW_CHECK_EQ(metadata->num_row_groups(), num_row_groups_); + // There should be one row per row group + ARROW_CHECK_EQ(metadata->num_rows(), num_row_groups_); + reader->Close(); + } + + private: + int num_columns_; + int num_row_groups_; + std::shared_ptr schema_root_; + std::shared_ptr writer_properties_; +}; +/* Registers the benchmark arguments: every combination of num_columns in {1, 10, 100} with num_row_groups in {1, 100, 1000}, followed by num_columns = 1000 with num_row_groups in {1, 100}. */ +void WriteMetadataSetArgs(benchmark::internal::Benchmark* bench) { + bench->ArgNames({"num_columns", "num_row_groups"}); + + for (int num_columns : {1, 10, 100}) { + for (int num_row_groups : {1, 100, 1000}) { + bench->Args({num_columns, num_row_groups}); + } + } + /* For larger num_columns, restrict num_row_groups to small values + * to avoid blowing up benchmark execution time. 
+ */ + for (int num_row_groups : {1, 100}) { + bench->Args({/*num_columns=*/1000, num_row_groups}); + } +} +/* The read benchmark is registered with the same arguments as the write benchmark. */ +void ReadMetadataSetArgs(benchmark::internal::Benchmark* bench) { + WriteMetadataSetArgs(bench); +} +/* Each timed iteration writes one complete file (data and metadata) via WriteFile; one item is counted per iteration. */ +void WriteFileMetadataAndData(benchmark::State& state) { + MetadataBenchmark benchmark(&state); + + for (auto _ : state) { + auto sink = benchmark.WriteFile(&state); + } + state.SetItemsProcessed(state.iterations()); +} +/* The file is written once, before the timed loop; each timed iteration then calls ReadFile on the resulting buffer. One item is counted per iteration. */ +void ReadFileMetadata(benchmark::State& state) { + MetadataBenchmark benchmark(&state); + auto contents = benchmark.WriteFile(&state); + + for (auto _ : state) { + benchmark.ReadFile(contents); + } + state.SetItemsProcessed(state.iterations()); +} +/* Register both benchmarks, each with its argument-setup function. */ +BENCHMARK(WriteFileMetadataAndData)->Apply(WriteMetadataSetArgs); +BENCHMARK(ReadFileMetadata)->Apply(ReadMetadataSetArgs); + +} // namespace parquet