Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-38042: [C++][Benchmark] Add non-stream Codec Compression/Decompression #38067

Merged
merged 5 commits into from
Oct 25, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 63 additions & 4 deletions cpp/src/arrow/util/compression_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace util {
namespace arrow::util {

#ifdef ARROW_WITH_BENCHMARKS_REFERENCE

Expand Down Expand Up @@ -133,6 +132,37 @@ static void ReferenceStreamingCompression(
StreamingCompression(COMPRESSION, data, state);
}

/// \brief Compress `data` in a single (non-streaming) call to `codec`.
///
/// The output vector is first grown to the size reported by
/// Codec::MaxCompressedLen() and then shrunk to the number of bytes actually
/// produced, so that on return `compressed_data->size()` always equals the
/// returned value (including the empty-input case, where both are 0).
///
/// \param[in] codec codec used for MaxCompressedLen() and Compress()
/// \param[in] data uncompressed input bytes
/// \param[out] compressed_data receives the compressed bytes; any previous
///             contents are overwritten
/// \return the number of compressed bytes, or 0 when `data` is empty
int64_t Compress(Codec* codec, const std::vector<uint8_t>& data,
                 std::vector<uint8_t>* compressed_data) {
  const uint8_t* input = data.data();
  const int64_t input_len = static_cast<int64_t>(data.size());
  int64_t compressed_size = 0;
  const int64_t max_compressed_len = codec->MaxCompressedLen(input_len, input);
  compressed_data->resize(max_compressed_len);

  // Empty input is never handed to the codec; it simply yields 0 bytes.
  if (input_len > 0) {
    // NOTE(review): the value returned by Codec::Compress is dereferenced
    // without an explicit ok() check -- presumably this aborts on failure,
    // which is acceptable for a benchmark helper; confirm.
    compressed_size = *codec->Compress(input_len, input, compressed_data->size(),
                                       compressed_data->data());
  }
  // Shrink unconditionally so the vector never keeps the scratch capacity
  // bytes as if they were compressed output.
  compressed_data->resize(compressed_size);
  return compressed_size;
}

/// Benchmark of one-shot (non-streaming) compression for the codec selected
/// by the COMPRESSION template argument.
///
/// Each loop iteration compresses the same 8 MB compressible input into a
/// freshly constructed output vector and records the "ratio" counter as
/// uncompressed size divided by compressed size. Throughput is reported in
/// terms of uncompressed bytes.
template <Compression::type COMPRESSION>
static void ReferenceCompression(benchmark::State& state) {  // NOLINT non-const reference
  constexpr int kInputSize = 8 * 1024 * 1024;  // 8 MB
  auto input = MakeCompressibleData(kInputSize);

  auto compressor = *Codec::Create(COMPRESSION);

  while (state.KeepRunning()) {
    // The output vector is created inside the loop, so every iteration starts
    // from an empty buffer.
    std::vector<uint8_t> output;
    auto output_size = Compress(compressor.get(), input, &output);

    const double raw_bytes = static_cast<double>(input.size());
    const double packed_bytes = static_cast<double>(output_size);
    state.counters["ratio"] = raw_bytes / packed_bytes;
  }

  state.SetBytesProcessed(state.iterations() * input.size());
}

static void StreamingDecompression(
Compression::type compression, const std::vector<uint8_t>& data,
benchmark::State& state) { // NOLINT non-const reference
Expand Down Expand Up @@ -175,27 +205,56 @@ static void ReferenceStreamingDecompression(
StreamingDecompression(COMPRESSION, data, state);
}

/// Benchmark of one-shot (non-streaming) decompression for the codec selected
/// by the COMPRESSION template argument.
///
/// An 8 MB compressible input is compressed once up front; the loop then
/// repeatedly decompresses that buffer and checks that the call succeeded and
/// produced exactly as many bytes as the output buffer holds. Throughput is
/// reported in terms of uncompressed bytes.
template <Compression::type COMPRESSION>
static void ReferenceDecompression(
    benchmark::State& state) {  // NOLINT non-const reference
  constexpr int kInputSize = 8 * 1024 * 1024;  // 8 MB
  auto input = MakeCompressibleData(kInputSize);

  auto decompressor = *Codec::Create(COMPRESSION);

  // Prepare the compressed payload; only the filled vector is needed, not the
  // returned size.
  std::vector<uint8_t> packed;
  ARROW_UNUSED(Compress(decompressor.get(), input, &packed));

  const double raw_bytes = static_cast<double>(input.size());
  const double packed_bytes = static_cast<double>(packed.size());
  state.counters["ratio"] = raw_bytes / packed_bytes;

  // Starting from a copy of the input gives an output buffer with the same
  // size as the original data.
  std::vector<uint8_t> unpacked(input);
  while (state.KeepRunning()) {
    auto outcome = decompressor->Decompress(packed.size(), packed.data(),
                                            unpacked.size(), unpacked.data());
    ARROW_CHECK(outcome.ok());
    ARROW_CHECK(*outcome == static_cast<int64_t>(unpacked.size()));
  }

  state.SetBytesProcessed(state.iterations() * input.size());
}

// Benchmark registrations. For each codec, four benchmarks are instantiated:
// streaming compression, one-shot compression, streaming decompression and
// one-shot decompression. Each group is compiled only when the matching
// ARROW_WITH_* macro is defined.

// GZIP benchmarks (guarded by ARROW_WITH_ZLIB).
#ifdef ARROW_WITH_ZLIB
BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::GZIP);
BENCHMARK_TEMPLATE(ReferenceCompression, Compression::GZIP);
BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::GZIP);
BENCHMARK_TEMPLATE(ReferenceDecompression, Compression::GZIP);
#endif

// BROTLI benchmarks (guarded by ARROW_WITH_BROTLI).
#ifdef ARROW_WITH_BROTLI
BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::BROTLI);
BENCHMARK_TEMPLATE(ReferenceCompression, Compression::BROTLI);
BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::BROTLI);
BENCHMARK_TEMPLATE(ReferenceDecompression, Compression::BROTLI);
#endif

// ZSTD benchmarks (guarded by ARROW_WITH_ZSTD).
#ifdef ARROW_WITH_ZSTD
BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::ZSTD);
BENCHMARK_TEMPLATE(ReferenceCompression, Compression::ZSTD);
BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::ZSTD);
BENCHMARK_TEMPLATE(ReferenceDecompression, Compression::ZSTD);
#endif

// LZ4 benchmarks (guarded by ARROW_WITH_LZ4). Only the LZ4_FRAME variant is
// registered here.
#ifdef ARROW_WITH_LZ4
BENCHMARK_TEMPLATE(ReferenceStreamingCompression, Compression::LZ4_FRAME);
BENCHMARK_TEMPLATE(ReferenceCompression, Compression::LZ4_FRAME);
BENCHMARK_TEMPLATE(ReferenceStreamingDecompression, Compression::LZ4_FRAME);
BENCHMARK_TEMPLATE(ReferenceDecompression, Compression::LZ4_FRAME);
Comment on lines +253 to +255
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is LZ4_FRAME OK?
It seems that Parquet doesn't use LZ4_FRAME.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can even benchmark both LZ4 variants.

Copy link
Member Author

@mapleFU mapleFU Oct 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that Parquet doesn't use LZ4_FRAME

Aha, I remember that parquet-mr implemented LZ4 first, and Arrow implemented a different version (LZ4_FRAME). LZ4 stores an extra length here.

Maybe apache/parquet-format#168 helps

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And I don't think they have too many differences...

Currently I didn't add LZ4, but feel free to add it if necessary.

#endif

#endif

} // namespace util
} // namespace arrow
} // namespace arrow::util
Loading