From 0d19b8471afdc76b0896e858b4bdfe3dd409661b Mon Sep 17 00:00:00 2001
From: Gang Wu
Date: Wed, 10 Apr 2024 15:41:48 +0800
Subject: [PATCH] add basic test

---
Review notes (this section sits after the '---' separator and is not part of
the commit message):
- MergeStatistics and ThriftSerDe now pass rep_levels to WriteRepetitionLevels
  and def_levels to WriteDefinitionLevels. The two arrays were swapped; the
  tests still passed only because both arrays contain each of the levels 0..3
  exactly four times and both max levels are 3.
- kMaxRefLevel is renamed to kMaxRepLevel (it is the max repetition level).
- One summary comment is added above each new test; the last hunk header and
  the diffstat below account for those 5 extra lines.
- NOTE(review): angle-bracketed text (template arguments, the target of the
  first #include, the author e-mail) appears to be missing from this copy of
  the patch. It is left as found -- confirm against the original commit.

 cpp/src/parquet/page_index_test.cc      |   2 +
 cpp/src/parquet/size_statistics_test.cc | 170 ++++++++++++++++++++++--
 2 files changed, 158 insertions(+), 14 deletions(-)

diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc
index 4db49b4267415..d543a00f759b2 100644
--- a/cpp/src/parquet/page_index_test.cc
+++ b/cpp/src/parquet/page_index_test.cc
@@ -852,4 +852,6 @@ TEST_F(PageIndexBuilderTest, TwoRowGroups) {
   CheckOffsetIndex(/*row_group=*/1, /*column=*/1, page_locations[1][1], final_position);
 }
 
+// TODO: add test for size stats
+
 }  // namespace parquet
diff --git a/cpp/src/parquet/size_statistics_test.cc b/cpp/src/parquet/size_statistics_test.cc
index 56bbf7edda816..4bef2451ec80c 100644
--- a/cpp/src/parquet/size_statistics_test.cc
+++ b/cpp/src/parquet/size_statistics_test.cc
@@ -22,13 +22,13 @@
 
 #include
 
-#include "arrow/io/file.h"
-#include "arrow/util/float16.h"
-#include "parquet/file_reader.h"
-#include "parquet/metadata.h"
+#include "arrow/buffer.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/bit_util.h"
 #include "parquet/schema.h"
 #include "parquet/test_util.h"
 #include "parquet/thrift_internal.h"
+#include "parquet/types.h"
 
 namespace parquet {
 
@@ -37,13 +37,9 @@ using namespace parquet::schema;
 TEST(SizeStatistics, WriteBatchLevels) {
   std::vector expected_def_level_histogram = {256, 128, 64, 32, 16, 8, 4, 2, 2};
   std::vector expected_rep_level_histogram = {256, 128, 64, 32, 32};
-
-  const int16_t max_def_level =
-      static_cast(expected_def_level_histogram.size()) - 1;
-  const int16_t max_rep_level =
-      static_cast(expected_rep_level_histogram.size()) - 1;
-  auto descr =
-      std::make_unique(Int32("a"), max_def_level, max_rep_level);
+  constexpr int16_t kMaxDefLevel = 8;
+  constexpr int16_t kMaxRepLevel = 4;
+  auto descr = std::make_unique(Int32("a"), kMaxDefLevel, kMaxRepLevel);
   auto builder = SizeStatisticsBuilder::Make(descr.get());
 
   auto write_batch_levels =
@@ -98,8 +94,154 @@ TEST(SizeStatistics, WriteRepeatedLevels) {
             std::vector({55, 65, 95, 145}));
 }
 
-// TODO: Add tests for write binary variants.
-// TODO: Add tests for merge two size statistics.
-// TODO: Add tests for thrift serialization.
+// Dense BYTE_ARRAY values written in batches must sum to the total payload size.
+TEST(SizeStatistics, WriteDenseByteArrayValues) {
+  constexpr std::string_view kValue = "foo";
+  constexpr int kNumValues = 1000;
+  constexpr int kBatchSize = 64;
+  const std::vector values(kNumValues, kValue);
+
+  auto descr = std::make_unique(
+      schema::ByteArray("a"), /*max_def_level=*/0, /*max_rep_level=*/0);
+  auto builder = SizeStatisticsBuilder::Make(descr.get());
+  for (int i = 0; i < kNumValues; i += kBatchSize) {
+    auto batch_size = std::min(kBatchSize, kNumValues - i);
+    builder->WriteValues(values.data() + i, batch_size);
+  }
+
+  auto size_statistics = builder->Build();
+  EXPECT_EQ(size_statistics->unencoded_byte_array_data_bytes().value_or(-1),
+            kNumValues * kValue.size());
+}
+
+// Spaced writes must count only the slots whose validity bit is set.
+TEST(SizeStatistics, WriteSpacedByteArrayValues) {
+  constexpr std::string_view kValue = "foo";
+  constexpr int kNumValues = 1000;
+  constexpr int kBatchSize = 63;
+  const std::vector values(kNumValues, kValue);
+  ASSERT_OK_AND_ASSIGN(auto not_null_bitmap, ::arrow::AllocateBitmap(kNumValues));
+  int not_null_count = 0;
+  for (int i = 0; i < kNumValues; i++) {
+    if (i % 3 == 0) {
+      ::arrow::bit_util::ClearBit(not_null_bitmap->mutable_data(), i);
+    } else {
+      ::arrow::bit_util::SetBit(not_null_bitmap->mutable_data(), i);
+      not_null_count++;
+    }
+  }
+
+  auto descr = std::make_unique(
+      schema::ByteArray("a"), /*max_def_level=*/1, /*max_rep_level=*/0);
+  auto builder = SizeStatisticsBuilder::Make(descr.get());
+  for (int i = 0; i < kNumValues; i += kBatchSize) {
+    auto batch_size = std::min(kBatchSize, kNumValues - i);
+    builder->WriteValuesSpaced(values.data() + i, not_null_bitmap->data(), i, batch_size);
+  }
+
+  auto size_statistics = builder->Build();
+  EXPECT_EQ(size_statistics->unencoded_byte_array_data_bytes().value_or(-1),
+            not_null_count * kValue.size());
+}
+
+// Both binary and large_binary inputs are expected to report 9 unencoded bytes.
+TEST(SizeStatistics, WriteBinaryArray) {
+  std::vector> arrays = {
+      ::arrow::ArrayFromJSON(::arrow::binary(), R"(["foo", null, "bar", "baz"])"),
+      ::arrow::ArrayFromJSON(::arrow::large_binary(), R"(["foo", null, "bar", "baz"])"),
+  };
+  for (const auto& array : arrays) {
+    auto descr = std::make_unique(
+        schema::ByteArray("a"), /*max_def_level=*/1, /*max_rep_level=*/0);
+    auto builder = SizeStatisticsBuilder::Make(descr.get());
+    builder->WriteValues(*array);
+    auto size_statistics = builder->Build();
+    EXPECT_EQ(size_statistics->unencoded_byte_array_data_bytes().value_or(-1), 9);
+  }
+}
+
+// Merging two identical statistics must double every histogram bucket and byte count.
+TEST(SizeStatistics, MergeStatistics) {
+  constexpr int kNumValues = 16;
+  const std::array def_levels = {0, 0, 0, 0, 1, 1, 1, 1,
+                                 2, 2, 2, 2, 3, 3, 3, 3};
+  const std::array rep_levels = {0, 1, 2, 3, 0, 1, 2, 3,
+                                 0, 1, 2, 3, 0, 1, 2, 3};
+  const std::vector expected_histogram = {8, 8, 8, 8};
+  constexpr std::string_view kByteArrayValue = "foo";
+  const std::vector values(kNumValues,
+                           parquet::ByteArray{kByteArrayValue});
+
+  for (const auto& descr :
+       {std::make_unique(schema::Int32("a"), /*max_def_level=*/3,
+                         /*max_rep_level=*/3),
+        std::make_unique(schema::ByteArray("a"), /*max_def_level=*/3,
+                         /*max_rep_level=*/3)}) {
+    auto builder = SizeStatisticsBuilder::Make(descr.get());
+    builder->WriteRepetitionLevels(kNumValues, rep_levels.data());
+    builder->WriteDefinitionLevels(kNumValues, def_levels.data());
+    if (descr->physical_type() == Type::BYTE_ARRAY) {
+      builder->WriteValues(values.data(), kNumValues);
+    }
+    auto size_statistics_1 = builder->Build();
+
+    builder->Reset();
+    builder->WriteRepetitionLevels(kNumValues, rep_levels.data());
+    builder->WriteDefinitionLevels(kNumValues, def_levels.data());
+    if (descr->physical_type() == Type::BYTE_ARRAY) {
+      builder->WriteValues(values.data(), kNumValues);
+    }
+    auto size_statistics_2 = builder->Build();
+
+    size_statistics_1->Merge(*size_statistics_2);
+    EXPECT_EQ(size_statistics_1->definition_level_histogram(), expected_histogram);
+    EXPECT_EQ(size_statistics_1->repetition_level_histogram(), expected_histogram);
+    if (descr->physical_type() == Type::BYTE_ARRAY) {
+      EXPECT_TRUE(size_statistics_1->unencoded_byte_array_data_bytes().has_value());
+      EXPECT_EQ(size_statistics_1->unencoded_byte_array_data_bytes().value(),
+                kByteArrayValue.size() * kNumValues * 2);
+    } else {
+      EXPECT_FALSE(size_statistics_1->unencoded_byte_array_data_bytes().has_value());
+    }
+  }
+}
+
+// A Thrift round trip must preserve both histograms and the byte-array size.
+TEST(SizeStatistics, ThriftSerDe) {
+  constexpr int kNumValues = 16;
+  const std::array def_levels = {0, 0, 0, 0, 1, 1, 1, 1,
+                                 2, 2, 2, 2, 3, 3, 3, 3};
+  const std::array rep_levels = {0, 1, 2, 3, 0, 1, 2, 3,
+                                 0, 1, 2, 3, 0, 1, 2, 3};
+  const std::vector expected_histogram = {4, 4, 4, 4};
+  constexpr std::string_view kByteArrayValue = "foo";
+  const std::vector values(kNumValues,
+                           parquet::ByteArray{kByteArrayValue});
+
+  for (const auto& descr :
+       {std::make_unique(schema::Int32("a"), /*max_def_level=*/3,
+                         /*max_rep_level=*/3),
+        std::make_unique(schema::ByteArray("a"), /*max_def_level=*/3,
+                         /*max_rep_level=*/3)}) {
+    auto builder = SizeStatisticsBuilder::Make(descr.get());
+    builder->WriteRepetitionLevels(kNumValues, rep_levels.data());
+    builder->WriteDefinitionLevels(kNumValues, def_levels.data());
+    if (descr->physical_type() == Type::BYTE_ARRAY) {
+      builder->WriteValues(values.data(), kNumValues);
+    }
+    auto size_statistics = builder->Build();
+    auto thrift_statistics = ToThrift(*size_statistics);
+    auto restored_statistics = SizeStatistics::Make(&thrift_statistics, descr.get());
+    EXPECT_EQ(restored_statistics->definition_level_histogram(), expected_histogram);
+    EXPECT_EQ(restored_statistics->repetition_level_histogram(), expected_histogram);
+    if (descr->physical_type() == Type::BYTE_ARRAY) {
+      EXPECT_TRUE(restored_statistics->unencoded_byte_array_data_bytes().has_value());
+      EXPECT_EQ(restored_statistics->unencoded_byte_array_data_bytes().value(),
+                kByteArrayValue.size() * kNumValues);
+    } else {
+      EXPECT_FALSE(restored_statistics->unencoded_byte_array_data_bytes().has_value());
+    }
+  }
+}
 
 }  // namespace parquet