Skip to content

Commit

Permalink
add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Aug 5, 2024
1 parent 9249ac3 commit 4242ad1
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 1 deletion.
23 changes: 23 additions & 0 deletions cpp/src/parquet/arrow/arrow_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,27 @@ INSTANTIATE_TEST_SUITE_P(
/*expected_min=*/"z",
/*expected_max=*/"z"}));

TEST(StatisticsTest, TruncateOnlyHalfMinMax) {
// GH-43382: Tests when we only have min or max, the `HasMinMax` should be false.
std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
auto out_stream = std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
auto schema = ::arrow::schema({::arrow::field("a", ::arrow::utf8())});
::parquet::WriterProperties::Builder properties_builder;
properties_builder.max_statistics_size(2);
ASSERT_OK_AND_ASSIGN(
std::unique_ptr<FileWriter> writer,
FileWriter::Open(*schema, default_memory_pool(), out_stream,
properties_builder.build(), default_arrow_writer_properties()));
auto table = Table::Make(schema, {ArrayFromJSON(::arrow::utf8(), R"(["a", "abc"])")});
ASSERT_OK(writer->WriteTable(*table, std::numeric_limits<int64_t>::max()));
ASSERT_OK(writer->Close());
ASSERT_OK(out_stream->Close());

auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(serialized_data);
auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
std::shared_ptr<Statistics> stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
ASSERT_FALSE(stats->HasMinMax());
}

} // namespace parquet::arrow
2 changes: 1 addition & 1 deletion cpp/src/parquet/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ class PARQUET_EXPORT Statistics {
/// \brief The number of non-null values in the column
virtual int64_t num_values() const = 0;

/// \brief Return true if the min and max statistics are set. Obtain
/// \brief Return true if both min and max statistics are set. Obtain
/// with TypedStatistics<T>::min and max
virtual bool HasMinMax() const = 0;

Expand Down
25 changes: 25 additions & 0 deletions cpp/src/parquet/statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1602,5 +1602,30 @@ TEST(TestEncodedStatistics, CopySafe) {
EXPECT_EQ("abc", encoded_statistics.max());
}

TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
EncodedStatistics encoded_statistics;
encoded_statistics.set_min("a");
encoded_statistics.has_min = true;

encoded_statistics.set_max("abc");
encoded_statistics.has_max = true;

encoded_statistics.ApplyStatSizeLimits(2);

ASSERT_TRUE(encoded_statistics.has_min);
ASSERT_EQ("a", encoded_statistics.min());
ASSERT_FALSE(encoded_statistics.has_max);

NodePtr node =
PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, Type::BYTE_ARRAY);
ColumnDescriptor descr(node, 0, 0);
std::shared_ptr<TypedStatistics<::parquet::ByteArrayType>> statistics =
std::dynamic_pointer_cast<TypedStatistics<::parquet::ByteArrayType>>(
Statistics::Make(&descr, &encoded_statistics,
/*num_values=*/1000));
// GH-43382: HasMinMax should be false if one of min/max is not set.
EXPECT_FALSE(statistics->HasMinMax());
}

} // namespace test
} // namespace parquet

0 comments on commit 4242ad1

Please sign in to comment.