From d284bf43acdbb4b6e364109b6720c8c0c3327186 Mon Sep 17 00:00:00 2001 From: Orri Erling Date: Sat, 21 Dec 2024 12:47:44 -0800 Subject: [PATCH] Add optional HLL for distinct value count to StatisticsBuilder Adds an optional HLL counter for distinct values to StatisticsBuilder. This is used in Verax sampling to estimate column cardinalities for scalar types. --- velox/dwio/common/Statistics.h | 14 +++++- .../dwrf/test/TestStatisticsBuilderUtils.cpp | 16 ++++++- velox/dwio/dwrf/writer/CMakeLists.txt | 1 + velox/dwio/dwrf/writer/StatisticsBuilder.cpp | 15 ++++++- velox/dwio/dwrf/writer/StatisticsBuilder.h | 45 ++++++++++++++++--- 5 files changed, 79 insertions(+), 12 deletions(-) diff --git a/velox/dwio/common/Statistics.h b/velox/dwio/common/Statistics.h index 699066c1df4dc..433ff8a7de1de 100644 --- a/velox/dwio/common/Statistics.h +++ b/velox/dwio/common/Statistics.h @@ -81,11 +81,13 @@ class ColumnStatistics { std::optional valueCount, std::optional hasNull, std::optional rawSize, - std::optional size) + std::optional size, + std::optional numDistinct = std::nullopt) : valueCount_(valueCount), hasNull_(hasNull), rawSize_(rawSize), - size_(size) {} + size_(size), + numDistinct_(numDistinct) {} virtual ~ColumnStatistics() = default; @@ -123,6 +125,13 @@ class ColumnStatistics { return size_; } + std::optional numDistinct() const { + return numDistinct_; + } + void setNumDistinct(int64_t count) { + numDistinct_ = count; + } + /** * return string representation of this stats object */ @@ -145,6 +154,7 @@ class ColumnStatistics { std::optional hasNull_; std::optional rawSize_; std::optional size_; + std::optional numDistinct_; }; /** diff --git a/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp b/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp index 6f4afacbd297e..d13b09917b23b 100644 --- a/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp +++ b/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,8 +27,6 @@ using namespace facebook::velox::dwio::common; using namespace facebook::velox; using namespace facebook::velox::dwrf; -StatisticsBuilderOptions options{16}; - template std::shared_ptr> makeFlatVector( facebook::velox::memory::MemoryPool* pool, @@ -58,8 +57,15 @@ class TestStatisticsBuilderUtils : public testing::Test { memory::MemoryManager::testingSetInstance({}); } + void SetUp() override { + StatisticsBuilderOptions options{16}; + } + const std::shared_ptr pool_ = memory::memoryManager()->addLeafPool(); + std::unique_ptr allocator_ = + std::make_unique(pool_.get()); + StatisticsBuilderOptions options{16, 100, true, allocator_.get()}; }; TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { @@ -85,6 +91,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { EXPECT_EQ(10, intStats->getMaximum().value()); EXPECT_EQ(1, intStats->getMinimum().value()); EXPECT_EQ(55, intStats->getSum()); + EXPECT_EQ(10, intStats->numDistinct()); } // add values with null @@ -103,6 +110,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { EXPECT_EQ(10, intStats->getMaximum().value()); EXPECT_EQ(1, intStats->getMinimum().value()); EXPECT_EQ(106, intStats->getSum().value()); + EXPECT_EQ(10, intStats->numDistinct()); } } @@ -129,6 +137,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) { EXPECT_EQ(10, doubleStats->getMaximum().value()); EXPECT_EQ(1, doubleStats->getMinimum().value()); EXPECT_EQ(55, doubleStats->getSum()); + EXPECT_EQ(10, doubleStats->numDistinct().value()); } // add values with null @@ -147,6 +156,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) { EXPECT_EQ(10, doubleStats->getMaximum().value()); EXPECT_EQ(1, doubleStats->getMinimum().value()); EXPECT_EQ(106, doubleStats->getSum()); + EXPECT_EQ(10, doubleStats->numDistinct().value()); } } @@ -174,6 +184,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) { EXPECT_EQ("j", strStats->getMaximum().value()); EXPECT_EQ("a", strStats->getMinimum().value()); EXPECT_EQ(10, strStats->getTotalLength()); + EXPECT_EQ(10, strStats->numDistinct()); } // add values with null @@ -191,6 +202,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) { EXPECT_EQ("j", strStats->getMaximum().value()); EXPECT_EQ("a", strStats->getMinimum().value()); EXPECT_EQ(19, strStats->getTotalLength().value()); + EXPECT_EQ(10, strStats->numDistinct()); } } diff --git a/velox/dwio/dwrf/writer/CMakeLists.txt b/velox/dwio/dwrf/writer/CMakeLists.txt index 72726d3e79cde..3150c30e55b8f 100644 --- a/velox/dwio/dwrf/writer/CMakeLists.txt +++ b/velox/dwio/dwrf/writer/CMakeLists.txt @@ -28,6 +28,7 @@ velox_add_library( velox_link_libraries( velox_dwio_dwrf_writer + velox_common_hyperloglog velox_dwio_common velox_dwio_dwrf_common velox_dwio_dwrf_utils diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp index 3940e29f17b11..5a2178247a8d5 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp @@ -93,6 +93,13 @@ void StatisticsBuilder::merge( // Merge size mergeCount(size_, other.getSize()); } + if (hll_) { + if (auto* otherBuilder = dynamic_cast(&other)) { + if (otherBuilder->hll_) { + hll_->mergeWith(*otherBuilder->hll_); + } + } + } } void StatisticsBuilder::toProto(proto::ColumnStatistics& stats) const { @@ -115,8 +122,12 @@ std::unique_ptr StatisticsBuilder::build() proto::ColumnStatistics stats; toProto(stats); StatsContext context{WriterVersion_CURRENT}; - return buildColumnStatisticsFromProto( - ColumnStatisticsWrapper(&stats), context); + auto result = + buildColumnStatisticsFromProto(ColumnStatisticsWrapper(&stats), context); + if (hll_) { + result->setNumDistinct(hll_->cardinality()); + } + return result; } std::unique_ptr StatisticsBuilder::create( diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.h b/velox/dwio/dwrf/writer/StatisticsBuilder.h index 35832145a302c..e734d784293b6 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.h +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include "velox/dwio/dwrf/common/Config.h" #include "velox/dwio/dwrf/common/Statistics.h" #include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h" @@ -76,11 +77,18 @@ inline dwio::common::KeyInfo constructKey(const dwrf::proto::KeyInfo& keyInfo) { struct StatisticsBuilderOptions { explicit StatisticsBuilderOptions( uint32_t stringLengthLimit, - std::optional initialSize = std::nullopt) - : stringLengthLimit{stringLengthLimit}, initialSize{initialSize} {} + std::optional initialSize = std::nullopt, + bool countDistincts = false, + HashStringAllocator* allocator = nullptr) + : stringLengthLimit{stringLengthLimit}, + initialSize{initialSize}, + countDistincts(countDistincts), + allocator(allocator) {} uint32_t stringLengthLimit; std::optional initialSize; + bool countDistincts{false}; + HashStringAllocator* allocator; static StatisticsBuilderOptions fromConfig(const Config& config) { return StatisticsBuilderOptions{config.get(Config::STRING_STATS_LIMIT)}; @@ -90,10 +98,16 @@ struct StatisticsBuilderOptions { /* * Base class for stats builder. Stats builder is used in writer and file merge * to collect and merge stats. + * It can also be used for gathering stats in ad hoc sampling. In this case it + * may also count distinct values if enabled in 'options'. */ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { public: - explicit StatisticsBuilder(const StatisticsBuilderOptions& options) + /// Constructs with 'options'. If 'options' enable count distinct and + /// 'disableNumDistinct' is true, distinct values will not be counted. + explicit StatisticsBuilder( + const StatisticsBuilderOptions& options, + bool disableNumDistinct = false) : options_{options} { init(); } @@ -132,6 +146,18 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { } } + template + void addHash(const T& data) { + if (hll_) { + hll_->insertHash(folly::hasher()(data)); + } + } + + int64_t cardinality() { + VELOX_CHECK(hll_); + return hll_->cardinality(); + } + /* * Merge stats of same type. This is used in writer to aggregate file level * stats. @@ -170,17 +196,21 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { hasNull_ = false; rawSize_ = 0; size_ = options_.initialSize; + if (options_.countDistincts) { + hll_ = std::make_shared(options_.allocator); + } } protected: StatisticsBuilderOptions options_; + std::shared_ptr hll_; }; class BooleanStatisticsBuilder : public StatisticsBuilder, public dwio::common::BooleanColumnStatistics { public: explicit BooleanStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options} { + : StatisticsBuilder{options, true} { init(); } @@ -229,6 +259,7 @@ class IntegerStatisticsBuilder : public StatisticsBuilder, max_ = value; } addWithOverflowCheck(sum_, value, count); + addHash(value); } void merge( @@ -278,6 +309,7 @@ class DoubleStatisticsBuilder : public StatisticsBuilder, if (max_.has_value() && value > max_.value()) { max_ = value; } + addHash(value); // value * count sometimes is not same as adding values (count) times. So // add in a loop if (sum_.has_value()) { @@ -342,6 +374,7 @@ class StringStatisticsBuilder : public StatisticsBuilder, max_ = value; } } + addHash(value); addWithOverflowCheck(length_, value.size(), count); } @@ -375,7 +408,7 @@ class BinaryStatisticsBuilder : public StatisticsBuilder, public dwio::common::BinaryColumnStatistics { public: explicit BinaryStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options} { + : StatisticsBuilder{options, true} { init(); } @@ -409,7 +442,7 @@ class MapStatisticsBuilder : public StatisticsBuilder, MapStatisticsBuilder( const Type& type, const StatisticsBuilderOptions& options) - : StatisticsBuilder{options}, + : StatisticsBuilder{options, true}, valueType_{type.as().valueType()} { init(); }