From eb2dc6a24d4617643a1f0b365fc718b456870333 Mon Sep 17 00:00:00 2001 From: yan ma Date: Wed, 31 Jul 2024 03:27:49 +0800 Subject: [PATCH] remove filterRowGroups from MapColumnReader/ListColumnReader --- velox/dwio/common/ScanSpec.cpp | 8 ++++++++ .../dwio/parquet/reader/RepeatedColumnReader.cpp | 15 --------------- velox/dwio/parquet/reader/RepeatedColumnReader.h | 10 ---------- 3 files changed, 8 insertions(+), 25 deletions(-) diff --git a/velox/dwio/common/ScanSpec.cpp b/velox/dwio/common/ScanSpec.cpp index 100e9b79b0c69..005c6eaa47ae9 100644 --- a/velox/dwio/common/ScanSpec.cpp +++ b/velox/dwio/common/ScanSpec.cpp @@ -277,7 +277,15 @@ bool testFilter( const TypePtr& type) { bool mayHaveNull = true; + // The has-null statistic is often not set. Hence, we supplement it with + // the number-of-values statistic to detect no-null columns more often. + // Number-of-values is the number of non-null values. When it is equal to + // the total number of rows, we know there are no nulls. if (stats->getNumberOfValues().has_value()) { + if (stats->getNumberOfValues().value() == 0) { + // Column is all null. 
+ return filter->testNull(); + } mayHaveNull = stats->getNumberOfValues().value() < totalRows; } diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp index 7b33154da7f5c..c86af99eb9081 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp @@ -211,14 +211,6 @@ void MapColumnReader::read( elementReader_->seekTo(childTargetReadOffset_, false); } -void MapColumnReader::filterRowGroups( - uint64_t rowGroupSize, - const dwio::common::StatsContext& context, - dwio::common::FormatData::FilterRowGroupsResult& result) const { - keyReader_->filterRowGroups(rowGroupSize, context, result); - elementReader_->filterRowGroups(rowGroupSize, context, result); -} - ListColumnReader::ListColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, @@ -314,11 +306,4 @@ void ListColumnReader::read( child_->seekTo(childTargetReadOffset_, false); } -void ListColumnReader::filterRowGroups( - uint64_t rowGroupSize, - const dwio::common::StatsContext& context, - dwio::common::FormatData::FilterRowGroupsResult& result) const { - child_->filterRowGroups(rowGroupSize, context, result); -} - } // namespace facebook::velox::parquet diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.h b/velox/dwio/parquet/reader/RepeatedColumnReader.h index 317b374b79eef..eaa4baa7d8651 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.h +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.h @@ -97,11 +97,6 @@ class MapColumnReader : public dwio::common::SelectiveMapColumnReader { /// supplied before receiving new lengths. 
void skipUnreadLengths(); - void filterRowGroups( - uint64_t rowGroupSize, - const dwio::common::StatsContext&, - dwio::common::FormatData::FilterRowGroupsResult&) const override; - private: RepeatedLengths lengths_; RepeatedLengths keyLengths_; @@ -153,11 +148,6 @@ class ListColumnReader : public dwio::common::SelectiveListColumnReader { /// supplied before receiving new lengths. void skipUnreadLengths(); - void filterRowGroups( - uint64_t rowGroupSize, - const dwio::common::StatsContext&, - dwio::common::FormatData::FilterRowGroupsResult&) const override; - private: RepeatedLengths lengths_; arrow::LevelInfo levelInfo_;