From bf7b601de47f9d1da663d56c71e06f6d6f3b3130 Mon Sep 17 00:00:00 2001 From: Binwei Yang Date: Thu, 30 Nov 2023 06:19:29 +0000 Subject: [PATCH] Add processedStrides and processedSplits metrics #6123 --- velox/connectors/hive/SplitReader.cpp | 1 + velox/dwio/common/Statistics.h | 8 ++++++++ velox/dwio/dwrf/reader/DwrfReader.cpp | 1 + velox/dwio/dwrf/reader/DwrfReader.h | 5 ++++- velox/dwio/parquet/reader/ParquetReader.cpp | 1 + velox/exec/tests/PrintPlanWithStatsTest.cpp | 4 ++++ 6 files changed, 19 insertions(+), 1 deletion(-) diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 3c470de2db27..b9ca99f95f43 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -302,6 +302,7 @@ bool SplitReader::checkIfSplitIsEmpty( } } + ++runtimeStats.processedSplits; return emptySplit_; } diff --git a/velox/dwio/common/Statistics.h b/velox/dwio/common/Statistics.h index db5a91b06413..19f0923e8a45 100644 --- a/velox/dwio/common/Statistics.h +++ b/velox/dwio/common/Statistics.h @@ -529,20 +529,28 @@ struct RuntimeStatistics { // Number of splits skipped based on statistics. int64_t skippedSplits{0}; + // Number of splits processed based on statistics. + int64_t processedSplits{0}; + // Total bytes in splits skipped based on statistics. int64_t skippedSplitBytes{0}; // Number of strides (row groups) skipped based on statistics. int64_t skippedStrides{0}; + // Number of strides (row groups) processed based on statistics. + int64_t processedStrides{0}; + ColumnReaderStatistics columnReaderStatistics; std::unordered_map toMap() { return { {"skippedSplits", RuntimeCounter(skippedSplits)}, + {"processedSplits", RuntimeCounter(processedSplits)}, {"skippedSplitBytes", RuntimeCounter(skippedSplitBytes, RuntimeCounter::Unit::kBytes)}, {"skippedStrides", RuntimeCounter(skippedStrides)}, + {"processedStrides", RuntimeCounter(processedStrides)}, {"flattenStringDictionaryValues", RuntimeCounter(columnReaderStatistics.flattenStringDictionaryValues)}}; } diff --git a/velox/dwio/dwrf/reader/DwrfReader.cpp b/velox/dwio/dwrf/reader/DwrfReader.cpp index c9ea3e762bda..2c01c17b9999 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.cpp +++ b/velox/dwio/dwrf/reader/DwrfReader.cpp @@ -505,6 +505,7 @@ void DwrfRowReader::checkSkipStrides(uint64_t strideSize) { currentStride++; skippedStrides_++; } + processedStrides_++; if (foundStridesToSkip && currentRowInStripe_ < rowsInCurrentStripe_) { getSelectiveColumnReader()->seekToRowGroup(currentStride); } diff --git a/velox/dwio/dwrf/reader/DwrfReader.h b/velox/dwio/dwrf/reader/DwrfReader.h index 503af8069226..bb8a2e923abb 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.h +++ b/velox/dwio/dwrf/reader/DwrfReader.h @@ -110,6 +110,7 @@ class DwrfRowReader : public StrideIndexProvider, void updateRuntimeStats( dwio::common::RuntimeStatistics& stats) const override { stats.skippedStrides += skippedStrides_; + stats.processedStrides += processedStrides_; stats.columnReaderStatistics.flattenStringDictionaryValues += columnReaderStatistics_.flattenStringDictionaryValues; } @@ -203,7 +204,9 @@ class DwrfRowReader : public StrideIndexProvider, std::unordered_map> stripeStridesToSkip_; // Number of skipped strides. int64_t skippedStrides_{0}; - + // Number of processed strides. + int64_t processedStrides_{0}; + // Set to true after clearing filter caches, i.e. adding a dynamic // filter. Causes filters to be re-evaluated against stride stats on // next stride instead of next stripe. diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index a93d7b1aa307..bc073b9bf725 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -910,6 +910,7 @@ class ParquetRowReader::Impl { void updateRuntimeStats(dwio::common::RuntimeStatistics& stats) const { stats.skippedStrides += rowGroups_.size() - rowGroupIds_.size(); + stats.processedStrides += rowGroupIds_.size(); } void resetFilterCaches() { diff --git a/velox/exec/tests/PrintPlanWithStatsTest.cpp b/velox/exec/tests/PrintPlanWithStatsTest.cpp index d986aeb743b4..01d1a15f9a35 100644 --- a/velox/exec/tests/PrintPlanWithStatsTest.cpp +++ b/velox/exec/tests/PrintPlanWithStatsTest.cpp @@ -200,6 +200,8 @@ TEST_F(PrintPlanWithStatsTest, innerJoinWithTableScan) { {" prefetchBytes [ ]* sum: .+, count: 1, min: .+, max: .+"}, {" preloadedSplits[ ]+sum: .+, count: .+, min: .+, max: .+", true}, + {" processedSplits [ ]* sum: 1, count: 1, min: 1, max: 1"}, + {" processedStrides [ ]* sum: 40, count: 1, min: 40, max: 40"}, {" ramReadBytes [ ]* sum: .+, count: 1, min: .+, max: .+"}, {" readyPreloadedSplits[ ]+sum: .+, count: .+, min: .+, max: .+", true}, @@ -293,6 +295,8 @@ TEST_F(PrintPlanWithStatsTest, partialAggregateWithTableScan) { {" overreadBytes[ ]* sum: 0B, count: 1, min: 0B, max: 0B"}, {" prefetchBytes [ ]* sum: .+, count: 1, min: .+, max: .+"}, + {" processedSplits [ ]* sum: 1, count: 1, min: 1, max: 1"}, + {" processedStrides [ ]* sum: 3, count: 1, min: 3, max: 3"}, {" preloadedSplits[ ]+sum: .+, count: .+, min: .+, max: .+", true}, {" ramReadBytes [ ]* sum: .+, count: 1, min: .+, max: .+"},