From 41a0f72d9fd3ed359caa514dcf28d1ed76d75673 Mon Sep 17 00:00:00 2001 From: rui-mo Date: Wed, 16 Aug 2023 13:12:49 +0800 Subject: [PATCH] Add processedStrides and processedSplits metrics --- velox/connectors/hive/SplitReader.cpp | 1 + velox/dwio/common/Statistics.h | 8 ++++++++ velox/dwio/dwrf/reader/DwrfReader.cpp | 1 + velox/dwio/dwrf/reader/DwrfReader.h | 3 +++ velox/dwio/parquet/reader/ParquetReader.cpp | 1 + velox/exec/tests/PrintPlanWithStatsTest.cpp | 4 ++++ 6 files changed, 18 insertions(+) diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 072a5f68d54a..d4d78cb7c7b4 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -190,6 +190,7 @@ void SplitReader::prepareSplit( runtimeStats.skippedSplitBytes += hiveSplit_->length; return; } + ++runtimeStats.processedSplits; auto& fileType = baseReader_->rowType(); auto columnTypes = adaptColumns(fileType, baseReaderOpts_.getFileSchema()); diff --git a/velox/dwio/common/Statistics.h b/velox/dwio/common/Statistics.h index db5a91b06413..22cc840da522 100644 --- a/velox/dwio/common/Statistics.h +++ b/velox/dwio/common/Statistics.h @@ -529,20 +529,28 @@ struct RuntimeStatistics { // Number of splits skipped based on statistics. int64_t skippedSplits{0}; + // Number of splits processed based on statistics. + int64_t processedSplits{0}; + // Total bytes in splits skipped based on statistics. int64_t skippedSplitBytes{0}; // Number of strides (row groups) skipped based on statistics. int64_t skippedStrides{0}; + // Number of strides (row groups) processed based on statistics. + int64_t processedStrides{0}; + ColumnReaderStatistics columnReaderStatistics; std::unordered_map toMap() { return { {"skippedSplits", RuntimeCounter(skippedSplits)}, + {"processedSplits", RuntimeCounter(processedSplits)}, {"skippedSplitBytes", RuntimeCounter(skippedSplitBytes, RuntimeCounter::Unit::kBytes)}, {"skippedStrides", RuntimeCounter(skippedStrides)}, + {"processedStrides", RuntimeCounter(processedStrides)}, {"flattenStringDictionaryValues", RuntimeCounter(columnReaderStatistics.flattenStringDictionaryValues)}}; } diff --git a/velox/dwio/dwrf/reader/DwrfReader.cpp b/velox/dwio/dwrf/reader/DwrfReader.cpp index e3b12897c539..d553982ad12f 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.cpp +++ b/velox/dwio/dwrf/reader/DwrfReader.cpp @@ -266,6 +266,7 @@ void DwrfRowReader::checkSkipStrides(uint64_t strideSize) { currentStride++; skippedStrides_++; } + processedStrides_++; if (foundStridesToSkip && currentRowInStripe_ < rowsInCurrentStripe_) { selectiveColumnReader_->seekToRowGroup(currentStride); } diff --git a/velox/dwio/dwrf/reader/DwrfReader.h b/velox/dwio/dwrf/reader/DwrfReader.h index d681ebe6612e..6081ebea2110 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.h +++ b/velox/dwio/dwrf/reader/DwrfReader.h @@ -93,6 +93,7 @@ class DwrfRowReader : public StrideIndexProvider, void updateRuntimeStats( dwio::common::RuntimeStatistics& stats) const override { stats.skippedStrides += skippedStrides_; + stats.processedStrides += processedStrides_; stats.columnReaderStatistics.flattenStringDictionaryValues += columnReaderStatistics_.flattenStringDictionaryValues; } @@ -190,6 +191,8 @@ class DwrfRowReader : public StrideIndexProvider, std::unordered_map> stripeStridesToSkip_; // Number of skipped strides. int64_t skippedStrides_{0}; + // Number of processed strides. + int64_t processedStrides_{0}; // Set to true after clearing filter caches, i.e. adding a dynamic // filter. Causes filters to be re-evaluated against stride stats on diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 2c73cfdb7ed6..929d456ac80f 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -757,6 +757,7 @@ class ParquetRowReader::Impl { void updateRuntimeStats(dwio::common::RuntimeStatistics& stats) const { stats.skippedStrides += rowGroups_.size() - rowGroupIds_.size(); + stats.processedStrides += rowGroupIds_.size(); } void resetFilterCaches() { diff --git a/velox/exec/tests/PrintPlanWithStatsTest.cpp b/velox/exec/tests/PrintPlanWithStatsTest.cpp index ae70387e4f1d..939dd01695a2 100644 --- a/velox/exec/tests/PrintPlanWithStatsTest.cpp +++ b/velox/exec/tests/PrintPlanWithStatsTest.cpp @@ -199,6 +199,8 @@ TEST_F(PrintPlanWithStatsTest, innerJoinWithTableScan) { {" prefetchBytes [ ]* sum: .+, count: 1, min: .+, max: .+"}, {" preloadedSplits[ ]+sum: .+, count: .+, min: .+, max: .+", true}, + {" processedSplits [ ]* sum: 1, count: 1, min: 1, max: 1"}, + {" processedStrides [ ]* sum: 40, count: 1, min: 40, max: 40"}, {" queryThreadIoLatency[ ]* sum: .+, count: .+ min: .+, max: .+"}, {" ramReadBytes [ ]* sum: .+, count: 1, min: .+, max: .+"}, {" readyPreloadedSplits[ ]+sum: .+, count: .+, min: .+, max: .+", @@ -292,6 +294,8 @@ TEST_F(PrintPlanWithStatsTest, partialAggregateWithTableScan) { {" overreadBytes[ ]* sum: 0B, count: 1, min: 0B, max: 0B"}, {" prefetchBytes [ ]* sum: .+, count: 1, min: .+, max: .+"}, + {" processedSplits [ ]* sum: 1, count: 1, min: 1, max: 1"}, + {" processedStrides [ ]* sum: 3, count: 1, min: 3, max: 3"}, {" preloadedSplits[ ]+sum: .+, count: .+, min: .+, max: .+", true}, {" queryThreadIoLatency[ ]* sum: .+, count: .+ min: .+, max: .+"},