From 4364ac5eff70ec82cfe909c5b5769fab426648bf Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Fri, 6 Oct 2023 14:46:54 -0700 Subject: [PATCH] Avoid recreating empty output vector repeatedly in HiveDataSource (#6942) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/6942 In some low-selectivity queries with huge struct columns, we see the empty output get destroyed and recreated repeatedly, making the query more than 4 times slower. Fix this by caching the empty output vector. Reviewed By: oerling Differential Revision: D50017249 fbshipit-source-id: 5c387ad1ee48ed7268b2c15570040cf7854c7aa9 --- velox/connectors/hive/HiveDataSource.cpp | 4 ++-- velox/connectors/hive/HiveDataSource.h | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp index 22a53aecde2b..38be2795d80b 100644 --- a/velox/connectors/hive/HiveDataSource.cpp +++ b/velox/connectors/hive/HiveDataSource.cpp @@ -776,7 +776,7 @@ std::optional HiveDataSource::next( auto rowsRemaining = output_->size(); if (rowsRemaining == 0) { // no rows passed the pushed down filters. - return RowVector::createEmpty(outputType_, pool_); + return getEmptyOutput(); } auto rowVector = std::dynamic_pointer_cast(output_); @@ -791,7 +791,7 @@ std::optional HiveDataSource::next( VELOX_CHECK_LE(rowsRemaining, rowsScanned); if (rowsRemaining == 0) { // No rows passed the remaining filter. 
- return RowVector::createEmpty(outputType_, pool_); + return getEmptyOutput(); } if (rowsRemaining < rowVector->size()) { diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h index e687c3ecd9d5..4da7059b27e4 100644 --- a/velox/connectors/hive/HiveDataSource.h +++ b/velox/connectors/hive/HiveDataSource.h @@ -148,6 +148,13 @@ class HiveDataSource : public DataSource { void parseSerdeParameters( const std::unordered_map& serdeParameters); + const RowVectorPtr& getEmptyOutput() { + if (!emptyOutput_) { + emptyOutput_ = RowVector::createEmpty(outputType_, pool_); + } + return emptyOutput_; + } + const RowTypePtr outputType_; // Column handles for the partition key columns keyed on partition key column // name. @@ -160,6 +167,7 @@ class HiveDataSource : public DataSource { std::unique_ptr reader_; std::unique_ptr remainingFilterExprSet_; bool emptySplit_; + RowVectorPtr emptyOutput_; dwio::common::RuntimeStatistics runtimeStats_;