Avoid recreating empty output vector repeatedly in HiveDataSource (#6942)

Summary: Pull Request resolved: #6942 In some low-selectivity queries with huge struct columns, the empty output vector gets destroyed and recreated repeatedly, making the query more than 4 times slower. Fix this by caching the empty output vector. Reviewed By: oerling Differential Revision: D50017249 fbshipit-source-id: 5c387ad1ee48ed7268b2c15570040cf7854c7aa9
facebookincubator · Oct 6, 2023 · 4364ac5 · 4364ac5
1 parent e2a61da
commit 4364ac5
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 2 deletions.
diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp
@@ -776,7 +776,7 @@ std::optional<RowVectorPtr> HiveDataSource::next(
     auto rowsRemaining = output_->size();
     if (rowsRemaining == 0) {
       // no rows passed the pushed down filters.
-      return RowVector::createEmpty(outputType_, pool_);
+      return getEmptyOutput();
     }
 
     auto rowVector = std::dynamic_pointer_cast<RowVector>(output_);
@@ -791,7 +791,7 @@ std::optional<RowVectorPtr> HiveDataSource::next(
       VELOX_CHECK_LE(rowsRemaining, rowsScanned);
       if (rowsRemaining == 0) {
         // No rows passed the remaining filter.
-        return RowVector::createEmpty(outputType_, pool_);
+        return getEmptyOutput();
       }
 
       if (rowsRemaining < rowVector->size()) {

diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h
@@ -148,6 +148,13 @@ class HiveDataSource : public DataSource {
   void parseSerdeParameters(
       const std::unordered_map<std::string, std::string>& serdeParameters);
 
+  const RowVectorPtr& getEmptyOutput() { // Returns the cached empty output vector, building it on first use.
+    if (!emptyOutput_) { // Nothing cached yet: create it once from outputType_ and pool_.
+      emptyOutput_ = RowVector::createEmpty(outputType_, pool_);
+    }
+    return emptyOutput_; // Every call hands back a reference to the same member.
+  }
+
   const RowTypePtr outputType_;
   // Column handles for the partition key columns keyed on partition key column
   // name.
@@ -160,6 +167,7 @@ class HiveDataSource : public DataSource {
   std::unique_ptr<dwio::common::Reader> reader_;
   std::unique_ptr<exec::ExprSet> remainingFilterExprSet_;
   bool emptySplit_;
+  RowVectorPtr emptyOutput_;
 
   dwio::common::RuntimeStatistics runtimeStats_;