Add query configs to turn off expression evaluation optimizations (fa…

…cebookincubator#10902) Summary: Pull Request resolved: facebookincubator#10902 This change adds query configs to individually turn off expression evaluation optimizations like dictionary peeling, dictionary memoization, reusing shared subexpression results and deferring lazy vector loading. The goal is to streamline debugging in production and enable prompt mitigation of bugs or regressions caused by the optimization or surfaced due to it. Note: When peeling is turned off, we still ensure that single-argument functions receive a flat input. Differential Revision: D61943875
bikramSingh91 · Sep 3, 2024 · 1fb7610 · 1fb7610
1 parent fd06bd9
commit 1fb7610
Show file tree

Hide file tree

Showing 8 changed files with 356 additions and 38 deletions.
diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h
@@ -357,6 +357,43 @@ class QueryConfig {
   /// Empty string if only want to trace the query metadata.
   static constexpr const char* kQueryTraceNodeIds = "query_trace_node_ids";
 
+  /// Disable optimization in expression evaluation to peel common dictionary
+  /// layer from inputs.
+  static constexpr const char* kDebugDisableExpressionWithPeeling =
+      "debug_disable_expression_with_peeling";
+
+  /// Disable optimization in expression evaluation to re-use cached results for
+  /// common sub-expressions.
+  static constexpr const char* kDebugDisableCommonSubExpressions =
+      "debug_disable_common_sub_expressions";
+
+  /// Disable optimization in expression evaluation to re-use cached results
+  /// between subsequent input batches that are dictionary encoded and have the
+  /// same alphabet(underlying flat vector).
+  static constexpr const char* kDebugDisableExpressionWithMemoization =
+      "debug_disable_expression_with_memoization";
+
+  /// Disable optimization in expression evaluation to delay loading of lazy
+  /// inputs unless required.
+  static constexpr const char* kDebugDisableExpressionWithLazyInputs =
+      "debug_disable_expression_with_lazy_inputs";
+
+  bool debugDisableExpressionsWithPeeling() const {
+    return get<bool>(kDebugDisableExpressionWithPeeling, false);
+  }
+
+  bool debugDisableCommonSubExpressions() const {
+    return get<bool>(kDebugDisableCommonSubExpressions, false);
+  }
+
+  bool debugDisableExpressionsWithMemoization() const {
+    return get<bool>(kDebugDisableExpressionWithMemoization, false);
+  }
+
+  bool debugDisableExpressionsWithLazyInputs() const {
+    return get<bool>(kDebugDisableExpressionWithLazyInputs, false);
+  }
+
   uint64_t queryMaxMemoryPerNode() const {
     return config::toCapacity(
         get<std::string>(kQueryMaxMemoryPerNode, "0B"),

diff --git a/velox/core/tests/QueryConfigTest.cpp b/velox/core/tests/QueryConfigTest.cpp
@@ -123,7 +123,9 @@ TEST_F(QueryConfigTest, enableExpressionEvaluationCacheConfig) {
         execCtx->vectorPool() != nullptr, enableExpressionEvaluationCache);
 
     auto evalCtx = std::make_shared<exec::EvalCtx>(execCtx.get());
-    ASSERT_EQ(evalCtx->cacheEnabled(), enableExpressionEvaluationCache);
+    ASSERT_EQ(
+        evalCtx->dictionaryMemoizationEnabled(),
+        enableExpressionEvaluationCache);
 
     // Test ExecCtx::selectivityVectorPool_.
     auto rows = execCtx->getSelectivityVector(100);
@@ -144,4 +146,58 @@ TEST_F(QueryConfigTest, enableExpressionEvaluationCacheConfig) {
   testConfig(false);
 }
 
+TEST_F(QueryConfigTest, expressionEvaluationRelatedConfigs) {
+  // Verify that the expression evaluation related configs are propagated
+  // correctly to EvalCtx which is used during expression evaluation. Each
+  // config is individually set and verified.
+  std::shared_ptr<memory::MemoryPool> rootPool{
+      memory::memoryManager()->addRootPool()};
+  std::shared_ptr<memory::MemoryPool> pool{rootPool->addLeafChild("leaf")};
+
+  auto testConfig =
+      [&](std::unordered_map<std::string, std::string> configData) {
+        auto queryCtx =
+            core::QueryCtx::create(nullptr, QueryConfig{std::move(configData)});
+        const auto& queryConfig = queryCtx->queryConfig();
+        auto execCtx =
+            std::make_shared<core::ExecCtx>(pool.get(), queryCtx.get());
+        auto evalCtx = std::make_shared<exec::EvalCtx>(execCtx.get());
+
+        ASSERT_EQ(
+            evalCtx->peelingEnabled(),
+            !queryConfig.debugDisableExpressionsWithPeeling());
+        ASSERT_EQ(
+            evalCtx->sharedSubExpressionReuseEnabled(),
+            !queryConfig.debugDisableCommonSubExpressions());
+        ASSERT_EQ(
+            evalCtx->dictionaryMemoizationEnabled(),
+            !queryConfig.debugDisableExpressionsWithMemoization());
+        ASSERT_EQ(
+            evalCtx->deferredLazyLoadingEnabled(),
+            !queryConfig.debugDisableExpressionsWithLazyInputs());
+      };
+
+  auto createConfig = [&](bool debugDisableExpressionsWithPeeling,
+                          bool debugDisableCommonSubExpressions,
+                          bool debugDisableExpressionsWithMemoization,
+                          bool debugDisableExpressionsWithLazyInputs) -> auto {
+    std::unordered_map<std::string, std::string> configData(
+        {{core::QueryConfig::kDebugDisableExpressionWithPeeling,
+          std::to_string(debugDisableExpressionsWithPeeling)},
+         {core::QueryConfig::kDebugDisableCommonSubExpressions,
+          std::to_string(debugDisableCommonSubExpressions)},
+         {core::QueryConfig::kDebugDisableExpressionWithMemoization,
+          std::to_string(debugDisableExpressionsWithMemoization)},
+         {core::QueryConfig::kDebugDisableExpressionWithLazyInputs,
+          std::to_string(debugDisableExpressionsWithLazyInputs)}});
+    return configData;
+  };
+
+  testConfig({}); // Verify default config.
+  testConfig(createConfig(true, false, false, false));
+  testConfig(createConfig(false, true, false, false));
+  testConfig(createConfig(false, false, true, false));
+  testConfig(createConfig(false, false, false, true));
+}
+
 } // namespace facebook::velox::core::test
diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst
@@ -168,6 +168,22 @@ Expression Evaluation Configuration
      - bool
      - false
      - This flag makes the Row conversion to by applied in a way that the casting row field are matched by name instead of position.
+   * - debug_disable_expression_with_peeling
+     - bool
+     - false
+     - Disable optimization in expression evaluation to peel common dictionary layer from inputs. Should only be used for debugging.
+   * - debug_disable_common_sub_expressions
+     - bool
+     - false
+     - Disable optimization in expression evaluation to re-use cached results for common sub-expressions. Should only be used for debugging.
+   * - debug_disable_expression_with_memoization
+     - bool
+     - false
+     - Disable optimization in expression evaluation to re-use cached results between subsequent input batches that are dictionary encoded and have the same alphabet(underlying flat vector). Should only be used for debugging.
+   * - debug_disable_expression_with_lazy_inputs
+     - bool
+     - false
+     - Disable optimization in expression evaluation to delay loading of lazy inputs unless required. Should only be used for debugging.
 
 Memory Management
 -----------------

diff --git a/velox/expression/EvalCtx.cpp b/velox/expression/EvalCtx.cpp
@@ -29,13 +29,7 @@ EvalCtx::EvalCtx(core::ExecCtx* execCtx, ExprSet* exprSet, const RowVector* row)
     : execCtx_(execCtx),
       exprSet_(exprSet),
       row_(row),
-      cacheEnabled_(execCtx->exprEvalCacheEnabled()),
-      maxSharedSubexprResultsCached_(
-          execCtx->queryCtx()
-              ? execCtx->queryCtx()
-                    ->queryConfig()
-                    .maxSharedSubexprResultsCached()
-              : core::QueryConfig({}).maxSharedSubexprResultsCached()) {
+      optimizationParams_(execCtx_) {
   // TODO Change the API to replace raw pointers with non-const references.
   // Sanity check inputs to prevent crashes.
   VELOX_CHECK_NOT_NULL(execCtx);
@@ -56,13 +50,7 @@ EvalCtx::EvalCtx(core::ExecCtx* execCtx)
     : execCtx_(execCtx),
       exprSet_(nullptr),
       row_(nullptr),
-      cacheEnabled_(execCtx->exprEvalCacheEnabled()),
-      maxSharedSubexprResultsCached_(
-          execCtx->queryCtx()
-              ? execCtx->queryCtx()
-                    ->queryConfig()
-                    .maxSharedSubexprResultsCached()
-              : core::QueryConfig({}).maxSharedSubexprResultsCached()) {
+      optimizationParams_(execCtx_) {
   VELOX_CHECK_NOT_NULL(execCtx);
 }
 

diff --git a/velox/expression/EvalCtx.h b/velox/expression/EvalCtx.h
@@ -519,16 +519,33 @@ class EvalCtx {
     return peeledEncoding_.get();
   }
 
-  /// Returns true if caching in expression evaluation is enabled, such as
-  /// Expr::evalWithMemo.
-  bool cacheEnabled() const {
-    return cacheEnabled_;
+  /// Returns true if dictionary memoization optimization is enabled, which
+  /// allows the reuse of results between consecutive input batches if they are
+  /// dictionary encoded and have the same alphabet(underlying flat vector).
+  bool dictionaryMemoizationEnabled() const {
+    return optimizationParams_.dictionaryMemoizationEnabled_;
   }
 
-  /// Returns the maximum number of distinct inputs to cache results for in a
+  /// Returns the maximum number of distinct inputs to cache results in a
   /// given shared subexpression.
   uint32_t maxSharedSubexprResultsCached() const {
-    return maxSharedSubexprResultsCached_;
+    return optimizationParams_.maxSharedSubexprResultsCached_;
+  }
+
+  /// Returns true if peeling is enabled.
+  bool peelingEnabled() const {
+    return optimizationParams_.peelingEnabled_;
+  }
+
+  /// Returns true if shared subexpression reuse is enabled.
+  bool sharedSubExpressionReuseEnabled() const {
+    return optimizationParams_.sharedSubExpressionReuseEnabled_;
+  }
+
+  /// Returns true if loading of lazy inputs is deferred until they need to be
+  /// accessed.
+  bool deferredLazyLoadingEnabled() const {
+    return optimizationParams_.deferredLazyLoadingEnabled_;
   }
 
  private:
@@ -550,8 +567,33 @@ class EvalCtx {
   core::ExecCtx* const execCtx_;
   ExprSet* const exprSet_;
   const RowVector* row_;
-  const bool cacheEnabled_;
-  const uint32_t maxSharedSubexprResultsCached_;
+
+  struct OptimizationParams {
+    explicit OptimizationParams(core::ExecCtx* execCtx) {
+      const core::QueryConfig defaultQueryConfig = core::QueryConfig({});
+
+      const core::QueryConfig& queryConfig = (execCtx && execCtx->queryCtx())
+          ? execCtx->queryCtx()->queryConfig()
+          : defaultQueryConfig;
+
+      dictionaryMemoizationEnabled_ =
+          !queryConfig.debugDisableExpressionsWithMemoization() &&
+          execCtx->exprEvalCacheEnabled();
+      peelingEnabled_ = !queryConfig.debugDisableExpressionsWithPeeling();
+      sharedSubExpressionReuseEnabled_ =
+          !queryConfig.debugDisableCommonSubExpressions();
+      deferredLazyLoadingEnabled_ =
+          !queryConfig.debugDisableExpressionsWithLazyInputs();
+      maxSharedSubexprResultsCached_ =
+          queryConfig.maxSharedSubexprResultsCached();
+    }
+    bool dictionaryMemoizationEnabled_;
+    bool peelingEnabled_;
+    bool sharedSubExpressionReuseEnabled_;
+    bool deferredLazyLoadingEnabled_;
+    uint32_t maxSharedSubexprResultsCached_;
+  };
+  const OptimizationParams optimizationParams_;
   bool inputFlatNoNulls_;
 
   // Corresponds 1:1 to children of 'row_'. Set to an inner vector

diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp
@@ -708,7 +708,7 @@ void Expr::evalFlatNoNulls(
     EvalCtx& context,
     VectorPtr& result,
     const ExprSet* parentExprSet) {
-  if (shouldEvaluateSharedSubexp()) {
+  if (shouldEvaluateSharedSubexp(context)) {
     evaluateSharedSubexpr(
         rows,
         context,
@@ -819,7 +819,8 @@ void Expr::eval(
   //
   // TODO: Re-work the logic of deciding when to load which field.
   if (!hasConditionals_ || distinctFields_.size() == 1 ||
-      shouldEvaluateSharedSubexp()) {
+      shouldEvaluateSharedSubexp(context) ||
+      !context.deferredLazyLoadingEnabled()) {
     // Load lazy vectors if any.
     for (auto* field : distinctFields_) {
       context.ensureFieldLoaded(field->index(context), rows);
@@ -874,10 +875,8 @@ void Expr::evaluateSharedSubexpr(
   }
 
   if (sharedSubexprResultsIter == sharedSubexprResults_.end()) {
-    auto maxSharedSubexprResultsCached = context.execCtx()
-                                             ->queryCtx()
-                                             ->queryConfig()
-                                             .maxSharedSubexprResultsCached();
+    auto maxSharedSubexprResultsCached =
+        context.maxSharedSubexprResultsCached();
     if (sharedSubexprResults_.size() < maxSharedSubexprResultsCached) {
       // If we have room left in the cache, add it.
       sharedSubexprResultsIter =
@@ -1039,7 +1038,7 @@ Expr::PeelEncodingsResult Expr::peelEncodings(
 
   // If the expression depends on one dictionary, results are cacheable.
   bool mayCache = false;
-  if (context.cacheEnabled()) {
+  if (context.dictionaryMemoizationEnabled()) {
     mayCache = distinctFields_.size() == 1 &&
         VectorEncoding::isDictionary(context.wrapEncoding()) &&
         !peeledVectors[0]->memoDisabled();
@@ -1054,7 +1053,8 @@ void Expr::evalEncodings(
     const SelectivityVector& rows,
     EvalCtx& context,
     VectorPtr& result) {
-  if (deterministic_ && !skipFieldDependentOptimizations()) {
+  if (deterministic_ && !skipFieldDependentOptimizations() &&
+      context.peelingEnabled()) {
     bool hasFlat = false;
     for (auto* field : distinctFields_) {
       if (isFlat(*context.getField(field->index(context)))) {
@@ -1381,7 +1381,7 @@ void Expr::evalAll(
     return;
   }
 
-  if (shouldEvaluateSharedSubexp()) {
+  if (shouldEvaluateSharedSubexp(context)) {
     evaluateSharedSubexpr(
         rows,
         context,
@@ -1462,6 +1462,16 @@ bool Expr::applyFunctionWithPeeling(
     VectorPtr& result) {
   LocalDecodedVector localDecoded(context);
   LocalSelectivityVector newRowsHolder(context);
+  if (!context.peelingEnabled()) {
+    if (inputValues_.size() == 1) {
+      // If we have a single input, velox needs to ensure that the
+      // vectorFunction would receive a flat input.
+      BaseVector::flattenVector(inputValues_[0]);
+      applyFunction(applyRows, context, result);
+      return true;
+    }
+    return false;
+  }
   // Attempt peeling.
   std::vector<VectorPtr> peeledVectors;
   auto peeledEncoding = PeeledEncoding::peel(

diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h
@@ -486,8 +486,9 @@ class Expr {
   /// Evaluation of such expression is optimized by memoizing and reusing
   /// the results of prior evaluations. That logic is implemented in
   /// 'evaluateSharedSubexpr'.
-  bool shouldEvaluateSharedSubexp() const {
-    return deterministic_ && isMultiplyReferenced_ && !inputs_.empty();
+  bool shouldEvaluateSharedSubexp(EvalCtx& context) const {
+    return deterministic_ && isMultiplyReferenced_ && !inputs_.empty() &&
+        context.sharedSubExpressionReuseEnabled();
   }
 
   /// Evaluate common sub-expression. Check if sharedSubexprValues_ already has