This repository has been archived by the owner on Jun 14, 2024. It is now read-only.

Introduce whyNot API #449

Merged: 6 commits, merged Aug 3, 2021. (Diff below shows changes from 5 commits.)
src/main/scala/com/microsoft/hyperspace/Hyperspace.scala (23 changes: 21 additions & 2 deletions)
@@ -20,7 +20,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}

import com.microsoft.hyperspace.index._
import com.microsoft.hyperspace.index.IndexConstants.{OPTIMIZE_MODE_QUICK, REFRESH_MODE_FULL}
- import com.microsoft.hyperspace.index.plananalysis.PlanAnalyzer
+ import com.microsoft.hyperspace.index.plananalysis.{CandidateIndexAnalyzer, PlanAnalyzer}
import com.microsoft.hyperspace.index.rules.ApplyHyperspace
import com.microsoft.hyperspace.index.sources.FileBasedSourceProviderManager

@@ -150,7 +150,7 @@ class Hyperspace(spark: SparkSession) {
}

/**
- * Explains how indexes will be applied to the given dataframe.
+ * Explain how indexes will be applied to the given dataframe.
*
* @param df dataFrame.
* @param redirectFunc optional function to redirect output of explain.
@@ -171,6 +171,25 @@ class Hyperspace(spark: SparkSession) {
indexManager.index(indexName)
}

+ /**
+ * Explain why indexes are not applied to the given dataframe.
+ *
+ * @param df Dataframe
+ * @param indexName Optional index name to filter the output
+ * @param extended If true, print more verbose messages.
+ * @param redirectFunc Optional function to redirect output
+ */
+ def whyNot(df: DataFrame, indexName: String = "", extended: Boolean = false)(
[Review thread on this line]
Reviewer: nit: indexName: Option[String] would make a better interface.
Author (collaborator): But this way, users need to add "Some(...)", which does not look intuitive:
    hyperspace.whyNot(query(leftDf, rightDf)(), Some("leftDfJoinIndex"), extended = true)
Reviewer: Agreed. By the way, how about supporting multiple index names? indexNames: Seq[String] = Nil
Author (collaborator): We could add additional APIs later on demand, like returning a DataFrame instead of printing.
[End review thread]
+ implicit redirectFunc: String => Unit = print): Unit = {
+ withHyperspaceRuleDisabled {
+ if (indexName.nonEmpty) {
+ redirectFunc(CandidateIndexAnalyzer.whyNotIndexString(spark, df, indexName, extended))
+ } else {
+ redirectFunc(CandidateIndexAnalyzer.whyNotIndexesString(spark, df, extended))
+ }
+ }
+ }

private def withHyperspaceRuleDisabled(f: => Unit): Unit = {
try {
ApplyHyperspace.disableForIndexMaintenance.set(true)
...
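For reference, a minimal usage sketch of the whyNot API added above. The Spark session setup, data path, and index name here are hypothetical; the output-capturing call relies on explicitly applying the implicit redirectFunc parameter list shown in the diff:

import org.apache.spark.sql.SparkSession
import com.microsoft.hyperspace.Hyperspace

object WhyNotDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("whyNotDemo").master("local[*]").getOrCreate()
    val hs = new Hyperspace(spark)
    // Hypothetical data path and query:
    val df = spark.read.parquet("/data/events").filter("day = 7").select("userId")

    hs.whyNot(df)                             // report on all candidate indexes
    hs.whyNot(df, "myIndex", extended = true) // verbose report for one (hypothetical) index

    // The second parameter list is an implicit redirect function, so the
    // report can be captured instead of printed:
    val buf = new StringBuilder
    hs.whyNot(df)(s => buf.append(s))
  }
}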
src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala (13 changes: 13 additions & 0 deletions)
@@ -527,10 +527,23 @@ case class IndexLogEntry(
tags.get((plan, tag)).map(_.asInstanceOf[T])
}

+ def getTagValuesForAllPlan[T](tag: IndexLogEntryTag[T]): Seq[(LogicalPlan, T)] = {
+ tags.filter(entry => entry._1._2.equals(tag)).toSeq.map { case (k, v) =>
+ (k._1, v.asInstanceOf[T])
+ }
+ }

def unsetTagValue[T](plan: LogicalPlan, tag: IndexLogEntryTag[T]): Unit = {
tags.remove((plan, tag))
}

+ def unsetTagValueForAllPlan[T](tag: IndexLogEntryTag[T]): Unit = {
+ val plansWithTag = tags.keys.filter(_._2.name.equals(tag.name)).map(_._1)
+ plansWithTag.foreach { plan =>
+ tags.remove((plan, tag))
+ }
+ }

def withCachedTag[T](plan: LogicalPlan, tag: IndexLogEntryTag[T])(f: => T): T = {
getTagValue(plan, tag) match {
case Some(v) => v
...
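The two helpers added above operate on IndexLogEntry's internal tag map, which is keyed by (plan, tag) pairs. As a standalone illustration of those mechanics, here is a minimal sketch with simplified stand-in types (Plan, Tag, and TagMap are hypothetical, not Hyperspace classes):

import scala.collection.mutable

case class Plan(name: String)   // stand-in for LogicalPlan
case class Tag[T](name: String) // stand-in for IndexLogEntryTag[T]

class TagMap {
  private val tags = mutable.Map[(Plan, Tag[_]), Any]()

  def setTagValue[T](plan: Plan, tag: Tag[T], value: T): Unit =
    tags((plan, tag)) = value

  // Mirrors getTagValuesForAllPlan: collect one tag's values across all plans.
  def getTagValuesForAllPlan[T](tag: Tag[T]): Seq[(Plan, T)] =
    tags.collect { case ((p, t), v) if t == tag => (p, v.asInstanceOf[T]) }.toSeq

  // Mirrors unsetTagValueForAllPlan: remove one tag from every plan.
  def unsetTagValueForAllPlan[T](tag: Tag[T]): Unit =
    tags.keys.filter(_._2 == tag).toSeq.foreach(tags.remove)
}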
@@ -18,6 +18,8 @@ package com.microsoft.hyperspace.index

import org.apache.spark.sql.execution.datasources.InMemoryFileIndex

+ import com.microsoft.hyperspace.index.plananalysis.FilterReason

object IndexLogEntryTags {
// HYBRIDSCAN_REQUIRED indicates if Hybrid Scan is required for the index or not.
val HYBRIDSCAN_REQUIRED: IndexLogEntryTag[Boolean] =
@@ -55,10 +57,15 @@
IndexLogEntryTag[InMemoryFileIndex]("inMemoryFileIndexHybridScanAppended")

// FILTER_REASONS stores reason strings for disqualification.
- val FILTER_REASONS: IndexLogEntryTag[Seq[String]] =
- IndexLogEntryTag[Seq[String]]("filterReasons")
+ val FILTER_REASONS: IndexLogEntryTag[Seq[FilterReason]] =
+ IndexLogEntryTag[Seq[FilterReason]]("filterReasons")

+ // APPLICABLE_INDEX_RULES stores the names of rules that can apply the index to the plan.
+ val APPLICABLE_INDEX_RULES: IndexLogEntryTag[Seq[String]] =
+ IndexLogEntryTag[Seq[String]]("applicableIndexRules")

- // FILTER_REASONS_ENABLED indicates whether the whyNot API is enabled or not.
- val FILTER_REASONS_ENABLED: IndexLogEntryTag[Boolean] =
- IndexLogEntryTag[Boolean]("filterReasonsEnabled")
+ // If enabled, FILTER_REASONS and APPLICABLE_INDEX_RULES info will be tagged.
+ val INDEX_PLAN_ANALYSIS_ENABLED: IndexLogEntryTag[Boolean] =
+ IndexLogEntryTag[Boolean]("indexPlanAnalysisEnabled")
}
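FILTER_REASONS now carries structured FilterReason values rather than raw strings. The FilterReason and FilterReasons definitions are outside this excerpt; the sketch below is only a plausible minimal shape inferred from the call sites in this diff, and the member names are assumptions, not the PR's actual definition:

// Hypothetical sketch, inferred from usage such as FilterReasons.NoAvailJoinIndexPair("left").
trait FilterReason {
  def codeStr: String    // assumed: short code shown in the default whyNot output
  def verboseStr: String // assumed: detailed message shown when extended = true
}

object FilterReasons {
  case class NoAvailJoinIndexPair(leftOrRight: String) extends FilterReason {
    val codeStr = "NO_AVAIL_JOIN_INDEX_PAIR"
    val verboseStr = s"No available indexes for the $leftOrRight subplan."
  }
}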
@@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, LeafNode, LogicalPlan}

import com.microsoft.hyperspace.{ActiveSparkSession, Hyperspace}
import com.microsoft.hyperspace.index.IndexLogEntryTags
+ import com.microsoft.hyperspace.index.plananalysis.FilterReasons
import com.microsoft.hyperspace.index.rules.{HyperspaceRule, IndexRankFilter, QueryPlanIndexFilter}
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.{PlanToIndexesMap, PlanToSelectedIndexMap}
import com.microsoft.hyperspace.index.sources.FileBasedRelation
@@ -96,17 +97,19 @@ object FilterColumnFilter extends QueryPlanIndexFilter {
withFilterReasonTag(
plan,
index,
"The first indexed column should be in filter condition columns.") {
FilterReasons.NoFirstIndexedColCond(
index.derivedDataset.indexedColumns.head,
filterColumnNames.mkString(","))) {
ResolverUtils
.resolve(spark, index.derivedDataset.indexedColumns.head, filterColumnNames)
.isDefined
} &&
withFilterReasonTag(
plan,
index,
"Index does not contain required columns. Required columns: " +
s"[${(filterColumnNames ++ projectColumnNames).mkString(",")}], Indexed & " +
s"included columns: [${(index.derivedDataset.referencedColumns).mkString(",")}]") {
FilterReasons.MissingRequiredCol(
(filterColumnNames ++ projectColumnNames).toSet.mkString(","),
index.derivedDataset.referencedColumns.mkString(","))) {
ResolverUtils
.resolve(
spark,
...
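withFilterReasonTag, used throughout these rules, evaluates an eligibility predicate and records the supplied reason when the predicate fails; its definition is outside this diff. A standalone sketch of that pattern under simplified, assumed types (a flat string buffer instead of plan-keyed tags):

import scala.collection.mutable

object ReasonTaggingSketch {
  private val reasons = mutable.Buffer[String]()

  // Returns the predicate's result; on failure, records why the index
  // was disqualified so a whyNot-style report can surface it later.
  def withFilterReasonTag(reason: => String)(condition: => Boolean): Boolean = {
    val passed = condition
    if (!passed) reasons += reason
    passed
  }

  def main(args: Array[String]): Unit = {
    val ok = withFilterReasonTag("first indexed column not in filter")(false)
    println(s"eligible = $ok, reasons = $reasons") // eligible = false, one reason recorded
  }
}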
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan}
import com.microsoft.hyperspace.Hyperspace
import com.microsoft.hyperspace.index.{IndexLogEntry, IndexLogEntryTags}
import com.microsoft.hyperspace.index.covering.JoinAttributeFilter.extractConditions
+ import com.microsoft.hyperspace.index.plananalysis.FilterReasons
import com.microsoft.hyperspace.index.rules.{HyperspaceRule, IndexRankFilter, QueryPlanIndexFilter}
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.{PlanToIndexesMap, PlanToSelectedIndexMap}
import com.microsoft.hyperspace.index.sources.FileBasedRelation
@@ -65,17 +66,22 @@ object JoinPlanNodeFilter extends QueryPlanIndexFilter {
val joinConditionCond = withFilterReasonTag(
plan,
leftAndRightIndexes,
"Join condition is not eligible. Equi-Joins in simple CNF form are supported. " +
"Literal is not supported.") {
FilterReasons.NotEligibleJoin("Non equi-join or has literal")) {
isJoinConditionSupported(condition)
}

val leftPlanLinearCond =
withFilterReasonTag(plan, leftAndRightIndexes, "Left child is not a linear plan.") {
withFilterReasonTag(
plan,
leftAndRightIndexes,
FilterReasons.NotEligibleJoin("Non linear left child plan")) {
isPlanLinear(l)
}
val rightPlanLinearCond =
withFilterReasonTag(plan, leftAndRightIndexes, "Right child is not a linear plan.") {
withFilterReasonTag(
plan,
leftAndRightIndexes,
FilterReasons.NotEligibleJoin("Non linear right child plan")) {
isPlanLinear(r)
}

@@ -96,7 +102,7 @@
setFilterReasonTag(
plan,
candidateIndexes.values.flatten.toSeq,
"Not eligible Join - no join condition.")
FilterReasons.NotEligibleJoin("No join condition"))
Map.empty
case _ =>
Map.empty
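The NotEligibleJoin("Non equi-join or has literal") reason above reflects the requirement that the join condition be a conjunction of column-to-column equalities. A toy illustration over a hypothetical expression ADT (this is not the actual isJoinConditionSupported implementation):

sealed trait Expr
case class Col(name: String) extends Expr
case class Lit(value: Any) extends Expr
case class EqualTo(left: Expr, right: Expr) extends Expr
case class And(left: Expr, right: Expr) extends Expr
case class Or(left: Expr, right: Expr) extends Expr

// Conjunctions of column = column are eligible; literals and OR are not.
def isEquiJoinInSimpleCNF(cond: Expr): Boolean = cond match {
  case EqualTo(_: Col, _: Col) => true
  case And(l, r) => isEquiJoinInSimpleCNF(l) && isEquiJoinInSimpleCNF(r)
  case _ => false
}

// isEquiJoinInSimpleCNF(And(EqualTo(Col("a"), Col("b")), EqualTo(Col("c"), Col("d")))) == true
// isEquiJoinInSimpleCNF(EqualTo(Col("a"), Lit(1)))                                     == false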
@@ -164,10 +170,7 @@ object JoinAttributeFilter extends QueryPlanIndexFilter {
if (withFilterReasonTag(
plan,
candidateIndexes.head._2 ++ candidateIndexes.last._2,
"Each join condition column should come from " +
"relations directly and attributes from left plan must exclusively have " +
"one-to-one mapping with attributes from right plan. " +
"E.g. join(A = B and A = D) is not eligible.") {
FilterReasons.NotEligibleJoin("incompatible left and right join columns")) {
ensureAttributeRequirements(
JoinIndexRule.leftRelation.get,
JoinIndexRule.rightRelation.get,
@@ -329,39 +332,35 @@ object JoinColumnFilter extends QueryPlanIndexFilter {
val lRequiredAllCols = resolve(spark, allRequiredCols(l), lBaseAttrs).get
val rRequiredAllCols = resolve(spark, allRequiredCols(r), rBaseAttrs).get

-    // Make sure required indexed columns are subset of all required columns.
-    assert(
-      resolve(spark, lRequiredIndexedCols, lRequiredAllCols).isDefined &&
-        resolve(spark, rRequiredIndexedCols, rRequiredAllCols).isDefined)
-
-    val lIndexes =
-      getUsableIndexes(
-        plan,
-        candidateIndexes.getOrElse(leftRelation.plan, Nil),
-        lRequiredIndexedCols,
-        lRequiredAllCols)
-    val rIndexes =
-      getUsableIndexes(
-        plan,
-        candidateIndexes.getOrElse(rightRelation.plan, Nil),
-        rRequiredIndexedCols,
-        rRequiredAllCols)
-
-    if (withFilterReasonTag(
-        plan,
-        candidateIndexes.head._2 ++ candidateIndexes.last._2,
-        "No available indexes for left subplan.")(lIndexes.nonEmpty) &&
-      withFilterReasonTag(
-        plan,
-        candidateIndexes.head._2 ++ candidateIndexes.last._2,
-        "No available indexes for right subplan.")(rIndexes.nonEmpty)) {
-      Map(leftRelation.plan -> lIndexes, rightRelation.plan -> rIndexes)
-    } else {
-      Map.empty
-    }
+    if (withFilterReasonTag(
+        plan,
+        candidateIndexes.head._2 ++ candidateIndexes.last._2,
+        "Invalid query plan.") {
+        // Make sure required indexed columns are subset of all required columns.
+        resolve(spark, lRequiredIndexedCols, lRequiredAllCols).isDefined &&
+        resolve(spark, rRequiredIndexedCols, rRequiredAllCols).isDefined
+      }) {
+      val lIndexes =
+        getUsableIndexes(
+          plan,
+          candidateIndexes.getOrElse(leftRelation.plan, Nil),
+          lRequiredIndexedCols,
+          lRequiredAllCols,
+          "left")
+      val rIndexes =
+        getUsableIndexes(
+          plan,
+          candidateIndexes.getOrElse(rightRelation.plan, Nil),
+          rRequiredIndexedCols,
+          rRequiredAllCols,
+          "right")
+
+      if (withFilterReasonTag(
+          plan,
+          candidateIndexes.head._2 ++ candidateIndexes.last._2,
+          FilterReasons.NoAvailJoinIndexPair("left"))(lIndexes.nonEmpty) &&
+        withFilterReasonTag(
+          plan,
+          candidateIndexes.head._2 ++ candidateIndexes.last._2,
+          FilterReasons.NoAvailJoinIndexPair("right"))(rIndexes.nonEmpty)) {
+        Map(leftRelation.plan -> lIndexes, rightRelation.plan -> rIndexes)
+      } else {
+        Map.empty
+      }
@@ -421,25 +420,28 @@
plan: LogicalPlan,
indexes: Seq[IndexLogEntry],
requiredIndexCols: Seq[String],
- allRequiredCols: Seq[String]): Seq[IndexLogEntry] = {
+ allRequiredCols: Seq[String],
+ leftOrRight: String): Seq[IndexLogEntry] = {
indexes.filter { idx =>
val allCols = idx.derivedDataset.referencedColumns
// All required index columns should match one-to-one with all indexed columns and
// vice-versa. All required columns must be present in the available index columns.
withFilterReasonTag(
plan,
idx,
s"All join condition column should be the indexed columns. " +
s"Join columns: [${requiredIndexCols
.mkString(",")}], Indexed columns: [${idx.indexedColumns.mkString(",")}]") {
FilterReasons.NotAllJoinColIndexed(
leftOrRight,
requiredIndexCols.mkString(","),
idx.indexedColumns.mkString(","))) {
requiredIndexCols.toSet.equals(idx.indexedColumns.toSet)
} &&
withFilterReasonTag(
plan,
idx,
s"Index does not contain all required columns. " +
s"Required columns: [${allRequiredCols.mkString(",")}], " +
s"Index columns: [${(idx.derivedDataset.referencedColumns).mkString(",")}]") {
FilterReasons.MissingIndexedCol(
leftOrRight,
allRequiredCols.mkString(","),
idx.indexedColumns.mkString(","))) {
allRequiredCols.forall(allCols.contains)
}
}
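To make the two checks above concrete: the join-condition columns must equal the indexed columns as a set, and every required column must appear among the index's referenced columns. A small worked example with hypothetical column names:

val requiredIndexCols = Seq("a", "b")           // join condition columns
val indexedColumns    = Seq("b", "a")           // index key columns
val allRequiredCols   = Seq("a", "b", "c")      // join + output columns
val referencedCols    = Seq("a", "b", "c", "d") // indexed + included columns

val exactKeyMatch = requiredIndexCols.toSet.equals(indexedColumns.toSet) // true: order does not matter
val coversAll     = allRequiredCols.forall(referencedCols.contains)      // true: index covers the query
val usable        = exactKeyMatch && coversAll                           // index survives both filters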
@@ -523,7 +525,7 @@ object JoinRankFilter extends IndexRankFilter {
setFilterReasonTag(
plan,
indexes.head._2 ++ indexes.last._2,
"No compatible left and right index pair.")
FilterReasons.NoCompatibleJoinIndexPair())
Map.empty
}
}
...