Add assert

microsoft · Jul 21, 2021 · deed490 · deed490
1 parent 421cbc0
commit deed490
Show file tree

Hide file tree

Showing 11 changed files with 377 additions and 14 deletions.
diff --git a/src/main/scala/com/microsoft/hyperspace/index/covering/FilterIndexRule.scala b/src/main/scala/com/microsoft/hyperspace/index/covering/FilterIndexRule.scala
@@ -99,7 +99,7 @@ object FilterColumnFilter extends QueryPlanIndexFilter {
           index,
           FilterReasons.NoFirstIndexedColCond(
             index.derivedDataset.indexedColumns.head,
-            filterColumnNames.mkString(", "))) {
+            filterColumnNames.mkString(","))) {
           ResolverUtils
             .resolve(spark, index.derivedDataset.indexedColumns.head, filterColumnNames)
             .isDefined

diff --git a/src/main/scala/com/microsoft/hyperspace/index/covering/JoinIndexRule.scala b/src/main/scala/com/microsoft/hyperspace/index/covering/JoinIndexRule.scala
@@ -431,17 +431,17 @@ object JoinColumnFilter extends QueryPlanIndexFilter {
         idx,
         FilterReasons.NotAllJoinColIndexed(
           leftOrRight,
-          requiredIndexCols.mkString(", "),
-          idx.indexedColumns.mkString(", "))) {
+          requiredIndexCols.mkString(","),
+          idx.indexedColumns.mkString(","))) {
         requiredIndexCols.toSet.equals(idx.indexedColumns.toSet)
       } &&
       withFilterReasonTag(
         plan,
         idx,
         FilterReasons.MissingIndexedCol(
           leftOrRight,
-          allRequiredCols.mkString(", "),
-          idx.indexedColumns.mkString(", "))) {
+          allRequiredCols.mkString(","),
+          idx.indexedColumns.mkString(","))) {
         allRequiredCols.forall(allCols.contains)
       }
     }

diff --git a/src/main/scala/com/microsoft/hyperspace/index/plananalysis/CandidateIndexAnalyzer.scala b/src/main/scala/com/microsoft/hyperspace/index/plananalysis/CandidateIndexAnalyzer.scala
@@ -119,7 +119,7 @@ object CandidateIndexAnalyzer extends Logging {
               }
           }
       }
-      .sortBy(r => (r._1, r._3))
+      .sortBy(r => (r._1, r._2, r._3, r._4))
       .distinct
 
     if (res.isEmpty) {
@@ -186,12 +186,14 @@ object CandidateIndexAnalyzer extends Logging {
         matchStr.group(1)
       }
       .toSeq
+      .distinct
+      .sorted
     printIndexNames(appliedIndexNames)
 
     stringBuilder.append("Applicable indexes, but not applied due to priority:")
     stringBuilder.append(newLine)
     val applicableButNotAppliedIndexNames =
-      applicableIndexes.map(_._1.name).distinct.filterNot(appliedIndexNames.contains(_))
+      applicableIndexes.map(_._1.name).distinct.sorted.filterNot(appliedIndexNames.contains(_))
     printIndexNames(applicableButNotAppliedIndexNames)
 
     // Convert reasons to Dataframe rows.
@@ -214,7 +216,7 @@ object CandidateIndexAnalyzer extends Logging {
               }
           }
       }
-      .sortBy(r => (r._1, r._3))
+      .sortBy(r => (r._1, r._2, r._3, r._4, r._5))
       .distinct
 
     import spark.implicits._
@@ -233,6 +235,7 @@ object CandidateIndexAnalyzer extends Logging {
       .filter(row => row._4.equals("SOURCE_DATA_CHANGE"))
       .map(_._2)
       .distinct
+      .sorted
       .filterNot(appliedIndexNames.contains(_))
       .filterNot(applicableButNotAppliedIndexNames.contains(_))
     printIndexNames(indexNamesForOutdated)
@@ -245,6 +248,7 @@ object CandidateIndexAnalyzer extends Logging {
         row._4.equals("COL_SCHEMA_MISMATCH") || row._4.equals("SOURCE_DATA_CHANGE"))
       .map(_._2)
       .distinct
+      .sorted
       .filterNot(appliedIndexNames.contains(_))
       .filterNot(applicableButNotAppliedIndexNames.contains(_))
     printIndexNames(indexNamesForNoApplicablePlan)

diff --git a/src/main/scala/com/microsoft/hyperspace/index/rules/ColumnSchemaFilter.scala b/src/main/scala/com/microsoft/hyperspace/index/rules/ColumnSchemaFilter.scala
@@ -34,8 +34,8 @@ object ColumnSchemaFilter extends SourcePlanIndexFilter {
         plan,
         index,
         FilterReasons.ColSchemaMismatch(
-          relationColumnNames.mkString(", "),
-          index.derivedDataset.referencedColumns.mkString(", "))) {
+          relationColumnNames.mkString(","),
+          index.derivedDataset.referencedColumns.mkString(","))) {
         ResolverUtils
           .resolve(spark, index.derivedDataset.referencedColumns, relationColumnNames)
           .isDefined

diff --git a/src/test/resources/expected/spark-2.4/whyNot_allIndex.txt b/src/test/resources/expected/spark-2.4/whyNot_allIndex.txt
@@ -0,0 +1,63 @@
+
+=============================================================
+Plan with Hyperspace & Summary:
+=============================================================
+Join Inner, (c3# = c3#)
+:- Project [c4#, c3#]
+:  +- Filter ((isnotnull(c4#) && (c4# = 2)) && isnotnull(c3#))
+:     +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1)
++- Project [c5#, c3#]
+   +- Filter ((isnotnull(c5#) && (c5# = 3000)) && isnotnull(c3#))
+      +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1)
+
+Applied indexes:
+- leftDfFilterIndex
+- rightDfFilterIndex
+
+Applicable indexes, but not applied due to priority:
+- leftDfJoinIndex
+- rightDfJoinIndex
+
+Non-applicable indexes - index is outdated:
+- No such index found.
+
+Non-applicable indexes - no applicable query plan:
+- No such index found.
+
+For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis
+
+=============================================================
+Plan without Hyperspace & WhyNot reasons:
+=============================================================
+00 Join Inner, (c3# = c3#)
+01 :- Project [c4#, c3#]
+02 :  +- Filter ((isnotnull(c4#) && (c4# = 2)) && isnotnull(c3#))
+03 :     +- Relation[c1#,c2#,c3#,c4#,c5#] parquet
+04 +- Project [c5#, c3#]
+05    +- Filter ((isnotnull(c5#) && (c5# = 3000)) && isnotnull(c3#))
+06       +- Relation[c1#,c2#,c3#,c4#,c5#] parquet
+
++-----------+------------------+---------+-------------------------+------------------------------------------------------------+
+|SubPlan    |IndexName         |IndexType|Reason                   |Message                                                     |
++-----------+------------------+---------+-------------------------+------------------------------------------------------------+
+|Filter @ 2 |leftDfFilterIndex |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c4,c3]            |
+|Filter @ 2 |leftDfJoinIndex   |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4]            |
+|Filter @ 2 |rightDfFilterIndex|CI       |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3]                    |
+|Filter @ 2 |rightDfJoinIndex  |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5]            |
+|Filter @ 5 |leftDfFilterIndex |CI       |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3]                    |
+|Filter @ 5 |leftDfJoinIndex   |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4]            |
+|Filter @ 5 |rightDfFilterIndex|CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c5,c3]            |
+|Filter @ 5 |rightDfJoinIndex  |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5]            |
+|Join @ 0   |leftDfFilterIndex |CI       |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c4]               |
+|Join @ 0   |leftDfFilterIndex |CI       |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c4]              |
+|Join @ 0   |leftDfJoinIndex   |CI       |MISSING_INDEXED_COL      |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]|
+|Join @ 0   |rightDfFilterIndex|CI       |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c5]               |
+|Join @ 0   |rightDfFilterIndex|CI       |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c5]              |
+|Join @ 0   |rightDfJoinIndex  |CI       |MISSING_INDEXED_COL      |child=[left], requiredIndexedCols=[c4,c3], IndexedCols=[c3] |
+|Project @ 1|leftDfJoinIndex   |CI       |ANOTHER_INDEX_APPLIED    |appliedIndex=[leftDfFilterIndex]                            |
+|Project @ 1|rightDfFilterIndex|CI       |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3]                    |
+|Project @ 1|rightDfJoinIndex  |CI       |MISSING_REQUIRED_COL     |requiredCols=[c4,c3], indexCols=[c3,c5]                     |
+|Project @ 4|leftDfFilterIndex |CI       |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3]                    |
+|Project @ 4|leftDfJoinIndex   |CI       |MISSING_REQUIRED_COL     |requiredCols=[c5,c3], indexCols=[c3,c4]                     |
+|Project @ 4|rightDfJoinIndex  |CI       |ANOTHER_INDEX_APPLIED    |appliedIndex=[rightDfFilterIndex]                           |
++-----------+------------------+---------+-------------------------+------------------------------------------------------------+
diff --git a/src/test/resources/expected/spark-2.4/whyNot_indexName.txt b/src/test/resources/expected/spark-2.4/whyNot_indexName.txt
@@ -0,0 +1,48 @@
+
+=============================================================
+Plan with Hyperspace & Summary:
+=============================================================
+Join Inner, (c3# = c3#)
+:- Project [c4#, c3#]
+:  +- Filter ((isnotnull(c4#) && (c4# = 2)) && isnotnull(c3#))
+:     +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1)
++- Project [c5#, c3#]
+   +- Filter ((isnotnull(c5#) && (c5# = 3000)) && isnotnull(c3#))
+      +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1)
+
+Applied indexes:
+- leftDfFilterIndex
+- rightDfFilterIndex
+
+Applicable indexes, but not applied due to priority:
+- leftDfJoinIndex
+- rightDfJoinIndex
+
+Non-applicable indexes - index is outdated:
+- No such index found.
+
+Non-applicable indexes - no applicable query plan:
+- No such index found.
+
+For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis
+
+=============================================================
+Plan without Hyperspace & WhyNot reasons:
+=============================================================
+00 Join Inner, (c3# = c3#)
+01 :- Project [c4#, c3#]
+02 :  +- Filter ((isnotnull(c4#) && (c4# = 2)) && isnotnull(c3#))
+03 :     +- Relation[c1#,c2#,c3#,c4#,c5#] parquet
+04 +- Project [c5#, c3#]
+05    +- Filter ((isnotnull(c5#) && (c5# = 3000)) && isnotnull(c3#))
+06       +- Relation[c1#,c2#,c3#,c4#,c5#] parquet
+
++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
+|SubPlan    |IndexName      |IndexType|Reason               |Message                                                     |VerboseMessage                                                                                                     |
++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
+|Filter @ 2 |leftDfJoinIndex|CI       |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4]            |Index does not contain required columns. Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4]                |
+|Filter @ 5 |leftDfJoinIndex|CI       |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4]            |Index does not contain required columns. Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4]                |
+|Join @ 0   |leftDfJoinIndex|CI       |MISSING_INDEXED_COL  |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]|Index does not contain required columns for right subplan. Required indexed columns: [c5,c3], Indexed columns: [c3]|
+|Project @ 1|leftDfJoinIndex|CI       |ANOTHER_INDEX_APPLIED|appliedIndex=[leftDfFilterIndex]                            |Another candidate index is applied: leftDfFilterIndex                                                              |
+|Project @ 4|leftDfJoinIndex|CI       |MISSING_REQUIRED_COL |requiredCols=[c5,c3], indexCols=[c3,c4]                     |Index does not contain required columns. Required columns: [c5,c3], Index columns: [c3,c4]                         |
++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
diff --git a/src/test/resources/expected/spark-3.0/whyNot_allIndex.txt b/src/test/resources/expected/spark-3.0/whyNot_allIndex.txt
@@ -0,0 +1,63 @@
+
+=============================================================
+Plan with Hyperspace & Summary:
+=============================================================
+Join Inner, (c3# = c3#)
+:- Project [c4#, c3#]
+:  +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#))
+:     +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1)
++- Project [c5#, c3#]
+   +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#))
+      +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1)
+
+Applied indexes:
+- leftDfFilterIndex
+- rightDfFilterIndex
+
+Applicable indexes, but not applied due to priority:
+- leftDfJoinIndex
+- rightDfJoinIndex
+
+Non-applicable indexes - index is outdated:
+- No such index found.
+
+Non-applicable indexes - no applicable query plan:
+- No such index found.
+
+For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis
+
+=============================================================
+Plan without Hyperspace & WhyNot reasons:
+=============================================================
+00 Join Inner, (c3# = c3#)
+01 :- Project [c4#, c3#]
+02 :  +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#))
+03 :     +- Relation[c1#,c2#,c3#,c4#,c5#] parquet
+04 +- Project [c5#, c3#]
+05    +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#))
+06       +- Relation[c1#,c2#,c3#,c4#,c5#] parquet
+
++-----------+------------------+---------+-------------------------+------------------------------------------------------------+
+|SubPlan    |IndexName         |IndexType|Reason                   |Message                                                     |
++-----------+------------------+---------+-------------------------+------------------------------------------------------------+
+|Filter @ 2 |leftDfFilterIndex |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c4,c3]            |
+|Filter @ 2 |leftDfJoinIndex   |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4]            |
+|Filter @ 2 |rightDfFilterIndex|CI       |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3]                    |
+|Filter @ 2 |rightDfJoinIndex  |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5]            |
+|Filter @ 5 |leftDfFilterIndex |CI       |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3]                    |
+|Filter @ 5 |leftDfJoinIndex   |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4]            |
+|Filter @ 5 |rightDfFilterIndex|CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c5,c3]            |
+|Filter @ 5 |rightDfJoinIndex  |CI       |MISSING_REQUIRED_COL     |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5]            |
+|Join @ 0   |leftDfFilterIndex |CI       |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c4]               |
+|Join @ 0   |leftDfFilterIndex |CI       |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c4]              |
+|Join @ 0   |leftDfJoinIndex   |CI       |MISSING_INDEXED_COL      |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]|
+|Join @ 0   |rightDfFilterIndex|CI       |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c5]               |
+|Join @ 0   |rightDfFilterIndex|CI       |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c5]              |
+|Join @ 0   |rightDfJoinIndex  |CI       |MISSING_INDEXED_COL      |child=[left], requiredIndexedCols=[c4,c3], IndexedCols=[c3] |
+|Project @ 1|leftDfJoinIndex   |CI       |ANOTHER_INDEX_APPLIED    |appliedIndex=[leftDfFilterIndex]                            |
+|Project @ 1|rightDfFilterIndex|CI       |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3]                    |
+|Project @ 1|rightDfJoinIndex  |CI       |MISSING_REQUIRED_COL     |requiredCols=[c4,c3], indexCols=[c3,c5]                     |
+|Project @ 4|leftDfFilterIndex |CI       |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3]                    |
+|Project @ 4|leftDfJoinIndex   |CI       |MISSING_REQUIRED_COL     |requiredCols=[c5,c3], indexCols=[c3,c4]                     |
+|Project @ 4|rightDfJoinIndex  |CI       |ANOTHER_INDEX_APPLIED    |appliedIndex=[rightDfFilterIndex]                           |
++-----------+------------------+---------+-------------------------+------------------------------------------------------------+
diff --git a/src/test/resources/expected/spark-3.0/whyNot_indexName.txt b/src/test/resources/expected/spark-3.0/whyNot_indexName.txt
@@ -0,0 +1,48 @@
+
+=============================================================
+Plan with Hyperspace & Summary:
+=============================================================
+Join Inner, (c3# = c3#)
+:- Project [c4#, c3#]
+:  +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#))
+:     +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1)
++- Project [c5#, c3#]
+   +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#))
+      +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1)
+
+Applied indexes:
+- leftDfFilterIndex
+- rightDfFilterIndex
+
+Applicable indexes, but not applied due to priority:
+- leftDfJoinIndex
+- rightDfJoinIndex
+
+Non-applicable indexes - index is outdated:
+- No such index found.
+
+Non-applicable indexes - no applicable query plan:
+- No such index found.
+
+For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis
+
+=============================================================
+Plan without Hyperspace & WhyNot reasons:
+=============================================================
+00 Join Inner, (c3# = c3#)
+01 :- Project [c4#, c3#]
+02 :  +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#))
+03 :     +- Relation[c1#,c2#,c3#,c4#,c5#] parquet
+04 +- Project [c5#, c3#]
+05    +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#))
+06       +- Relation[c1#,c2#,c3#,c4#,c5#] parquet
+
++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
+|SubPlan    |IndexName      |IndexType|Reason               |Message                                                     |VerboseMessage                                                                                                     |
++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+
+|Filter @ 2 |leftDfJoinIndex|CI       |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4]            |Index does not contain required columns. Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4]                |
+|Filter @ 5 |leftDfJoinIndex|CI       |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4]            |Index does not contain required columns. Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4]                |
+|Join @ 0   |leftDfJoinIndex|CI       |MISSING_INDEXED_COL  |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]|Index does not contain required columns for right subplan. Required indexed columns: [c5,c3], Indexed columns: [c3]|
+|Project @ 1|leftDfJoinIndex|CI       |ANOTHER_INDEX_APPLIED|appliedIndex=[leftDfFilterIndex]                            |Another candidate index is applied: leftDfFilterIndex                                                              |
+|Project @ 4|leftDfJoinIndex|CI       |MISSING_REQUIRED_COL |requiredCols=[c5,c3], indexCols=[c3,c4]                     |Index does not contain required columns. Required columns: [c5,c3], Index columns: [c3,c4]                         |
++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+