diff --git a/src/main/scala/com/microsoft/hyperspace/index/covering/FilterIndexRule.scala b/src/main/scala/com/microsoft/hyperspace/index/covering/FilterIndexRule.scala index 24ec60f79..fc510aec4 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/covering/FilterIndexRule.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/covering/FilterIndexRule.scala @@ -99,7 +99,7 @@ object FilterColumnFilter extends QueryPlanIndexFilter { index, FilterReasons.NoFirstIndexedColCond( index.derivedDataset.indexedColumns.head, - filterColumnNames.mkString(", "))) { + filterColumnNames.mkString(","))) { ResolverUtils .resolve(spark, index.derivedDataset.indexedColumns.head, filterColumnNames) .isDefined diff --git a/src/main/scala/com/microsoft/hyperspace/index/covering/JoinIndexRule.scala b/src/main/scala/com/microsoft/hyperspace/index/covering/JoinIndexRule.scala index 8f2441263..eabc2aad3 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/covering/JoinIndexRule.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/covering/JoinIndexRule.scala @@ -431,8 +431,8 @@ object JoinColumnFilter extends QueryPlanIndexFilter { idx, FilterReasons.NotAllJoinColIndexed( leftOrRight, - requiredIndexCols.mkString(", "), - idx.indexedColumns.mkString(", "))) { + requiredIndexCols.mkString(","), + idx.indexedColumns.mkString(","))) { requiredIndexCols.toSet.equals(idx.indexedColumns.toSet) } && withFilterReasonTag( @@ -440,8 +440,8 @@ object JoinColumnFilter extends QueryPlanIndexFilter { idx, FilterReasons.MissingIndexedCol( leftOrRight, - allRequiredCols.mkString(", "), - idx.indexedColumns.mkString(", "))) { + allRequiredCols.mkString(","), + idx.indexedColumns.mkString(","))) { allRequiredCols.forall(allCols.contains) } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/plananalysis/CandidateIndexAnalyzer.scala b/src/main/scala/com/microsoft/hyperspace/index/plananalysis/CandidateIndexAnalyzer.scala index 2227b1f84..16dd3e641 100644 --- 
a/src/main/scala/com/microsoft/hyperspace/index/plananalysis/CandidateIndexAnalyzer.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/plananalysis/CandidateIndexAnalyzer.scala @@ -119,7 +119,7 @@ object CandidateIndexAnalyzer extends Logging { } } } - .sortBy(r => (r._1, r._3)) + .sortBy(r => (r._1, r._2, r._3, r._4)) .distinct if (res.isEmpty) { @@ -186,12 +186,14 @@ object CandidateIndexAnalyzer extends Logging { matchStr.group(1) } .toSeq + .distinct + .sorted printIndexNames(appliedIndexNames) stringBuilder.append("Applicable indexes, but not applied due to priority:") stringBuilder.append(newLine) val applicableButNotAppliedIndexNames = - applicableIndexes.map(_._1.name).distinct.filterNot(appliedIndexNames.contains(_)) + applicableIndexes.map(_._1.name).distinct.sorted.filterNot(appliedIndexNames.contains(_)) printIndexNames(applicableButNotAppliedIndexNames) // Covert reasons to Dataframe rows. @@ -214,7 +216,7 @@ object CandidateIndexAnalyzer extends Logging { } } } - .sortBy(r => (r._1, r._3)) + .sortBy(r => (r._1, r._2, r._3, r._4, r._5)) .distinct import spark.implicits._ @@ -233,6 +235,7 @@ object CandidateIndexAnalyzer extends Logging { .filter(row => row._4.equals("SOURCE_DATA_CHANGE")) .map(_._2) .distinct + .sorted .filterNot(appliedIndexNames.contains(_)) .filterNot(applicableButNotAppliedIndexNames.contains(_)) printIndexNames(indexNamesForOutdated) @@ -245,6 +248,7 @@ object CandidateIndexAnalyzer extends Logging { row._4.equals("COL_SCHEMA_MISMATCH") || row._4.equals("SOURCE_DATA_CHANGE")) .map(_._2) .distinct + .sorted .filterNot(appliedIndexNames.contains(_)) .filterNot(applicableButNotAppliedIndexNames.contains(_)) printIndexNames(indexNamesForNoApplicablePlan) diff --git a/src/main/scala/com/microsoft/hyperspace/index/rules/ColumnSchemaFilter.scala b/src/main/scala/com/microsoft/hyperspace/index/rules/ColumnSchemaFilter.scala index cecb98476..dfd8d25b1 100644 --- 
a/src/main/scala/com/microsoft/hyperspace/index/rules/ColumnSchemaFilter.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/rules/ColumnSchemaFilter.scala @@ -34,8 +34,8 @@ object ColumnSchemaFilter extends SourcePlanIndexFilter { plan, index, FilterReasons.ColSchemaMismatch( - relationColumnNames.mkString(", "), - index.derivedDataset.referencedColumns.mkString(", "))) { + relationColumnNames.mkString(","), + index.derivedDataset.referencedColumns.mkString(","))) { ResolverUtils .resolve(spark, index.derivedDataset.referencedColumns, relationColumnNames) .isDefined diff --git a/src/test/resources/expected/spark-2.4/whyNot_allIndex.txt b/src/test/resources/expected/spark-2.4/whyNot_allIndex.txt new file mode 100644 index 000000000..96809a4ef --- /dev/null +++ b/src/test/resources/expected/spark-2.4/whyNot_allIndex.txt @@ -0,0 +1,63 @@ + +============================================================= +Plan with Hyperspace & Summary: +============================================================= +Join Inner, (c3# = c3#) +:- Project [c4#, c3#] +: +- Filter ((isnotnull(c4#) && (c4# = 2)) && isnotnull(c3#)) +: +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1) ++- Project [c5#, c3#] + +- Filter ((isnotnull(c5#) && (c5# = 3000)) && isnotnull(c3#)) + +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1) + +Applied indexes: +- leftDfFilterIndex +- rightDfFilterIndex + +Applicable indexes, but not applied due to priority: +- leftDfJoinIndex +- rightDfJoinIndex + +Non-applicable indexes - index is outdated: +- No such index found. + +Non-applicable indexes - no applicable query plan: +- No such index found. 
+ +For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis + +============================================================= +Plan without Hyperspace & WhyNot reasons: +============================================================= +00 Join Inner, (c3# = c3#) +01 :- Project [c4#, c3#] +02 : +- Filter ((isnotnull(c4#) && (c4# = 2)) && isnotnull(c3#)) +03 : +- Relation[c1#,c2#,c3#,c4#,c5#] parquet +04 +- Project [c5#, c3#] +05 +- Filter ((isnotnull(c5#) && (c5# = 3000)) && isnotnull(c3#)) +06 +- Relation[c1#,c2#,c3#,c4#,c5#] parquet + ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ +|SubPlan |IndexName |IndexType|Reason |Message | ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ +|Filter @ 2 |leftDfFilterIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c4,c3] | +|Filter @ 2 |leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] | +|Filter @ 2 |rightDfFilterIndex|CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3] | +|Filter @ 2 |rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5] | +|Filter @ 5 |leftDfFilterIndex |CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3] | +|Filter @ 5 |leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] | +|Filter @ 5 |rightDfFilterIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c5,c3] | +|Filter @ 5 |rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5] | +|Join @ 0 |leftDfFilterIndex |CI |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c4] | +|Join @ 0 |leftDfFilterIndex |CI |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c4] | +|Join @ 0 
|leftDfJoinIndex |CI |MISSING_INDEXED_COL |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]| +|Join @ 0 |rightDfFilterIndex|CI |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c5] | +|Join @ 0 |rightDfFilterIndex|CI |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c5] | +|Join @ 0 |rightDfJoinIndex |CI |MISSING_INDEXED_COL |child=[left], requiredIndexedCols=[c4,c3], IndexedCols=[c3] | +|Project @ 1|leftDfJoinIndex |CI |ANOTHER_INDEX_APPLIED |appliedIndex=[leftDfFilterIndex] | +|Project @ 1|rightDfFilterIndex|CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3] | +|Project @ 1|rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c4,c3], indexCols=[c3,c5] | +|Project @ 4|leftDfFilterIndex |CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3] | +|Project @ 4|leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c5,c3], indexCols=[c3,c4] | +|Project @ 4|rightDfJoinIndex |CI |ANOTHER_INDEX_APPLIED |appliedIndex=[rightDfFilterIndex] | ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ diff --git a/src/test/resources/expected/spark-2.4/whyNot_indexName.txt b/src/test/resources/expected/spark-2.4/whyNot_indexName.txt new file mode 100644 index 000000000..09802beef --- /dev/null +++ b/src/test/resources/expected/spark-2.4/whyNot_indexName.txt @@ -0,0 +1,48 @@ + +============================================================= +Plan with Hyperspace & Summary: +============================================================= +Join Inner, (c3# = c3#) +:- Project [c4#, c3#] +: +- Filter ((isnotnull(c4#) && (c4# = 2)) && isnotnull(c3#)) +: +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1) ++- Project [c5#, c3#] + +- Filter ((isnotnull(c5#) && (c5# = 3000)) && isnotnull(c3#)) + +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1) + +Applied indexes: 
+- leftDfFilterIndex +- rightDfFilterIndex + +Applicable indexes, but not applied due to priority: +- leftDfJoinIndex +- rightDfJoinIndex + +Non-applicable indexes - index is outdated: +- No such index found. + +Non-applicable indexes - no applicable query plan: +- No such index found. + +For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis + +============================================================= +Plan without Hyperspace & WhyNot reasons: +============================================================= +00 Join Inner, (c3# = c3#) +01 :- Project [c4#, c3#] +02 : +- Filter ((isnotnull(c4#) && (c4# = 2)) && isnotnull(c3#)) +03 : +- Relation[c1#,c2#,c3#,c4#,c5#] parquet +04 +- Project [c5#, c3#] +05 +- Filter ((isnotnull(c5#) && (c5# = 3000)) && isnotnull(c3#)) +06 +- Relation[c1#,c2#,c3#,c4#,c5#] parquet + ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ +|SubPlan |IndexName |IndexType|Reason |Message |VerboseMessage | ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ +|Filter @ 2 |leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] |Index does not contain required columns. Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4] | +|Filter @ 5 |leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] |Index does not contain required columns. 
Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4] | +|Join @ 0 |leftDfJoinIndex|CI |MISSING_INDEXED_COL |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]|Index does not contain required columns for right subplan. Required indexed columns: [c5,c3], Indexed columns: [c3]| +|Project @ 1|leftDfJoinIndex|CI |ANOTHER_INDEX_APPLIED|appliedIndex=[leftDfFilterIndex] |Another candidate index is applied: leftDfFilterIndex | +|Project @ 4|leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c5,c3], indexCols=[c3,c4] |Index does not contain required columns. Required columns: [c5,c3], Index columns: [c3,c4] | ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ diff --git a/src/test/resources/expected/spark-3.0/whyNot_allIndex.txt b/src/test/resources/expected/spark-3.0/whyNot_allIndex.txt new file mode 100644 index 000000000..b3520b5b4 --- /dev/null +++ b/src/test/resources/expected/spark-3.0/whyNot_allIndex.txt @@ -0,0 +1,63 @@ + +============================================================= +Plan with Hyperspace & Summary: +============================================================= +Join Inner, (c3# = c3#) +:- Project [c4#, c3#] +: +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#)) +: +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1) ++- Project [c5#, c3#] + +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#)) + +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1) + +Applied indexes: +- leftDfFilterIndex +- rightDfFilterIndex + +Applicable indexes, but not applied due to priority: +- leftDfJoinIndex +- rightDfJoinIndex + +Non-applicable indexes - index is outdated: +- No such index found. + +Non-applicable indexes - no applicable query plan: +- No such index found. 
+ +For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis + +============================================================= +Plan without Hyperspace & WhyNot reasons: +============================================================= +00 Join Inner, (c3# = c3#) +01 :- Project [c4#, c3#] +02 : +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#)) +03 : +- Relation[c1#,c2#,c3#,c4#,c5#] parquet +04 +- Project [c5#, c3#] +05 +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#)) +06 +- Relation[c1#,c2#,c3#,c4#,c5#] parquet + ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ +|SubPlan |IndexName |IndexType|Reason |Message | ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ +|Filter @ 2 |leftDfFilterIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c4,c3] | +|Filter @ 2 |leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] | +|Filter @ 2 |rightDfFilterIndex|CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3] | +|Filter @ 2 |rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5] | +|Filter @ 5 |leftDfFilterIndex |CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3] | +|Filter @ 5 |leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] | +|Filter @ 5 |rightDfFilterIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c5,c3] | +|Filter @ 5 |rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5] | +|Join @ 0 |leftDfFilterIndex |CI |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c4] | +|Join @ 0 |leftDfFilterIndex |CI |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c4] | +|Join @ 0 
|leftDfJoinIndex |CI |MISSING_INDEXED_COL |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]| +|Join @ 0 |rightDfFilterIndex|CI |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c5] | +|Join @ 0 |rightDfFilterIndex|CI |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c5] | +|Join @ 0 |rightDfJoinIndex |CI |MISSING_INDEXED_COL |child=[left], requiredIndexedCols=[c4,c3], IndexedCols=[c3] | +|Project @ 1|leftDfJoinIndex |CI |ANOTHER_INDEX_APPLIED |appliedIndex=[leftDfFilterIndex] | +|Project @ 1|rightDfFilterIndex|CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3] | +|Project @ 1|rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c4,c3], indexCols=[c3,c5] | +|Project @ 4|leftDfFilterIndex |CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3] | +|Project @ 4|leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c5,c3], indexCols=[c3,c4] | +|Project @ 4|rightDfJoinIndex |CI |ANOTHER_INDEX_APPLIED |appliedIndex=[rightDfFilterIndex] | ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ diff --git a/src/test/resources/expected/spark-3.0/whyNot_indexName.txt b/src/test/resources/expected/spark-3.0/whyNot_indexName.txt new file mode 100644 index 000000000..81db114e1 --- /dev/null +++ b/src/test/resources/expected/spark-3.0/whyNot_indexName.txt @@ -0,0 +1,48 @@ + +============================================================= +Plan with Hyperspace & Summary: +============================================================= +Join Inner, (c3# = c3#) +:- Project [c4#, c3#] +: +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#)) +: +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1) ++- Project [c5#, c3#] + +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#)) + +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1) + +Applied 
indexes: +- leftDfFilterIndex +- rightDfFilterIndex + +Applicable indexes, but not applied due to priority: +- leftDfJoinIndex +- rightDfJoinIndex + +Non-applicable indexes - index is outdated: +- No such index found. + +Non-applicable indexes - no applicable query plan: +- No such index found. + +For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis + +============================================================= +Plan without Hyperspace & WhyNot reasons: +============================================================= +00 Join Inner, (c3# = c3#) +01 :- Project [c4#, c3#] +02 : +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#)) +03 : +- Relation[c1#,c2#,c3#,c4#,c5#] parquet +04 +- Project [c5#, c3#] +05 +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#)) +06 +- Relation[c1#,c2#,c3#,c4#,c5#] parquet + ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ +|SubPlan |IndexName |IndexType|Reason |Message |VerboseMessage | ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ +|Filter @ 2 |leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] |Index does not contain required columns. Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4] | +|Filter @ 5 |leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] |Index does not contain required columns. 
Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4] | +|Join @ 0 |leftDfJoinIndex|CI |MISSING_INDEXED_COL |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]|Index does not contain required columns for right subplan. Required indexed columns: [c5,c3], Indexed columns: [c3]| +|Project @ 1|leftDfJoinIndex|CI |ANOTHER_INDEX_APPLIED|appliedIndex=[leftDfFilterIndex] |Another candidate index is applied: leftDfFilterIndex | +|Project @ 4|leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c5,c3], indexCols=[c3,c4] |Index does not contain required columns. Required columns: [c5,c3], Index columns: [c3,c4] | ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ diff --git a/src/test/resources/expected/spark-3.1/whyNot_allIndex.txt b/src/test/resources/expected/spark-3.1/whyNot_allIndex.txt new file mode 100644 index 000000000..aefb6de6d --- /dev/null +++ b/src/test/resources/expected/spark-3.1/whyNot_allIndex.txt @@ -0,0 +1,63 @@ + +============================================================= +Plan with Hyperspace & Summary: +============================================================= +Join Inner, (c3# = c3#) +:- Project [c4#, c3#] +: +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#)) +: +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1) ++- Project [c5#, c3#] + +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#)) + +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1) + +Applied indexes: +- leftDfFilterIndex +- rightDfFilterIndex + +Applicable indexes, but not applied due to priority: +- leftDfJoinIndex +- rightDfJoinIndex + +Non-applicable indexes - index is outdated: +- No such index found. + +Non-applicable indexes - no applicable query plan: +- No such index found. 
+ +For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis + +============================================================= +Plan without Hyperspace & WhyNot reasons: +============================================================= +00 Join Inner, (c3# = c3#) +01 :- Project [c4#, c3#] +02 : +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#)) +03 : +- Relation[c1#,c2#,c3#,c4#,c5#] parquet +04 +- Project [c5#, c3#] +05 +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#)) +06 +- Relation[c1#,c2#,c3#,c4#,c5#] parquet + ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ +|SubPlan |IndexName |IndexType|Reason |Message | ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ +|Filter @ 2 |leftDfFilterIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c4,c3] | +|Filter @ 2 |leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] | +|Filter @ 2 |rightDfFilterIndex|CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3] | +|Filter @ 2 |rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5] | +|Filter @ 5 |leftDfFilterIndex |CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3] | +|Filter @ 5 |leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] | +|Filter @ 5 |rightDfFilterIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c5,c3] | +|Filter @ 5 |rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c5] | +|Join @ 0 |leftDfFilterIndex |CI |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c4] | +|Join @ 0 |leftDfFilterIndex |CI |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c4] | +|Join @ 0 
|leftDfJoinIndex |CI |MISSING_INDEXED_COL |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]| +|Join @ 0 |rightDfFilterIndex|CI |NOT_ALL_JOIN_COL_INDEXED |child=[left], joinCols=[c3], indexedCols=[c5] | +|Join @ 0 |rightDfFilterIndex|CI |NOT_ALL_JOIN_COL_INDEXED |child=[right], joinCols=[c3], indexedCols=[c5] | +|Join @ 0 |rightDfJoinIndex |CI |MISSING_INDEXED_COL |child=[left], requiredIndexedCols=[c4,c3], IndexedCols=[c3] | +|Project @ 1|leftDfJoinIndex |CI |ANOTHER_INDEX_APPLIED |appliedIndex=[leftDfFilterIndex] | +|Project @ 1|rightDfFilterIndex|CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c5], filterCols=[c4,c3] | +|Project @ 1|rightDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c4,c3], indexCols=[c3,c5] | +|Project @ 4|leftDfFilterIndex |CI |NO_FIRST_INDEXED_COL_COND|firstIndexedCol=[c4], filterCols=[c5,c3] | +|Project @ 4|leftDfJoinIndex |CI |MISSING_REQUIRED_COL |requiredCols=[c5,c3], indexCols=[c3,c4] | +|Project @ 4|rightDfJoinIndex |CI |ANOTHER_INDEX_APPLIED |appliedIndex=[rightDfFilterIndex] | ++-----------+------------------+---------+-------------------------+------------------------------------------------------------+ diff --git a/src/test/resources/expected/spark-3.1/whyNot_indexName.txt b/src/test/resources/expected/spark-3.1/whyNot_indexName.txt new file mode 100644 index 000000000..81db114e1 --- /dev/null +++ b/src/test/resources/expected/spark-3.1/whyNot_indexName.txt @@ -0,0 +1,48 @@ + +============================================================= +Plan with Hyperspace & Summary: +============================================================= +Join Inner, (c3# = c3#) +:- Project [c4#, c3#] +: +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#)) +: +- Relation[c3#,c4#] Hyperspace(Type: CI, Name: leftDfFilterIndex, LogVersion: 1) ++- Project [c5#, c3#] + +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#)) + +- Relation[c3#,c5#] Hyperspace(Type: CI, Name: rightDfFilterIndex, LogVersion: 1) + +Applied 
indexes: +- leftDfFilterIndex +- rightDfFilterIndex + +Applicable indexes, but not applied due to priority: +- leftDfJoinIndex +- rightDfJoinIndex + +Non-applicable indexes - index is outdated: +- No such index found. + +Non-applicable indexes - no applicable query plan: +- No such index found. + +For more information, please visit: https://microsoft.github.io/hyperspace/docs/why-not-result-analysis + +============================================================= +Plan without Hyperspace & WhyNot reasons: +============================================================= +00 Join Inner, (c3# = c3#) +01 :- Project [c4#, c3#] +02 : +- Filter ((isnotnull(c4#) AND (c4# = 2)) AND isnotnull(c3#)) +03 : +- Relation[c1#,c2#,c3#,c4#,c5#] parquet +04 +- Project [c5#, c3#] +05 +- Filter ((isnotnull(c5#) AND (c5# = 3000)) AND isnotnull(c3#)) +06 +- Relation[c1#,c2#,c3#,c4#,c5#] parquet + ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ +|SubPlan |IndexName |IndexType|Reason |Message |VerboseMessage | ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ +|Filter @ 2 |leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] |Index does not contain required columns. Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4] | +|Filter @ 5 |leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c3,c4,c5,c2,c1], indexCols=[c3,c4] |Index does not contain required columns. 
Required columns: [c3,c4,c5,c2,c1], Index columns: [c3,c4] | +|Join @ 0 |leftDfJoinIndex|CI |MISSING_INDEXED_COL |child=[right], requiredIndexedCols=[c5,c3], IndexedCols=[c3]|Index does not contain required columns for right subplan. Required indexed columns: [c5,c3], Indexed columns: [c3]| +|Project @ 1|leftDfJoinIndex|CI |ANOTHER_INDEX_APPLIED|appliedIndex=[leftDfFilterIndex] |Another candidate index is applied: leftDfFilterIndex | +|Project @ 4|leftDfJoinIndex|CI |MISSING_REQUIRED_COL |requiredCols=[c5,c3], indexCols=[c3,c4] |Index does not contain required columns. Required columns: [c5,c3], Index columns: [c3,c4] | ++-----------+---------------+---------+---------------------+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ diff --git a/src/test/scala/com/microsoft/hyperspace/index/rules/ScoreBasedIndexPlanOptimizerTest.scala b/src/test/scala/com/microsoft/hyperspace/index/rules/ScoreBasedIndexPlanOptimizerTest.scala index 8f5cdc3d6..91053ce86 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/rules/ScoreBasedIndexPlanOptimizerTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/rules/ScoreBasedIndexPlanOptimizerTest.scala @@ -108,14 +108,31 @@ class ScoreBasedIndexPlanOptimizerTest extends QueryTest with HyperspaceSuite { assert(rightChildScore == 50) assert(!rightChildPlan.equals(plan.children.last)) - hyperspace.whyNot(query(leftDf, rightDf)()) - hyperspace.whyNot(query(leftDf, rightDf)(), "leftDfJoinIndex", extended = true) - hyperspace.explain(query(leftDf, rightDf)(), verbose = true) - verifyIndexUsage( query(leftDf, rightDf), getIndexFilesPath(leftDfFilterIndexConfig.indexName) ++ getIndexFilesPath(rightDfFilterIndexConfig.indexName)) + + def normalize(str: String): String = { + // Expression ids are removed before comparison since they can be different.
+ str.replaceAll("""#(\d+)|subquery(\d+)""", "#") + } + + // Verify whyNot result. + hyperspace.whyNot(query(leftDf, rightDf)()) { o => + val expectedOutput = getExpectedResult("whyNot_allIndex.txt") + .replace(System.lineSeparator(), "\n") + val actual = normalize(o.replace(System.lineSeparator(), "\n")) + assert(actual.equals(expectedOutput), actual) + } + + // Verify whyNot result for a single index with extended output. + hyperspace.whyNot(query(leftDf, rightDf)(), "leftDfJoinIndex", extended = true) { o => + val expectedOutput = getExpectedResult("whyNot_indexName.txt") + .replace(System.lineSeparator(), "\n") + val actual = normalize(o.replace(System.lineSeparator(), "\n")) + assert(actual.equals(expectedOutput), actual) + } } } }