diff --git a/pkg/planner/cardinality/cross_estimation.go b/pkg/planner/cardinality/cross_estimation.go index d249a47502855..fb4b754d9cda0 100644 --- a/pkg/planner/cardinality/cross_estimation.go +++ b/pkg/planner/cardinality/cross_estimation.go @@ -156,7 +156,7 @@ func crossEstimateRowCount(sctx planctx.PlanContext, if idxExists && len(idxIDs) > 0 { idxID = idxIDs[0] } - rangeCounts, ok := getColumnRangeCounts(sctx, colUniqueID, ranges, dsTableStats.HistColl, idxID) + rangeCounts, _, ok := getColumnRangeCounts(sctx, colUniqueID, ranges, dsTableStats.HistColl, idxID) if !ok { return 0, false, corr } @@ -166,7 +166,7 @@ func crossEstimateRowCount(sctx planctx.PlanContext, } var rangeCount float64 if idxExists { - rangeCount, err = GetRowCountByIndexRanges(sctx, dsTableStats.HistColl, idxID, convertedRanges) + rangeCount, _, err = GetRowCountByIndexRanges(sctx, dsTableStats.HistColl, idxID, convertedRanges) } else { rangeCount, err = GetRowCountByColumnRanges(sctx, dsTableStats.HistColl, colUniqueID, convertedRanges) } @@ -182,30 +182,30 @@ func crossEstimateRowCount(sctx planctx.PlanContext, } // getColumnRangeCounts estimates row count for each range respectively. 
-func getColumnRangeCounts(sctx planctx.PlanContext, colID int64, ranges []*ranger.Range, histColl *statistics.HistColl, idxID int64) ([]float64, bool) { +func getColumnRangeCounts(sctx planctx.PlanContext, colID int64, ranges []*ranger.Range, histColl *statistics.HistColl, idxID int64) ([]float64, float64, bool) { var err error - var count float64 + var count, corrCount float64 rangeCounts := make([]float64, len(ranges)) for i, ran := range ranges { if idxID >= 0 { idxHist := histColl.GetIdx(idxID) if statistics.IndexStatsIsInvalid(sctx, idxHist, histColl, idxID) { - return nil, false + return nil, 0, false } - count, err = GetRowCountByIndexRanges(sctx, histColl, idxID, []*ranger.Range{ran}) + count, corrCount, err = GetRowCountByIndexRanges(sctx, histColl, idxID, []*ranger.Range{ran}) } else { colHist := histColl.GetCol(colID) if statistics.ColumnStatsIsInvalid(colHist, sctx, histColl, colID) { - return nil, false + return nil, 0, false } count, err = GetRowCountByColumnRanges(sctx, histColl, colID, []*ranger.Range{ran}) } if err != nil { - return nil, false + return nil, 0, false } rangeCounts[i] = count } - return rangeCounts, true + return rangeCounts, corrCount, true } // convertRangeFromExpectedCnt builds new ranges used to estimate row count we need to scan in table scan before finding specified diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index 08641e20b84dd..78dea02c59d52 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -38,7 +38,7 @@ import ( ) // GetRowCountByIndexRanges estimates the row count by a slice of Range. 
-func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, err error) { +func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, corrResult float64, err error) { var name string if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { debugtrace.EnterContextCommon(sctx) @@ -69,7 +69,7 @@ func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistCol if err == nil && sc.EnableOptimizerCETrace && idx != nil { ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats-Pseudo", uint64(result)) } - return result, err + return result, 0, err } realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx) if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { @@ -79,15 +79,16 @@ func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistCol "Increase Factor", idx.GetIncreaseFactor(realtimeCnt), ) } + corrResult = float64(0) if idx.CMSketch != nil && idx.StatsVer == statistics.Version1 { result, err = getIndexRowCountForStatsV1(sctx, coll, idxID, indexRanges) } else { - result, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount) + result, corrResult, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount) } if sc.EnableOptimizerCETrace { ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result)) } - return result, errors.Trace(err) + return result, corrResult, errors.Trace(err) } func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (float64, error) { @@ -117,7 +118,7 @@ func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistC // values in this case. 
if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) { realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx) - count, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount) + count, _, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -181,7 +182,7 @@ func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistC // prefer index stats over column stats if idxIDs, ok := coll.ColUniqueID2IdxIDs[colUniqueID]; ok && len(idxIDs) > 0 { idxID := idxIDs[0] - count, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang}) + count, _, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang}) } else { count, err = GetRowCountByColumnRanges(sctx, coll, colUniqueID, []*ranger.Range{&rang}) } @@ -215,26 +216,26 @@ func isSingleColIdxNullRange(idx *statistics.Index, ran *ranger.Range) bool { } // It uses the modifyCount to validate, and realtimeRowCount to adjust the influence of modifications on the table. -func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, error) { +func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (totalCount float64, corrCount float64, err error) { sc := sctx.GetSessionVars().StmtCtx debugTrace := sc.EnableOptimizerDebugTrace if debugTrace { debugtrace.EnterContextCommon(sctx) defer debugtrace.LeaveContextCommon(sctx) } - totalCount := float64(0) isSingleColIdx := len(idx.Info.Columns) == 1 for _, indexRange := range indexRanges { var count float64 - lb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...) 
+ var lb, rb []byte + lb, err = codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...) err = sc.HandleError(err) if err != nil { - return 0, err + return 0, 0, err } - rb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.HighVal...) + rb, err = codec.EncodeKey(sc.TimeZone(), nil, indexRange.HighVal...) err = sc.HandleError(err) if err != nil { - return 0, err + return 0, 0, err } if debugTrace { debugTraceStartEstimateRange(sctx, indexRange, lb, rb, totalCount) @@ -293,13 +294,14 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, // Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything. // If the first column's range is point. if rangePosition := getOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer >= statistics.Version2 && coll != nil { - var expBackoffSel float64 - expBackoffSel, expBackoffSuccess, err = expBackoffEstimation(sctx, idx, coll, indexRange) + var expBackoffSel, corrSel float64 + expBackoffSel, corrSel, expBackoffSuccess, err = expBackoffEstimation(sctx, idx, coll, indexRange) if err != nil { - return 0, err + return 0, 0, err } if expBackoffSuccess { expBackoffCnt := expBackoffSel * idx.TotalRowCount() + corrCnt := corrSel * idx.TotalRowCount() upperLimit := expBackoffCnt // Use the multi-column stats to calculate the max possible row count of [l, r) @@ -326,6 +328,7 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, expBackoffCnt = upperLimit } count += expBackoffCnt + corrCount += corrCnt } } if !expBackoffSuccess { @@ -335,6 +338,7 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, // If the current table row count has changed, we should scale the row count accordingly. 
increaseFactor := idx.GetIncreaseFactor(realtimeRowCount) count *= increaseFactor + corrCount *= increaseFactor // handling the out-of-range part if (outOfRangeOnIndex(idx, l) && !(isSingleColIdx && lowIsNull)) || outOfRangeOnIndex(idx, r) { @@ -369,7 +373,7 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, // Don't allow the final result to go below 1 row totalCount = mathutil.Clamp(totalCount, 1, float64(realtimeRowCount)) } - return totalCount, nil + return totalCount, corrCount, nil } var nullKeyBytes, _ = codec.EncodeKey(time.UTC, nil, types.NewDatum(nil)) @@ -429,7 +433,7 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b } // expBackoffEstimation estimate the multi-col cases following the Exponential Backoff. See comment below for details. -func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRange *ranger.Range) (sel float64, success bool, err error) { +func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRange *ranger.Range) (sel float64, corrsel float64, success bool, err error) { if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { debugtrace.EnterContextCommon(sctx) defer func() { @@ -485,7 +489,7 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll continue } foundStats = true - count, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan) + count, _, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan) if err == nil { break } @@ -497,7 +501,7 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll continue } if err != nil { - return 0, false, err + return 0, 0, false, err } singleColumnEstResults = append(singleColumnEstResults, selectivity) } @@ -509,9 +513,9 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll l = 0 }) if l == 1 { - return singleColumnEstResults[0], true, nil 
+ return singleColumnEstResults[0], 0, true, nil } else if l == 0 { - return 0, false, nil + return 0, 0, false, nil } // Do not allow the exponential backoff to go below the available index bound. If the number of predicates // is less than the number of index columns - use 90% of the bound to differentiate a subset from full index match. @@ -524,19 +528,21 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll if l < len(idx.Info.Columns) { idxLowBound /= 0.9 } + // corrsel is the selectivity of the most filtering column + corrsel = min(idxLowBound, singleColumnEstResults[0]) minTwoCol := min(singleColumnEstResults[0], singleColumnEstResults[1], idxLowBound) multTwoCol := singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]) if l == 2 { - return max(minTwoCol, multTwoCol), true, nil + return max(minTwoCol, multTwoCol), corrsel, true, nil } minThreeCol := min(minTwoCol, singleColumnEstResults[2]) multThreeCol := multTwoCol * math.Sqrt(math.Sqrt(singleColumnEstResults[2])) if l == 3 { - return max(minThreeCol, multThreeCol), true, nil + return max(minThreeCol, multThreeCol), corrsel, true, nil } minFourCol := min(minThreeCol, singleColumnEstResults[3]) multFourCol := multThreeCol * math.Sqrt(math.Sqrt(math.Sqrt(singleColumnEstResults[3]))) - return max(minFourCol, multFourCol), true, nil + return max(minFourCol, multFourCol), corrsel, true, nil } // outOfRangeOnIndex checks if the datum is out of the range. 
diff --git a/pkg/planner/cardinality/selectivity.go b/pkg/planner/cardinality/selectivity.go index 1fe6eb84524b3..4a42857b1029b 100644 --- a/pkg/planner/cardinality/selectivity.go +++ b/pkg/planner/cardinality/selectivity.go @@ -202,11 +202,12 @@ func Selectivity( if err != nil { return 0, nil, errors.Trace(err) } - cnt, err := GetRowCountByIndexRanges(ctx, coll, id, ranges) + cnt, corrCnt, err := GetRowCountByIndexRanges(ctx, coll, id, ranges) if err != nil { return 0, nil, errors.Trace(err) } selectivity := cnt / float64(coll.RealtimeCount) + corrSelectivity := corrCnt / float64(coll.RealtimeCount) nodes = append(nodes, &StatsNode{ Tp: IndexType, ID: id, @@ -214,6 +215,7 @@ func Selectivity( Ranges: ranges, numCols: len(idxStats.Info.Columns), Selectivity: selectivity, + CorrSelectivity: corrSelectivity, partCover: partCover, minAccessCondsForDNFCond: minAccessCondsForDNFCond, }) @@ -543,6 +545,10 @@ type StatsNode struct { mask int64 // Selectivity indicates the Selectivity of this column/index. Selectivity float64 + // CorrSelectivity indicates the Selectivity of this column/index with correlated column. + // That is - it is the selectivity of the most filtering column only, assuming all other + // columns are fully correlated with it and filter out no additional rows. + CorrSelectivity float64 // numCols is the number of columns contained in the index or column(which is always 1). numCols int // partCover indicates whether the bit in the mask is for a full cover or partial cover.
It is only true diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index e645f3d863e79..42c51acbed469 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -252,11 +252,11 @@ func TestEstimationForUnknownValues(t *testing.T) { require.Equal(t, 12.2, count) idxID := table.Meta().Indices[0].ID - count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(30, 30)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(30, 30)) require.NoError(t, err) require.Equal(t, 0.1, count) - count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(9, 30)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(9, 30)) require.NoError(t, err) require.Equal(t, 10.0, count) @@ -286,7 +286,7 @@ func TestEstimationForUnknownValues(t *testing.T) { require.Equal(t, 1.0, count) idxID = table.Meta().Indices[0].ID - count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(2, 2)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(2, 2)) require.NoError(t, err) require.Equal(t, 0.0, count) } @@ -377,11 +377,11 @@ func TestEstimationUniqueKeyEqualConds(t *testing.T) { sctx := mock.NewContext() idxID := table.Meta().Indices[0].ID - count, err := cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(7, 7)) + count, _, err := cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(7, 7)) require.NoError(t, err) require.Equal(t, 1.0, count) - count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(6, 6)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(6, 6)) require.NoError(t, err) require.Equal(t, 1.0, count) @@ -1011,12 +1011,12 
@@ func TestIssue39593(t *testing.T) { sctx := testKit.Session() idxID := tblInfo.Indices[0].ID vals := []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20} - count, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals)) + count, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals)) require.NoError(t, err) // estimated row count without any changes require.Equal(t, float64(360), count) statsTbl.RealtimeCount *= 10 - count, err = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals)) require.NoError(t, err) // estimated row count after mock modify on the table require.Equal(t, float64(3600), count) diff --git a/pkg/planner/core/debugtrace.go b/pkg/planner/core/debugtrace.go index 254c278847ed1..b2ee3166e64d0 100644 --- a/pkg/planner/core/debugtrace.go +++ b/pkg/planner/core/debugtrace.go @@ -227,13 +227,14 @@ func stabilizeGetStatsTblInfo(info *getStatsTblInfo) { */ type accessPathForDebugTrace struct { - IndexName string `json:",omitempty"` - AccessConditions []string - IndexFilters []string - TableFilters []string - PartialPaths []accessPathForDebugTrace `json:",omitempty"` - CountAfterAccess float64 - CountAfterIndex float64 + IndexName string `json:",omitempty"` + AccessConditions []string + IndexFilters []string + TableFilters []string + PartialPaths []accessPathForDebugTrace `json:",omitempty"` + CountAfterAccess float64 + CorrCountAfterAccess float64 + CountAfterIndex float64 } func convertAccessPathForDebugTrace(ctx expression.EvalContext, path *util.AccessPath, out *accessPathForDebugTrace) { @@ -244,6 +245,7 @@ func convertAccessPathForDebugTrace(ctx expression.EvalContext, path *util.Acces out.IndexFilters = 
expression.ExprsToStringsForDisplay(ctx, path.IndexFilters) out.TableFilters = expression.ExprsToStringsForDisplay(ctx, path.TableFilters) out.CountAfterAccess = path.CountAfterAccess + out.CorrCountAfterAccess = path.CorrCountAfterAccess out.CountAfterIndex = path.CountAfterIndex out.PartialPaths = make([]accessPathForDebugTrace, len(path.PartialIndexPaths)) for i, partialPath := range path.PartialIndexPaths { diff --git a/pkg/planner/core/exhaust_physical_plans.go b/pkg/planner/core/exhaust_physical_plans.go index 3c47bcce17fab..96f315b091c67 100644 --- a/pkg/planner/core/exhaust_physical_plans.go +++ b/pkg/planner/core/exhaust_physical_plans.go @@ -1347,10 +1347,11 @@ func constructInnerIndexScanTask( rowCount = math.Min(rowCount, 1.0) } tmpPath := &util.AccessPath{ - IndexFilters: indexConds, - TableFilters: tblConds, - CountAfterIndex: rowCount, - CountAfterAccess: rowCount, + IndexFilters: indexConds, + TableFilters: tblConds, + CountAfterIndex: rowCount, + CountAfterAccess: rowCount, + CorrCountAfterAccess: 0, } // Assume equal conditions used by index join and other conditions are independent. 
if len(tblConds) > 0 { diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index ac93c82bdb5f4..d85f3efdabc86 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -728,32 +728,50 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop * if statsTbl != nil && rhs.path.Index != nil { rhsHasStatistics = statsTbl.ColAndIdxExistenceMap.HasAnalyzed(rhs.path.Index.ID, true) } - if !lhs.path.IsTablePath() && !rhs.path.IsTablePath() && // Not a table scan - (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics - (!lhsHasStatistics || !rhsHasStatistics) && // At least one index doesn't have statistics - len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { // not IndexMerge due to unreliability - lhsTotalEqual := lhs.path.EqCondCount + lhs.path.EqOrInCondCount - rhsTotalEqual := rhs.path.EqCondCount + rhs.path.EqOrInCondCount - if lhsHasStatistics && lhsTotalEqual > 0 && lhsTotalEqual >= rhsTotalEqual { + lhsTotalEqual := lhs.path.EqCondCount + lhs.path.EqOrInCondCount + rhsTotalEqual := rhs.path.EqCondCount + rhs.path.EqOrInCondCount + lhsMoreFilters := (lhsTotalEqual > rhsTotalEqual || (lhsTotalEqual > 0 && lhsTotalEqual == rhsTotalEqual && len(lhs.path.IndexFilters) >= len(rhs.path.IndexFilters))) + rhsMoreFilters := (rhsTotalEqual > lhsTotalEqual || (rhsTotalEqual > 0 && rhsTotalEqual == lhsTotalEqual && len(rhs.path.IndexFilters) >= len(lhs.path.IndexFilters))) + + if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && + !lhs.path.IsTablePath() && !rhs.path.IsTablePath() { // Not a table scan + if (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics + (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics + if lhsHasStatistics && lhsTotalEqual > 0 && lhsMoreFilters { + return 1 + } + if rhsHasStatistics && rhsTotalEqual > 0 && 
rhsMoreFilters { + return -1 + } + } + + lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 + if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { + lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess + rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess + } + if lhsMoreFilters && lhsCorrRatio < rhsCorrRatio { return 1 } - if rhsHasStatistics && rhsTotalEqual > 0 && rhsTotalEqual >= lhsTotalEqual { + if rhsMoreFilters && rhsCorrRatio < lhsCorrRatio { return -1 } - } - // This rule is empirical but not always correct. - // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. - if lhs.path.CountAfterAccess > 100 && rhs.path.CountAfterAccess > 100 && // to prevent some extreme cases, e.g. 0.01 : 10 - len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && // not IndexMerge since its row count estimation is not accurate enough - prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count - threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) - if threshold > 0 { // set it to 0 to disable this rule - if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold { - return -1 - } - if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold { - return 1 + // This rule is empirical but not always correct. + // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. + if lhs.path.CorrCountAfterAccess > 100 && rhs.path.CorrCountAfterAccess > 100 && // to prevent some extreme cases, e.g. 
0.01 : 10 + len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && // not IndexMerge since its row count estimation is not accurate enough + prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count + threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) + if threshold > 0 { // set it to 0 to disable this rule + if rhsMoreFilters && + (lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold || rhsCorrRatio < lhsCorrRatio) { + return -1 + } + if lhsMoreFilters && + (rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold || lhsCorrRatio < rhsCorrRatio) { + return 1 + } } } } diff --git a/pkg/planner/core/stats.go b/pkg/planner/core/stats.go index d48f711f5cc34..50ada5e873e31 100644 --- a/pkg/planner/core/stats.go +++ b/pkg/planner/core/stats.go @@ -176,6 +176,7 @@ func fillIndexPath(ds *logicalop.DataSource, path *util.AccessPath, conds []expr } path.Ranges = ranger.FullRange() path.CountAfterAccess = float64(ds.StatisticTable.RealtimeCount) + path.CorrCountAfterAccess = 0 path.IdxCols, path.IdxColLens = expression.IndexInfo2PrefixCols(ds.Columns, ds.Schema().Columns, path.Index) path.FullIdxCols, path.FullIdxColLens = expression.IndexInfo2Cols(ds.Columns, ds.Schema().Columns, path.Index) if !path.Index.Unique && !path.Index.Primary && len(path.Index.Columns) == len(path.IdxCols) { @@ -404,7 +405,7 @@ func detachCondAndBuildRangeForPath( path.ConstCols[i] = res.ColumnValues[i] != nil } } - path.CountAfterAccess, err = cardinality.GetRowCountByIndexRanges(sctx, histColl, path.Index.ID, path.Ranges) + path.CountAfterAccess, path.CorrCountAfterAccess, err = cardinality.GetRowCountByIndexRanges(sctx, histColl, path.Index.ID, path.Ranges) return err } diff --git a/pkg/planner/util/path.go b/pkg/planner/util/path.go index a8ccf7df379f2..99b1ea2990480 100644 --- a/pkg/planner/util/path.go +++ b/pkg/planner/util/path.go @@ -41,6 +41,10 @@ type 
AccessPath struct { // CountAfterAccess is the row count after we apply range seek and before we use other filter to filter data. // For index merge path, CountAfterAccess is the row count after partial paths and before we apply table filters. CountAfterAccess float64 + // CorrCountAfterAccess is the row count after only applying the most filtering index columns + // against the index. This is used when we don't have full index statistics + // and we need to use the exponential backoff to estimate the row count. + CorrCountAfterAccess float64 // CountAfterIndex is the row count after we apply filters on index and before we apply the table filters. CountAfterIndex float64 AccessConds []expression.Expression @@ -132,6 +136,7 @@ func (path *AccessPath) Clone() *AccessPath { ConstCols: slices.Clone(path.ConstCols), Ranges: CloneRanges(path.Ranges), CountAfterAccess: path.CountAfterAccess, + CorrCountAfterAccess: path.CorrCountAfterAccess, CountAfterIndex: path.CountAfterIndex, AccessConds: CloneExprs(path.AccessConds), EqCondCount: path.EqCondCount, diff --git a/pkg/statistics/statistics_test.go b/pkg/statistics/statistics_test.go index cfaf69cc7f68b..3afdbf30e0b86 100644 --- a/pkg/statistics/statistics_test.go +++ b/pkg/statistics/statistics_test.go @@ -395,51 +395,51 @@ func SubTestIndexRanges() func(*testing.T) { HighVal: []types.Datum{types.MaxValueDatum()}, Collators: collate.GetBinaryCollatorSlice(1), }} - count, err := GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err := GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 99900, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(2000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 2500, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1001) ran[0].HighVal[0] =
types.NewIntDatum(1999) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 2500, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(1000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 100, int(count)) tbl.SetIdx(0, &Index{Info: &model.IndexInfo{Columns: []*model.IndexColumn{{Offset: 0}}, Unique: true}}) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(1000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 1, int(count)) tbl.SetIdx(0, idx) ran[0].LowVal[0] = types.MinNotNullDatum() ran[0].HighVal[0] = types.MaxValueDatum() - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 100000, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(2000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 1000, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1001) ran[0].HighVal[0] = types.NewIntDatum(1990) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 989, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(1000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) 
require.NoError(t, err) require.Equal(t, 1, int(count)) } diff --git a/pkg/statistics/table.go b/pkg/statistics/table.go index 75c50e1921ce6..b386a308431c7 100644 --- a/pkg/statistics/table.go +++ b/pkg/statistics/table.go @@ -50,7 +50,7 @@ var ( // Note: all functions below will be removed after finishing moving all estimation functions into the cardinality package. // GetRowCountByIndexRanges is a function type to get row count by index ranges. - GetRowCountByIndexRanges func(sctx planctx.PlanContext, coll *HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, err error) + GetRowCountByIndexRanges func(sctx planctx.PlanContext, coll *HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, corrResult float64, err error) // GetRowCountByIntColumnRanges is a function type to get row count by int column ranges. GetRowCountByIntColumnRanges func(sctx planctx.PlanContext, coll *HistColl, colID int64, intRanges []*ranger.Range) (result float64, err error) diff --git a/tests/integrationtest/r/imdbload.result b/tests/integrationtest/r/imdbload.result index 787c49b2e81eb..3dc0c532d4ccb 100644 --- a/tests/integrationtest/r/imdbload.result +++ b/tests/integrationtest/r/imdbload.result @@ -276,9 +276,9 @@ load stats 's/imdbload_stats/movie_info.json'; load stats 's/imdbload_stats/cast_info.json'; explain select * from char_name where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436')); id estRows task access object operator info -IndexLookUp_10 2.00 root -├─IndexRangeScan_8(Build) 2.00 cop[tikv] table:char_name, index:itest2(imdb_index, surname_pcode, name_pcode_nf) range:["I" -inf,"I" "E436"), ["L" -inf,"L" "E436"), keep order:false -└─TableRowIDScan_9(Probe) 2.00 cop[tikv] table:char_name keep order:false +TableReader_7 2.00 root data:Selection_6 +└─Selection_6 2.00 cop[tikv] or(and(eq(imdbload.char_name.imdb_index, "I"), lt(imdbload.char_name.surname_pcode, "E436")), 
and(eq(imdbload.char_name.imdb_index, "L"), lt(imdbload.char_name.surname_pcode, "E436"))) + └─TableFullScan_5 4314864.00 cop[tikv] table:char_name keep order:false explain select * from char_name use index (itest2) where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436')); id estRows task access object operator info IndexLookUp_7 2.00 root @@ -350,20 +350,18 @@ CE_trace explain select * from keyword where ((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers')); id estRows task access object operator info -IndexLookUp_11 901.00 root -├─IndexRangeScan_8(Build) 901.00 cop[tikv] table:keyword, index:itest(phonetic_code, keyword) range:("R1652" "ecg-monitor","R1652" "killers"), keep order:false -└─Selection_10(Probe) 901.00 cop[tikv] gt(imdbload.keyword.keyword, "ecg-monitor"), lt(imdbload.keyword.keyword, "killers") - └─TableRowIDScan_9 901.00 cop[tikv] table:keyword keep order:false +TableReader_7 901.00 root data:Selection_6 +└─Selection_6 901.00 cop[tikv] eq(imdbload.keyword.phonetic_code, "R1652"), gt(imdbload.keyword.keyword, "ecg-monitor"), lt(imdbload.keyword.keyword, "killers") + └─TableFullScan_5 236627.00 cop[tikv] table:keyword keep order:false trace plan target = 'estimation' select * from keyword where ((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers')); CE_trace [{"table_name":"keyword","type":"Column Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":236627},{"table_name":"keyword","type":"Column Stats-Range","expr":"((keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":44075},{"table_name":"keyword","type":"Index Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Index Stats-Range","expr":"((keyword >= 'ecg-m' and keyword <= 
'kille'))","row_count":44036},{"table_name":"keyword","type":"Index Stats-Range","expr":"((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":901},{"table_name":"keyword","type":"Table Stats-Expression-CNF","expr":"`and`(`eq`(imdbload.keyword.phonetic_code, 'R1652'), `and`(`gt`(imdbload.keyword.keyword, 'ecg-monitor'), `lt`(imdbload.keyword.keyword, 'killers')))","row_count":901}] explain select * from cast_info where (nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: Florida'); id estRows task access object operator info -IndexLookUp_11 144633.00 root -├─IndexRangeScan_8(Build) 144633.00 cop[tikv] table:cast_info, index:itest2(nr_order, person_role_id, note) range:[NULL 2 "(key set pa: Florida",NULL 2 +inf], keep order:false -└─Selection_10(Probe) 144633.00 cop[tikv] ge(imdbload.cast_info.note, "(key set pa: Florida") - └─TableRowIDScan_9 144633.00 cop[tikv] table:cast_info keep order:false +TableReader_7 144633.00 root data:Selection_6 +└─Selection_6 144633.00 cop[tikv] eq(imdbload.cast_info.person_role_id, 2), ge(imdbload.cast_info.note, "(key set pa: Florida"), isnull(imdbload.cast_info.nr_order) + └─TableFullScan_5 63475835.00 cop[tikv] table:cast_info keep order:false trace plan target = 'estimation' select * from cast_info where (nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: Florida'); CE_trace [{"table_name":"cast_info","type":"Column Stats-Point","expr":"((nr_order is null))","row_count":45995275},{"table_name":"cast_info","type":"Column Stats-Point","expr":"((person_role_id = 2))","row_count":2089611},{"table_name":"cast_info","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":63475835},{"table_name":"cast_info","type":"Column Stats-Range","expr":"((note >= '(key set pa: Florida' and true))","row_count":14934328},{"table_name":"cast_info","type":"Index Stats-Point","expr":"((person_role_id = 
2))","row_count":2089611},{"table_name":"cast_info","type":"Index Stats-Range","expr":"((nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: Florida' and true))","row_count":144633},{"table_name":"cast_info","type":"Table Stats-Expression-CNF","expr":"`and`(`isnull`(imdbload.cast_info.nr_order), `and`(`eq`(imdbload.cast_info.person_role_id, 2), `ge`(imdbload.cast_info.note, '(key set pa: Florida')))","row_count":144633},{"table_name":"cast_info","type":"Table Stats-Expression-CNF","expr":"`eq`(imdbload.cast_info.person_role_id, 2)","row_count":2089611}]