From 3df44deb847a1e1ed203b96a8179742a5b28d141 Mon Sep 17 00:00:00 2001 From: tpp Date: Sat, 4 Jan 2025 22:02:15 -0600 Subject: [PATCH 01/23] planner: Recognize potential for correlation in subset index match --- pkg/planner/cardinality/cross_estimation.go | 18 +++---- pkg/planner/cardinality/row_count_index.go | 50 +++++++++-------- pkg/planner/cardinality/selectivity.go | 8 ++- pkg/planner/core/debugtrace.go | 16 +++--- pkg/planner/core/exhaust_physical_plans.go | 9 ++-- pkg/planner/core/find_best_task.go | 60 +++++++++++++-------- pkg/planner/core/stats.go | 3 +- pkg/planner/util/path.go | 5 ++ pkg/statistics/statistics_test.go | 18 +++---- pkg/statistics/table.go | 2 +- 10 files changed, 113 insertions(+), 76 deletions(-) diff --git a/pkg/planner/cardinality/cross_estimation.go b/pkg/planner/cardinality/cross_estimation.go index d249a47502855..fb4b754d9cda0 100644 --- a/pkg/planner/cardinality/cross_estimation.go +++ b/pkg/planner/cardinality/cross_estimation.go @@ -156,7 +156,7 @@ func crossEstimateRowCount(sctx planctx.PlanContext, if idxExists && len(idxIDs) > 0 { idxID = idxIDs[0] } - rangeCounts, ok := getColumnRangeCounts(sctx, colUniqueID, ranges, dsTableStats.HistColl, idxID) + rangeCounts, _, ok := getColumnRangeCounts(sctx, colUniqueID, ranges, dsTableStats.HistColl, idxID) if !ok { return 0, false, corr } @@ -166,7 +166,7 @@ func crossEstimateRowCount(sctx planctx.PlanContext, } var rangeCount float64 if idxExists { - rangeCount, err = GetRowCountByIndexRanges(sctx, dsTableStats.HistColl, idxID, convertedRanges) + rangeCount, _, err = GetRowCountByIndexRanges(sctx, dsTableStats.HistColl, idxID, convertedRanges) } else { rangeCount, err = GetRowCountByColumnRanges(sctx, dsTableStats.HistColl, colUniqueID, convertedRanges) } @@ -182,30 +182,30 @@ func crossEstimateRowCount(sctx planctx.PlanContext, } // getColumnRangeCounts estimates row count for each range respectively. 
-func getColumnRangeCounts(sctx planctx.PlanContext, colID int64, ranges []*ranger.Range, histColl *statistics.HistColl, idxID int64) ([]float64, bool) { +func getColumnRangeCounts(sctx planctx.PlanContext, colID int64, ranges []*ranger.Range, histColl *statistics.HistColl, idxID int64) ([]float64, float64, bool) { var err error - var count float64 + var count, corrCount float64 rangeCounts := make([]float64, len(ranges)) for i, ran := range ranges { if idxID >= 0 { idxHist := histColl.GetIdx(idxID) if statistics.IndexStatsIsInvalid(sctx, idxHist, histColl, idxID) { - return nil, false + return nil, 0, false } - count, err = GetRowCountByIndexRanges(sctx, histColl, idxID, []*ranger.Range{ran}) + count, corrCount, err = GetRowCountByIndexRanges(sctx, histColl, idxID, []*ranger.Range{ran}) } else { colHist := histColl.GetCol(colID) if statistics.ColumnStatsIsInvalid(colHist, sctx, histColl, colID) { - return nil, false + return nil, 0, false } count, err = GetRowCountByColumnRanges(sctx, histColl, colID, []*ranger.Range{ran}) } if err != nil { - return nil, false + return nil, 0, false } rangeCounts[i] = count } - return rangeCounts, true + return rangeCounts, corrCount, true } // convertRangeFromExpectedCnt builds new ranges used to estimate row count we need to scan in table scan before finding specified diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index 08641e20b84dd..ab6116cba6e08 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -38,7 +38,7 @@ import ( ) // GetRowCountByIndexRanges estimates the row count by a slice of Range. 
-func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, err error) { +func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, corrResult float64, err error) { var name string if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { debugtrace.EnterContextCommon(sctx) @@ -69,7 +69,7 @@ func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistCol if err == nil && sc.EnableOptimizerCETrace && idx != nil { ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats-Pseudo", uint64(result)) } - return result, err + return result, 0, err } realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx) if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { @@ -79,15 +79,16 @@ func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistCol "Increase Factor", idx.GetIncreaseFactor(realtimeCnt), ) } + corrResult = float64(0) if idx.CMSketch != nil && idx.StatsVer == statistics.Version1 { result, err = getIndexRowCountForStatsV1(sctx, coll, idxID, indexRanges) } else { - result, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount) + result, corrResult, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount) } if sc.EnableOptimizerCETrace { ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result)) } - return result, errors.Trace(err) + return result, corrResult, errors.Trace(err) } func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (float64, error) { @@ -117,7 +118,7 @@ func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistC // values in this case. 
if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) { realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx) - count, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount) + count, _, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -181,7 +182,7 @@ func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistC // prefer index stats over column stats if idxIDs, ok := coll.ColUniqueID2IdxIDs[colUniqueID]; ok && len(idxIDs) > 0 { idxID := idxIDs[0] - count, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang}) + count, _, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang}) } else { count, err = GetRowCountByColumnRanges(sctx, coll, colUniqueID, []*ranger.Range{&rang}) } @@ -215,26 +216,26 @@ func isSingleColIdxNullRange(idx *statistics.Index, ran *ranger.Range) bool { } // It uses the modifyCount to validate, and realtimeRowCount to adjust the influence of modifications on the table. -func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, error) { +func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, float64, error) { sc := sctx.GetSessionVars().StmtCtx debugTrace := sc.EnableOptimizerDebugTrace if debugTrace { debugtrace.EnterContextCommon(sctx) defer debugtrace.LeaveContextCommon(sctx) } - totalCount := float64(0) + totalCount, corrCount := float64(0), float64(0) isSingleColIdx := len(idx.Info.Columns) == 1 for _, indexRange := range indexRanges { var count float64 lb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...) 
err = sc.HandleError(err) if err != nil { - return 0, err + return 0, 0, err } rb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.HighVal...) err = sc.HandleError(err) if err != nil { - return 0, err + return 0, 0, err } if debugTrace { debugTraceStartEstimateRange(sctx, indexRange, lb, rb, totalCount) @@ -293,13 +294,14 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, // Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything. // If the first column's range is point. if rangePosition := getOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer >= statistics.Version2 && coll != nil { - var expBackoffSel float64 - expBackoffSel, expBackoffSuccess, err = expBackoffEstimation(sctx, idx, coll, indexRange) + var expBackoffSel, corrSel float64 + expBackoffSel, corrSel, expBackoffSuccess, err = expBackoffEstimation(sctx, idx, coll, indexRange) if err != nil { - return 0, err + return 0, 0, err } if expBackoffSuccess { expBackoffCnt := expBackoffSel * idx.TotalRowCount() + corrCnt := corrSel * idx.TotalRowCount() upperLimit := expBackoffCnt // Use the multi-column stats to calculate the max possible row count of [l, r) @@ -326,6 +328,7 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, expBackoffCnt = upperLimit } count += expBackoffCnt + corrCount += corrCnt } } if !expBackoffSuccess { @@ -335,6 +338,7 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, // If the current table row count has changed, we should scale the row count accordingly. 
increaseFactor := idx.GetIncreaseFactor(realtimeRowCount) count *= increaseFactor + corrCount *= increaseFactor // handling the out-of-range part if (outOfRangeOnIndex(idx, l) && !(isSingleColIdx && lowIsNull)) || outOfRangeOnIndex(idx, r) { @@ -369,7 +373,7 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, // Don't allow the final result to go below 1 row totalCount = mathutil.Clamp(totalCount, 1, float64(realtimeRowCount)) } - return totalCount, nil + return totalCount, corrCount, nil } var nullKeyBytes, _ = codec.EncodeKey(time.UTC, nil, types.NewDatum(nil)) @@ -429,7 +433,7 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b } // expBackoffEstimation estimate the multi-col cases following the Exponential Backoff. See comment below for details. -func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRange *ranger.Range) (sel float64, success bool, err error) { +func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRange *ranger.Range) (sel float64, corrsel float64, success bool, err error) { if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { debugtrace.EnterContextCommon(sctx) defer func() { @@ -485,7 +489,7 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll continue } foundStats = true - count, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan) + count, _, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan) if err == nil { break } @@ -497,7 +501,7 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll continue } if err != nil { - return 0, false, err + return 0, 0, false, err } singleColumnEstResults = append(singleColumnEstResults, selectivity) } @@ -509,9 +513,9 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll l = 0 }) if l == 1 { - return singleColumnEstResults[0], true, nil 
+ return singleColumnEstResults[0], 0, true, nil } else if l == 0 { - return 0, false, nil + return 0, 0, false, nil } // Do not allow the exponential backoff to go below the available index bound. If the number of predicates // is less than the number of index columns - use 90% of the bound to differentiate a subset from full index match. @@ -524,19 +528,21 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll if l < len(idx.Info.Columns) { idxLowBound /= 0.9 } + // corrsel is the selectivity of the most filtering column + corrsel = min(idxLowBound, singleColumnEstResults[0]) minTwoCol := min(singleColumnEstResults[0], singleColumnEstResults[1], idxLowBound) multTwoCol := singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]) if l == 2 { - return max(minTwoCol, multTwoCol), true, nil + return max(minTwoCol, multTwoCol), corrsel, true, nil } minThreeCol := min(minTwoCol, singleColumnEstResults[2]) multThreeCol := multTwoCol * math.Sqrt(math.Sqrt(singleColumnEstResults[2])) if l == 3 { - return max(minThreeCol, multThreeCol), true, nil + return max(minThreeCol, multThreeCol), corrsel, true, nil } minFourCol := min(minThreeCol, singleColumnEstResults[3]) multFourCol := multThreeCol * math.Sqrt(math.Sqrt(math.Sqrt(singleColumnEstResults[3]))) - return max(minFourCol, multFourCol), true, nil + return max(minFourCol, multFourCol), corrsel, true, nil } // outOfRangeOnIndex checks if the datum is out of the range. 
diff --git a/pkg/planner/cardinality/selectivity.go b/pkg/planner/cardinality/selectivity.go index 1fe6eb84524b3..4a42857b1029b 100644 --- a/pkg/planner/cardinality/selectivity.go +++ b/pkg/planner/cardinality/selectivity.go @@ -202,11 +202,12 @@ func Selectivity( if err != nil { return 0, nil, errors.Trace(err) } - cnt, err := GetRowCountByIndexRanges(ctx, coll, id, ranges) + cnt, corrCnt, err := GetRowCountByIndexRanges(ctx, coll, id, ranges) if err != nil { return 0, nil, errors.Trace(err) } selectivity := cnt / float64(coll.RealtimeCount) + corrSelectivity := corrCnt / float64(coll.RealtimeCount) nodes = append(nodes, &StatsNode{ Tp: IndexType, ID: id, @@ -214,6 +215,7 @@ func Selectivity( Ranges: ranges, numCols: len(idxStats.Info.Columns), Selectivity: selectivity, + CorrSelectivity: corrSelectivity, partCover: partCover, minAccessCondsForDNFCond: minAccessCondsForDNFCond, }) @@ -543,6 +545,10 @@ type StatsNode struct { mask int64 // Selectivity indicates the Selectivity of this column/index. Selectivity float64 + // CorrSelectivity indicates the Selectivity of this column/index with correlated column. + // That is - it is the selectivity assuming the most filtering column only, and all other + // columns are uncorrelated. + CorrSelectivity float64 // numCols is the number of columns contained in the index or column(which is always 1). numCols int // partCover indicates whether the bit in the mask is for a full cover or partial cover. 
It is only true diff --git a/pkg/planner/core/debugtrace.go b/pkg/planner/core/debugtrace.go index 254c278847ed1..b2ee3166e64d0 100644 --- a/pkg/planner/core/debugtrace.go +++ b/pkg/planner/core/debugtrace.go @@ -227,13 +227,14 @@ func stabilizeGetStatsTblInfo(info *getStatsTblInfo) { */ type accessPathForDebugTrace struct { - IndexName string `json:",omitempty"` - AccessConditions []string - IndexFilters []string - TableFilters []string - PartialPaths []accessPathForDebugTrace `json:",omitempty"` - CountAfterAccess float64 - CountAfterIndex float64 + IndexName string `json:",omitempty"` + AccessConditions []string + IndexFilters []string + TableFilters []string + PartialPaths []accessPathForDebugTrace `json:",omitempty"` + CountAfterAccess float64 + CorrCountAfterAccess float64 + CountAfterIndex float64 } func convertAccessPathForDebugTrace(ctx expression.EvalContext, path *util.AccessPath, out *accessPathForDebugTrace) { @@ -244,6 +245,7 @@ func convertAccessPathForDebugTrace(ctx expression.EvalContext, path *util.Acces out.IndexFilters = expression.ExprsToStringsForDisplay(ctx, path.IndexFilters) out.TableFilters = expression.ExprsToStringsForDisplay(ctx, path.TableFilters) out.CountAfterAccess = path.CountAfterAccess + out.CorrCountAfterAccess = path.CorrCountAfterAccess out.CountAfterIndex = path.CountAfterIndex out.PartialPaths = make([]accessPathForDebugTrace, len(path.PartialIndexPaths)) for i, partialPath := range path.PartialIndexPaths { diff --git a/pkg/planner/core/exhaust_physical_plans.go b/pkg/planner/core/exhaust_physical_plans.go index 3c47bcce17fab..96f315b091c67 100644 --- a/pkg/planner/core/exhaust_physical_plans.go +++ b/pkg/planner/core/exhaust_physical_plans.go @@ -1347,10 +1347,11 @@ func constructInnerIndexScanTask( rowCount = math.Min(rowCount, 1.0) } tmpPath := &util.AccessPath{ - IndexFilters: indexConds, - TableFilters: tblConds, - CountAfterIndex: rowCount, - CountAfterAccess: rowCount, + IndexFilters: indexConds, + TableFilters: 
tblConds, + CountAfterIndex: rowCount, + CountAfterAccess: rowCount, + CorrCountAfterAccess: 0, } // Assume equal conditions used by index join and other conditions are independent. if len(tblConds) > 0 { diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index ac93c82bdb5f4..b21611a8a7909 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -728,32 +728,48 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop * if statsTbl != nil && rhs.path.Index != nil { rhsHasStatistics = statsTbl.ColAndIdxExistenceMap.HasAnalyzed(rhs.path.Index.ID, true) } - if !lhs.path.IsTablePath() && !rhs.path.IsTablePath() && // Not a table scan - (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics - (!lhsHasStatistics || !rhsHasStatistics) && // At least one index doesn't have statistics - len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { // not IndexMerge due to unreliability - lhsTotalEqual := lhs.path.EqCondCount + lhs.path.EqOrInCondCount - rhsTotalEqual := rhs.path.EqCondCount + rhs.path.EqOrInCondCount - if lhsHasStatistics && lhsTotalEqual > 0 && lhsTotalEqual >= rhsTotalEqual { + lhsTotalEqual := lhs.path.EqCondCount + lhs.path.EqOrInCondCount + rhsTotalEqual := rhs.path.EqCondCount + rhs.path.EqOrInCondCount + + if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { + if !lhs.path.IsTablePath() && !rhs.path.IsTablePath() && // Not a table scan + (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics + (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics + if lhsHasStatistics && lhsTotalEqual > 0 && lhsTotalEqual >= rhsTotalEqual { + return 1 + } + if rhsHasStatistics && rhsTotalEqual > 0 && rhsTotalEqual >= lhsTotalEqual { + return -1 + } + } + + lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 + if lhs.path.CorrCountAfterAccess > 0 || 
rhs.path.CorrCountAfterAccess > 0 { + lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess + rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess + } + + if lhsTotalEqual >= rhsTotalEqual && lhsCorrRatio < rhsCorrRatio { return 1 } - if rhsHasStatistics && rhsTotalEqual > 0 && rhsTotalEqual >= lhsTotalEqual { + if rhsTotalEqual >= lhsTotalEqual && rhsCorrRatio < lhsCorrRatio { return -1 } - } - - // This rule is empirical but not always correct. - // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. - if lhs.path.CountAfterAccess > 100 && rhs.path.CountAfterAccess > 100 && // to prevent some extreme cases, e.g. 0.01 : 10 - len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && // not IndexMerge since its row count estimation is not accurate enough - prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count - threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) - if threshold > 0 { // set it to 0 to disable this rule - if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold { - return -1 - } - if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold { - return 1 + // This rule is empirical but not always correct. + // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. + if lhs.path.CorrCountAfterAccess > 100 && rhs.path.CorrCountAfterAccess > 100 && // to prevent some extreme cases, e.g. 
0.01 : 10 + len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && // not IndexMerge since its row count estimation is not accurate enough + prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count + threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) + if threshold > 0 { // set it to 0 to disable this rule + if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold && + (rhsCorrRatio < lhsCorrRatio || rhsTotalEqual > lhsTotalEqual) { + return -1 + } + if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold && + (lhsCorrRatio < rhsCorrRatio || lhsTotalEqual > rhsTotalEqual) { + return 1 + } } } } diff --git a/pkg/planner/core/stats.go b/pkg/planner/core/stats.go index d48f711f5cc34..50ada5e873e31 100644 --- a/pkg/planner/core/stats.go +++ b/pkg/planner/core/stats.go @@ -176,6 +176,7 @@ func fillIndexPath(ds *logicalop.DataSource, path *util.AccessPath, conds []expr } path.Ranges = ranger.FullRange() path.CountAfterAccess = float64(ds.StatisticTable.RealtimeCount) + path.CorrCountAfterAccess = 0 path.IdxCols, path.IdxColLens = expression.IndexInfo2PrefixCols(ds.Columns, ds.Schema().Columns, path.Index) path.FullIdxCols, path.FullIdxColLens = expression.IndexInfo2Cols(ds.Columns, ds.Schema().Columns, path.Index) if !path.Index.Unique && !path.Index.Primary && len(path.Index.Columns) == len(path.IdxCols) { @@ -404,7 +405,7 @@ func detachCondAndBuildRangeForPath( path.ConstCols[i] = res.ColumnValues[i] != nil } } - path.CountAfterAccess, err = cardinality.GetRowCountByIndexRanges(sctx, histColl, path.Index.ID, path.Ranges) + path.CountAfterAccess, path.CorrCountAfterAccess, err = cardinality.GetRowCountByIndexRanges(sctx, histColl, path.Index.ID, path.Ranges) return err } diff --git a/pkg/planner/util/path.go b/pkg/planner/util/path.go index a8ccf7df379f2..99b1ea2990480 100644 --- a/pkg/planner/util/path.go +++ b/pkg/planner/util/path.go 
@@ -41,6 +41,10 @@ type AccessPath struct { // CountAfterAccess is the row count after we apply range seek and before we use other filter to filter data. // For index merge path, CountAfterAccess is the row count after partial paths and before we apply table filters. CountAfterAccess float64 + // CorrCountAfterAccess is the row count after only applying the most filtering index columns. + // against the index. This is used when we don't have a full index statistics + // and we need to use the exponential backoff to estimate the row count. + CorrCountAfterAccess float64 // CountAfterIndex is the row count after we apply filters on index and before we apply the table filters. CountAfterIndex float64 AccessConds []expression.Expression @@ -132,6 +136,7 @@ func (path *AccessPath) Clone() *AccessPath { ConstCols: slices.Clone(path.ConstCols), Ranges: CloneRanges(path.Ranges), CountAfterAccess: path.CountAfterAccess, + CorrCountAfterAccess: path.CorrCountAfterAccess, CountAfterIndex: path.CountAfterIndex, AccessConds: CloneExprs(path.AccessConds), EqCondCount: path.EqCondCount, diff --git a/pkg/statistics/statistics_test.go b/pkg/statistics/statistics_test.go index cfaf69cc7f68b..3afdbf30e0b86 100644 --- a/pkg/statistics/statistics_test.go +++ b/pkg/statistics/statistics_test.go @@ -395,51 +395,51 @@ func SubTestIndexRanges() func(*testing.T) { HighVal: []types.Datum{types.MaxValueDatum()}, Collators: collate.GetBinaryCollatorSlice(1), }} - count, err := GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err := GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 99900, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(2000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 2500, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1001) 
ran[0].HighVal[0] = types.NewIntDatum(1999) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 2500, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(1000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 100, int(count)) tbl.SetIdx(0, &Index{Info: &model.IndexInfo{Columns: []*model.IndexColumn{{Offset: 0}}, Unique: true}}) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(1000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 1, int(count)) tbl.SetIdx(0, idx) ran[0].LowVal[0] = types.MinNotNullDatum() ran[0].HighVal[0] = types.MaxValueDatum() - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 100000, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(2000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 1000, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1001) ran[0].HighVal[0] = types.NewIntDatum(1990) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 989, int(count)) ran[0].LowVal[0] = types.NewIntDatum(1000) ran[0].HighVal[0] = types.NewIntDatum(1000) - count, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran) + count, _, err = GetRowCountByIndexRanges(ctx, 
&tbl.HistColl, 0, ran) require.NoError(t, err) require.Equal(t, 1, int(count)) } diff --git a/pkg/statistics/table.go b/pkg/statistics/table.go index 75c50e1921ce6..b386a308431c7 100644 --- a/pkg/statistics/table.go +++ b/pkg/statistics/table.go @@ -50,7 +50,7 @@ var ( // Note: all functions below will be removed after finishing moving all estimation functions into the cardinality package. // GetRowCountByIndexRanges is a function type to get row count by index ranges. - GetRowCountByIndexRanges func(sctx planctx.PlanContext, coll *HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, err error) + GetRowCountByIndexRanges func(sctx planctx.PlanContext, coll *HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, corrResult float64, err error) // GetRowCountByIntColumnRanges is a function type to get row count by int column ranges. GetRowCountByIntColumnRanges func(sctx planctx.PlanContext, coll *HistColl, colID int64, intRanges []*ranger.Range) (result float64, err error) From 8f2fdbe9e2c9e0a11878d38b04b8449810c1fd94 Mon Sep 17 00:00:00 2001 From: tpp Date: Sat, 4 Jan 2025 22:19:37 -0600 Subject: [PATCH 02/23] build error --- pkg/planner/cardinality/row_count_index.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index ab6116cba6e08..78dea02c59d52 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -216,23 +216,23 @@ func isSingleColIdxNullRange(idx *statistics.Index, ran *ranger.Range) bool { } // It uses the modifyCount to validate, and realtimeRowCount to adjust the influence of modifications on the table. 
-func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, float64, error) { +func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (totalCount float64, corrCount float64, err error) { sc := sctx.GetSessionVars().StmtCtx debugTrace := sc.EnableOptimizerDebugTrace if debugTrace { debugtrace.EnterContextCommon(sctx) defer debugtrace.LeaveContextCommon(sctx) } - totalCount, corrCount := float64(0), float64(0) isSingleColIdx := len(idx.Info.Columns) == 1 for _, indexRange := range indexRanges { var count float64 - lb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...) + var lb, rb []byte + lb, err = codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...) err = sc.HandleError(err) if err != nil { return 0, 0, err } - rb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.HighVal...) + rb, err = codec.EncodeKey(sc.TimeZone(), nil, indexRange.HighVal...) 
err = sc.HandleError(err) if err != nil { return 0, 0, err From ce4bf9a412c81b218b455098b40eff2833d3ea4a Mon Sep 17 00:00:00 2001 From: tpp Date: Sat, 4 Jan 2025 22:49:48 -0600 Subject: [PATCH 03/23] testcase1 --- pkg/planner/cardinality/selectivity_test.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index e645f3d863e79..42c51acbed469 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -252,11 +252,11 @@ func TestEstimationForUnknownValues(t *testing.T) { require.Equal(t, 12.2, count) idxID := table.Meta().Indices[0].ID - count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(30, 30)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(30, 30)) require.NoError(t, err) require.Equal(t, 0.1, count) - count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(9, 30)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(9, 30)) require.NoError(t, err) require.Equal(t, 10.0, count) @@ -286,7 +286,7 @@ func TestEstimationForUnknownValues(t *testing.T) { require.Equal(t, 1.0, count) idxID = table.Meta().Indices[0].ID - count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(2, 2)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(2, 2)) require.NoError(t, err) require.Equal(t, 0.0, count) } @@ -377,11 +377,11 @@ func TestEstimationUniqueKeyEqualConds(t *testing.T) { sctx := mock.NewContext() idxID := table.Meta().Indices[0].ID - count, err := cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(7, 7)) + count, _, err := cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(7, 7)) require.NoError(t, err) 
require.Equal(t, 1.0, count) - count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(6, 6)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(6, 6)) require.NoError(t, err) require.Equal(t, 1.0, count) @@ -1011,12 +1011,12 @@ func TestIssue39593(t *testing.T) { sctx := testKit.Session() idxID := tblInfo.Indices[0].ID vals := []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20} - count, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals)) + count, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals)) require.NoError(t, err) // estimated row count without any changes require.Equal(t, float64(360), count) statsTbl.RealtimeCount *= 10 - count, err = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals)) + count, _, err = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals)) require.NoError(t, err) // estimated row count after mock modify on the table require.Equal(t, float64(3600), count) From a27a439f307a9fdd040b16ef2eb1ba2512b1653e Mon Sep 17 00:00:00 2001 From: tpp Date: Sat, 4 Jan 2025 23:13:26 -0600 Subject: [PATCH 04/23] revision1 --- pkg/planner/core/find_best_task.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index b21611a8a7909..3d9b25cb926a7 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -730,15 +730,19 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop * } lhsTotalEqual := lhs.path.EqCondCount + lhs.path.EqOrInCondCount rhsTotalEqual := rhs.path.EqCondCount + rhs.path.EqOrInCondCount + lhsTotalIndexFilters := lhsTotalEqual + 
len(lhs.path.IndexFilters) + rhsTotalIndexFilters := rhsTotalEqual + len(rhs.path.IndexFilters) + lhsMoreFilters := lhsTotalEqual > 0 && (lhsTotalEqual > rhsTotalEqual || (lhsTotalEqual == rhsTotalEqual && lhsTotalIndexFilters >= rhsTotalIndexFilters)) + rhsMoreFilters := rhsTotalEqual > 0 && (rhsTotalEqual > lhsTotalEqual || (rhsTotalEqual == lhsTotalEqual && rhsTotalIndexFilters >= lhsTotalIndexFilters)) if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { if !lhs.path.IsTablePath() && !rhs.path.IsTablePath() && // Not a table scan (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics - if lhsHasStatistics && lhsTotalEqual > 0 && lhsTotalEqual >= rhsTotalEqual { + if lhsHasStatistics && lhsMoreFilters { return 1 } - if rhsHasStatistics && rhsTotalEqual > 0 && rhsTotalEqual >= lhsTotalEqual { + if rhsHasStatistics && rhsMoreFilters { return -1 } } @@ -749,10 +753,10 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop * rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess } - if lhsTotalEqual >= rhsTotalEqual && lhsCorrRatio < rhsCorrRatio { + if lhsMoreFilters && lhsCorrRatio < rhsCorrRatio { return 1 } - if rhsTotalEqual >= lhsTotalEqual && rhsCorrRatio < lhsCorrRatio { + if rhsMoreFilters && rhsCorrRatio < lhsCorrRatio { return -1 } // This rule is empirical but not always correct. 
@@ -763,11 +767,11 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop * threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) if threshold > 0 { // set it to 0 to disable this rule if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold && - (rhsCorrRatio < lhsCorrRatio || rhsTotalEqual > lhsTotalEqual) { + (rhsCorrRatio < lhsCorrRatio || rhsMoreFilters) { return -1 } if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold && - (lhsCorrRatio < rhsCorrRatio || lhsTotalEqual > rhsTotalEqual) { + lhsCorrRatio < rhsCorrRatio || lhsMoreFilters { return 1 } } From 269a78ca3b7307cdf626b37873dbab3f5186cd90 Mon Sep 17 00:00:00 2001 From: tpp Date: Sun, 5 Jan 2025 00:37:02 -0600 Subject: [PATCH 05/23] revision2 --- pkg/planner/core/find_best_task.go | 18 ++++++++---------- tests/integrationtest/r/imdbload.result | 20 +++++++++----------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index 3d9b25cb926a7..741f865aa325b 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -730,19 +730,17 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop * } lhsTotalEqual := lhs.path.EqCondCount + lhs.path.EqOrInCondCount rhsTotalEqual := rhs.path.EqCondCount + rhs.path.EqOrInCondCount - lhsTotalIndexFilters := lhsTotalEqual + len(lhs.path.IndexFilters) - rhsTotalIndexFilters := rhsTotalEqual + len(rhs.path.IndexFilters) - lhsMoreFilters := lhsTotalEqual > 0 && (lhsTotalEqual > rhsTotalEqual || (lhsTotalEqual == rhsTotalEqual && lhsTotalIndexFilters >= rhsTotalIndexFilters)) - rhsMoreFilters := rhsTotalEqual > 0 && (rhsTotalEqual > lhsTotalEqual || (rhsTotalEqual == lhsTotalEqual && rhsTotalIndexFilters >= lhsTotalIndexFilters)) + lhsMoreFilters := (lhsTotalEqual > rhsTotalEqual || (lhsTotalEqual == rhsTotalEqual && 
len(lhs.path.IndexFilters) >= len(rhs.path.IndexFilters))) + rhsMoreFilters := (rhsTotalEqual > lhsTotalEqual || (rhsTotalEqual == lhsTotalEqual && len(rhs.path.IndexFilters) >= len(lhs.path.IndexFilters))) if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { if !lhs.path.IsTablePath() && !rhs.path.IsTablePath() && // Not a table scan (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics - if lhsHasStatistics && lhsMoreFilters { + if lhsHasStatistics && lhsTotalEqual > 0 && lhsMoreFilters { return 1 } - if rhsHasStatistics && rhsMoreFilters { + if rhsHasStatistics && rhsTotalEqual > 0 && rhsMoreFilters { return -1 } } @@ -766,12 +764,12 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop * prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) if threshold > 0 { // set it to 0 to disable this rule - if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold && - (rhsCorrRatio < lhsCorrRatio || rhsMoreFilters) { + if rhsMoreFilters && + (lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold || rhsCorrRatio < lhsCorrRatio) { return -1 } - if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold && - lhsCorrRatio < rhsCorrRatio || lhsMoreFilters { + if lhsMoreFilters && + (rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold || lhsCorrRatio < rhsCorrRatio) { return 1 } } diff --git a/tests/integrationtest/r/imdbload.result b/tests/integrationtest/r/imdbload.result index 787c49b2e81eb..3dc0c532d4ccb 100644 --- a/tests/integrationtest/r/imdbload.result +++ b/tests/integrationtest/r/imdbload.result @@ -276,9 +276,9 @@ load stats 's/imdbload_stats/movie_info.json'; load stats 's/imdbload_stats/cast_info.json'; explain select 
* from char_name where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436')); id estRows task access object operator info -IndexLookUp_10 2.00 root -├─IndexRangeScan_8(Build) 2.00 cop[tikv] table:char_name, index:itest2(imdb_index, surname_pcode, name_pcode_nf) range:["I" -inf,"I" "E436"), ["L" -inf,"L" "E436"), keep order:false -└─TableRowIDScan_9(Probe) 2.00 cop[tikv] table:char_name keep order:false +TableReader_7 2.00 root data:Selection_6 +└─Selection_6 2.00 cop[tikv] or(and(eq(imdbload.char_name.imdb_index, "I"), lt(imdbload.char_name.surname_pcode, "E436")), and(eq(imdbload.char_name.imdb_index, "L"), lt(imdbload.char_name.surname_pcode, "E436"))) + └─TableFullScan_5 4314864.00 cop[tikv] table:char_name keep order:false explain select * from char_name use index (itest2) where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436')); id estRows task access object operator info IndexLookUp_7 2.00 root @@ -350,20 +350,18 @@ CE_trace explain select * from keyword where ((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers')); id estRows task access object operator info -IndexLookUp_11 901.00 root -├─IndexRangeScan_8(Build) 901.00 cop[tikv] table:keyword, index:itest(phonetic_code, keyword) range:("R1652" "ecg-monitor","R1652" "killers"), keep order:false -└─Selection_10(Probe) 901.00 cop[tikv] gt(imdbload.keyword.keyword, "ecg-monitor"), lt(imdbload.keyword.keyword, "killers") - └─TableRowIDScan_9 901.00 cop[tikv] table:keyword keep order:false +TableReader_7 901.00 root data:Selection_6 +└─Selection_6 901.00 cop[tikv] eq(imdbload.keyword.phonetic_code, "R1652"), gt(imdbload.keyword.keyword, "ecg-monitor"), lt(imdbload.keyword.keyword, "killers") + └─TableFullScan_5 236627.00 cop[tikv] table:keyword keep order:false trace plan target = 'estimation' select * from keyword where ((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 
'killers')); CE_trace [{"table_name":"keyword","type":"Column Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":236627},{"table_name":"keyword","type":"Column Stats-Range","expr":"((keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":44075},{"table_name":"keyword","type":"Index Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Index Stats-Range","expr":"((keyword >= 'ecg-m' and keyword <= 'kille'))","row_count":44036},{"table_name":"keyword","type":"Index Stats-Range","expr":"((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":901},{"table_name":"keyword","type":"Table Stats-Expression-CNF","expr":"`and`(`eq`(imdbload.keyword.phonetic_code, 'R1652'), `and`(`gt`(imdbload.keyword.keyword, 'ecg-monitor'), `lt`(imdbload.keyword.keyword, 'killers')))","row_count":901}] explain select * from cast_info where (nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: Florida'); id estRows task access object operator info -IndexLookUp_11 144633.00 root -├─IndexRangeScan_8(Build) 144633.00 cop[tikv] table:cast_info, index:itest2(nr_order, person_role_id, note) range:[NULL 2 "(key set pa: Florida",NULL 2 +inf], keep order:false -└─Selection_10(Probe) 144633.00 cop[tikv] ge(imdbload.cast_info.note, "(key set pa: Florida") - └─TableRowIDScan_9 144633.00 cop[tikv] table:cast_info keep order:false +TableReader_7 144633.00 root data:Selection_6 +└─Selection_6 144633.00 cop[tikv] eq(imdbload.cast_info.person_role_id, 2), ge(imdbload.cast_info.note, "(key set pa: Florida"), isnull(imdbload.cast_info.nr_order) + └─TableFullScan_5 63475835.00 cop[tikv] table:cast_info keep order:false trace plan target = 'estimation' select * from cast_info where (nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: 
Florida'); CE_trace [{"table_name":"cast_info","type":"Column Stats-Point","expr":"((nr_order is null))","row_count":45995275},{"table_name":"cast_info","type":"Column Stats-Point","expr":"((person_role_id = 2))","row_count":2089611},{"table_name":"cast_info","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":63475835},{"table_name":"cast_info","type":"Column Stats-Range","expr":"((note >= '(key set pa: Florida' and true))","row_count":14934328},{"table_name":"cast_info","type":"Index Stats-Point","expr":"((person_role_id = 2))","row_count":2089611},{"table_name":"cast_info","type":"Index Stats-Range","expr":"((nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: Florida' and true))","row_count":144633},{"table_name":"cast_info","type":"Table Stats-Expression-CNF","expr":"`and`(`isnull`(imdbload.cast_info.nr_order), `and`(`eq`(imdbload.cast_info.person_role_id, 2), `ge`(imdbload.cast_info.note, '(key set pa: Florida')))","row_count":144633},{"table_name":"cast_info","type":"Table Stats-Expression-CNF","expr":"`eq`(imdbload.cast_info.person_role_id, 2)","row_count":2089611}] From 3ee89e06d8aaedc93c24458e0333900d2c6663bc Mon Sep 17 00:00:00 2001 From: tpp Date: Sun, 5 Jan 2025 16:22:18 -0800 Subject: [PATCH 06/23] revision3 --- .codegpt/head | 1 + pkg/planner/core/find_best_task.go | 35 +++++++++++++++--------------- 2 files changed, 19 insertions(+), 17 deletions(-) create mode 100644 .codegpt/head diff --git a/.codegpt/head b/.codegpt/head new file mode 100644 index 0000000000000..712e0cd554e79 --- /dev/null +++ b/.codegpt/head @@ -0,0 +1 @@ +979ec686-8133-4fa8-8c6a-027f512c60a2 \ No newline at end of file diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index 741f865aa325b..9bf7a07449ddf 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -734,29 +734,30 @@ func compareCandidates(sctx base.PlanContext, statsTbl 
*statistics.Table, prop * rhsMoreFilters := (rhsTotalEqual > lhsTotalEqual || (rhsTotalEqual == lhsTotalEqual && len(rhs.path.IndexFilters) >= len(lhs.path.IndexFilters))) if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { - if !lhs.path.IsTablePath() && !rhs.path.IsTablePath() && // Not a table scan - (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics - (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics - if lhsHasStatistics && lhsTotalEqual > 0 && lhsMoreFilters { + if !lhs.path.IsTablePath() && !rhs.path.IsTablePath() { // Not a table scan + if (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics + (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics + if lhsHasStatistics && lhsTotalEqual > 0 && lhsMoreFilters { + return 1 + } + if rhsHasStatistics && rhsTotalEqual > 0 && rhsMoreFilters { + return -1 + } + } + + lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 + if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { + lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess + rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess + } + if lhsMoreFilters && lhsCorrRatio < rhsCorrRatio { return 1 } - if rhsHasStatistics && rhsTotalEqual > 0 && rhsMoreFilters { + if rhsMoreFilters && rhsCorrRatio < lhsCorrRatio { return -1 } } - lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 - if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { - lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess - rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess - } - - if lhsMoreFilters && lhsCorrRatio < rhsCorrRatio { - return 1 - } - if rhsMoreFilters && rhsCorrRatio < lhsCorrRatio { - return -1 - } // This rule is empirical but not always correct. 
// If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. if lhs.path.CorrCountAfterAccess > 100 && rhs.path.CorrCountAfterAccess > 100 && // to prevent some extreme cases, e.g. 0.01 : 10 From 4e006db5b3343aa471a198321479a177e6e21cdc Mon Sep 17 00:00:00 2001 From: tpp Date: Sun, 5 Jan 2025 17:06:01 -0800 Subject: [PATCH 07/23] revision4 --- pkg/planner/core/find_best_task.go | 43 +++++++++++++++--------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index 9bf7a07449ddf..d85f3efdabc86 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -730,34 +730,33 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop * } lhsTotalEqual := lhs.path.EqCondCount + lhs.path.EqOrInCondCount rhsTotalEqual := rhs.path.EqCondCount + rhs.path.EqOrInCondCount - lhsMoreFilters := (lhsTotalEqual > rhsTotalEqual || (lhsTotalEqual == rhsTotalEqual && len(lhs.path.IndexFilters) >= len(rhs.path.IndexFilters))) - rhsMoreFilters := (rhsTotalEqual > lhsTotalEqual || (rhsTotalEqual == lhsTotalEqual && len(rhs.path.IndexFilters) >= len(lhs.path.IndexFilters))) - - if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { - if !lhs.path.IsTablePath() && !rhs.path.IsTablePath() { // Not a table scan - if (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics - (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics - if lhsHasStatistics && lhsTotalEqual > 0 && lhsMoreFilters { - return 1 - } - if rhsHasStatistics && rhsTotalEqual > 0 && rhsMoreFilters { - return -1 - } - } - - lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 - if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { - lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess - rhsCorrRatio = rhs.path.CorrCountAfterAccess / 
rhs.path.CountAfterAccess - } - if lhsMoreFilters && lhsCorrRatio < rhsCorrRatio { + lhsMoreFilters := (lhsTotalEqual > rhsTotalEqual || (lhsTotalEqual > 0 && lhsTotalEqual == rhsTotalEqual && len(lhs.path.IndexFilters) >= len(rhs.path.IndexFilters))) + rhsMoreFilters := (rhsTotalEqual > lhsTotalEqual || (rhsTotalEqual > 0 && rhsTotalEqual == lhsTotalEqual && len(rhs.path.IndexFilters) >= len(lhs.path.IndexFilters))) + + if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && + !lhs.path.IsTablePath() && !rhs.path.IsTablePath() { // Not a table scan + if (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics + (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics + if lhsHasStatistics && lhsTotalEqual > 0 && lhsMoreFilters { return 1 } - if rhsMoreFilters && rhsCorrRatio < lhsCorrRatio { + if rhsHasStatistics && rhsTotalEqual > 0 && rhsMoreFilters { return -1 } } + lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 + if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { + lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess + rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess + } + if lhsMoreFilters && lhsCorrRatio < rhsCorrRatio { + return 1 + } + if rhsMoreFilters && rhsCorrRatio < lhsCorrRatio { + return -1 + } + // This rule is empirical but not always correct. // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. if lhs.path.CorrCountAfterAccess > 100 && rhs.path.CorrCountAfterAccess > 100 && // to prevent some extreme cases, e.g. 
0.01 : 10 From 4c2a739049f94db7799a2fad2a30612109e04999 Mon Sep 17 00:00:00 2001 From: tpp <146148086+terry1purcell@users.noreply.github.com> Date: Sun, 5 Jan 2025 17:19:14 -0800 Subject: [PATCH 08/23] Delete .codegpt/head file is not in repo --- .codegpt/head | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .codegpt/head diff --git a/.codegpt/head b/.codegpt/head deleted file mode 100644 index 712e0cd554e79..0000000000000 --- a/.codegpt/head +++ /dev/null @@ -1 +0,0 @@ -979ec686-8133-4fa8-8c6a-027f512c60a2 \ No newline at end of file From 3f440d5d57a4fbd4721dd31e508bcc39b25ff4e9 Mon Sep 17 00:00:00 2001 From: tpp Date: Thu, 23 Jan 2025 16:48:34 -0800 Subject: [PATCH 09/23] revert for conflict --- pkg/planner/core/find_best_task.go | 182 +++++++++++++++++------------ 1 file changed, 105 insertions(+), 77 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index d85f3efdabc86..d5bcc8a703b65 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -45,7 +45,6 @@ import ( h "github.com/pingcap/tidb/pkg/util/hint" "github.com/pingcap/tidb/pkg/util/intest" "github.com/pingcap/tidb/pkg/util/logutil" - "github.com/pingcap/tidb/pkg/util/ranger" "github.com/pingcap/tidb/pkg/util/tracing" "github.com/pingcap/tipb/go-tipb" "go.uber.org/zap" @@ -710,99 +709,121 @@ func compareGlobalIndex(lhs, rhs *candidatePath) int { } // compareCandidates is the core of skyline pruning, which is used to decide which candidate path is better. -// The return value is 1 if lhs is better, -1 if rhs is better, 0 if they are equivalent or not comparable. -func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, prop *property.PhysicalProperty, lhs, rhs *candidatePath) int { +// The first return value is 1 if lhs is better, -1 if rhs is better, 0 if they are equivalent or not comparable. +// The 2nd return value indicates whether the "better path" is missing statistics or not. 
+func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableInfo *model.TableInfo, prop *property.PhysicalProperty, lhs, rhs *candidatePath, preferRange bool) (int, bool) { // Due to #50125, full scan on MVIndex has been disabled, so MVIndex path might lead to 'can't find a proper plan' error at the end. // Avoid MVIndex path to exclude all other paths and leading to 'can't find a proper plan' error, see #49438 for an example. if isMVIndexPath(lhs.path) || isMVIndexPath(rhs.path) { - return 0 - } - - // If one index has statistics and the other does not, choose the index with statistics if it - // has the same or higher number of equal/IN predicates. - lhsHasStatistics := statsTbl.Pseudo - if statsTbl != nil && lhs.path.Index != nil { - lhsHasStatistics = statsTbl.ColAndIdxExistenceMap.HasAnalyzed(lhs.path.Index.ID, true) - } - rhsHasStatistics := statsTbl.Pseudo - if statsTbl != nil && rhs.path.Index != nil { - rhsHasStatistics = statsTbl.ColAndIdxExistenceMap.HasAnalyzed(rhs.path.Index.ID, true) - } - lhsTotalEqual := lhs.path.EqCondCount + lhs.path.EqOrInCondCount - rhsTotalEqual := rhs.path.EqCondCount + rhs.path.EqOrInCondCount - lhsMoreFilters := (lhsTotalEqual > rhsTotalEqual || (lhsTotalEqual > 0 && lhsTotalEqual == rhsTotalEqual && len(lhs.path.IndexFilters) >= len(rhs.path.IndexFilters))) - rhsMoreFilters := (rhsTotalEqual > lhsTotalEqual || (rhsTotalEqual > 0 && rhsTotalEqual == lhsTotalEqual && len(rhs.path.IndexFilters) >= len(lhs.path.IndexFilters))) - - if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && - !lhs.path.IsTablePath() && !rhs.path.IsTablePath() { // Not a table scan - if (lhsHasStatistics || rhsHasStatistics) && // At least one index has statistics - (!lhsHasStatistics || !rhsHasStatistics) { // At least one index doesn't have statistics - if lhsHasStatistics && lhsTotalEqual > 0 && lhsMoreFilters { - return 1 + return 0, false + } + // lhsPseudo == lhs has pseudo (no) stats for the table 
or index for the lhs path. + // rhsPseudo == rhs has pseudo (no) stats for the table or index for the rhs path. + // + // For the return value - if lhs wins (1), we return lhsPseudo. If rhs wins (-1), we return rhsPseudo. + // If there is no winner (0), we return false. + // + // This return value is used later in SkyLinePruning to determine whether we should preference an index scan + // over a table scan. Allowing indexes without statistics to survive means they can win via heuristics where + // they otherwise would have lost on cost. + lhsPseudo, rhsPseudo, tablePseudo := false, false, false + lhsFullScan := lhs.path.IsFullScanRange(tableInfo) + rhsFullScan := rhs.path.IsFullScanRange(tableInfo) + if statsTbl != nil { + lhsPseudo, rhsPseudo, tablePseudo = statsTbl.HistColl.Pseudo, statsTbl.HistColl.Pseudo, statsTbl.HistColl.Pseudo + if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { + if !lhsFullScan && lhs.path.Index != nil { + if statsTbl.ColAndIdxExistenceMap.HasAnalyzed(lhs.path.Index.ID, true) { + lhsPseudo = false // We have statistics for the lhs index + } else { + lhsPseudo = true + } } - if rhsHasStatistics && rhsTotalEqual > 0 && rhsMoreFilters { - return -1 + if !rhsFullScan && rhs.path.Index != nil { + if statsTbl.ColAndIdxExistenceMap.HasAnalyzed(rhs.path.Index.ID, true) { + rhsPseudo = false // We have statistics on the rhs index + } else { + rhsPseudo = true + } } } + } - lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 - if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { - lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess - rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess + matchResult, globalResult := compareBool(lhs.isMatchProp, rhs.isMatchProp), compareGlobalIndex(lhs, rhs) + accessResult, comparable1 := util.CompareCol2Len(lhs.accessCondsColMap, rhs.accessCondsColMap) + scanResult, comparable2 := compareIndexBack(lhs, rhs) + sum := accessResult + 
scanResult + matchResult + globalResult + + // First rules apply when an index doesn't have statistics and another object (index or table) has statistics + if (lhsPseudo || rhsPseudo) && !tablePseudo && !lhsFullScan && !rhsFullScan { // At least one index doesn't have statistics + // If one index has statistics and the other does not, choose the index with statistics if it + // has the same or higher number of equal/IN predicates. + if !lhsPseudo && globalResult >= 0 && sum >= 0 && + lhs.path.EqOrInCondCount > 0 && lhs.path.EqOrInCondCount >= rhs.path.EqOrInCondCount { + return 1, false // left wins and has statistics } - if lhsMoreFilters && lhsCorrRatio < rhsCorrRatio { - return 1 + if !rhsPseudo && globalResult <= 0 && sum <= 0 && + rhs.path.EqOrInCondCount > 0 && rhs.path.EqOrInCondCount >= lhs.path.EqOrInCondCount { + return -1, false // right wins and has statistics } - if rhsMoreFilters && rhsCorrRatio < lhsCorrRatio { - return -1 + if preferRange { + // keep an index without statistics if that index has more equal/IN predicates, AND: + // 1) there are at least 2 equal/INs + // 2) OR - it's a full index match for all index predicates + if lhsPseudo && lhs.path.EqOrInCondCount > rhs.path.EqOrInCondCount && globalResult >= 0 && sum >= 0 && + (lhs.path.EqOrInCondCount > 1 || (lhs.path.EqOrInCondCount > 0 && len(lhs.indexCondsColMap) >= len(lhs.path.Index.Columns))) { + return 1, true // left wins and does NOT have statistics + } + if rhsPseudo && rhs.path.EqOrInCondCount > lhs.path.EqOrInCondCount && globalResult <= 0 && sum <= 0 && + (rhs.path.EqOrInCondCount > 1 || (rhs.path.EqOrInCondCount > 0 && len(rhs.indexCondsColMap) >= len(rhs.path.Index.Columns))) { + return -1, true // right wins and does NOT have statistics + } } + } - // This rule is empirical but not always correct. - // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. 
- if lhs.path.CorrCountAfterAccess > 100 && rhs.path.CorrCountAfterAccess > 100 && // to prevent some extreme cases, e.g. 0.01 : 10 - len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && // not IndexMerge since its row count estimation is not accurate enough - prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count - threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) - if threshold > 0 { // set it to 0 to disable this rule - if rhsMoreFilters && - (lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold || rhsCorrRatio < lhsCorrRatio) { - return -1 - } - if lhsMoreFilters && - (rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold || lhsCorrRatio < rhsCorrRatio) { - return 1 - } + // This rule is empirical but not always correct. + // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. + if lhs.path.CountAfterAccess > 100 && rhs.path.CountAfterAccess > 100 && // to prevent some extreme cases, e.g. 
0.01 : 10 + len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && // not IndexMerge since its row count estimation is not accurate enough + prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count + threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) + if threshold > 0 { // set it to 0 to disable this rule + if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold { + return -1, false + } + if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold { + return 1, false } } } - // Below compares the two candidate paths on three dimensions: + // Below compares the two candidate paths on four dimensions: // (1): the set of columns that occurred in the access condition, // (2): does it require a double scan, // (3): whether or not it matches the physical property, // (4): it's a global index path or not. // If `x` is not worse than `y` at all factors, // and there exists one factor that `x` is better than `y`, then `x` is better than `y`. 
- accessResult, comparable1 := util.CompareCol2Len(lhs.accessCondsColMap, rhs.accessCondsColMap) if !comparable1 { - return 0 + return 0, false } - scanResult, comparable2 := compareIndexBack(lhs, rhs) if !comparable2 { - return 0 + return 0, false } - matchResult, globalResult := compareBool(lhs.isMatchProp, rhs.isMatchProp), compareGlobalIndex(lhs, rhs) - sum := accessResult + scanResult + matchResult + globalResult if accessResult >= 0 && scanResult >= 0 && matchResult >= 0 && globalResult >= 0 && sum > 0 { - return 1 + return 1, false } if accessResult <= 0 && scanResult <= 0 && matchResult <= 0 && globalResult <= 0 && sum < 0 { - return -1 + return -1, false } - return 0 + return 0, false } func isMatchProp(ds *logicalop.DataSource, path *util.AccessPath, prop *property.PhysicalProperty) bool { + if ds.Table.Type().IsClusterTable() && !prop.IsSortItemEmpty() { + // TableScan with cluster table can't keep order. + return false + } if prop.VectorProp.VSInfo != nil && path.Index != nil && path.Index.VectorInfo != nil { if path.Index == nil || path.Index.VectorInfo == nil { return false @@ -1142,6 +1163,9 @@ func getIndexMergeCandidate(ds *logicalop.DataSource, path *util.AccessPath, pro // there exists a path that is not worse than it at all factors and there is at least one better factor. func skylinePruning(ds *logicalop.DataSource, prop *property.PhysicalProperty) []*candidatePath { candidates := make([]*candidatePath, 0, 4) + idxMissingStats := false + // tidb_opt_prefer_range_scan is the master switch to control index preferencing + preferRange := ds.SCtx().GetSessionVars().GetAllowPreferRangeScan() for _, path := range ds.PossibleAccessPaths { // We should check whether the possible access path is valid first. 
if path.StoreType != kv.TiFlash && prop.IsFlashProp() { @@ -1182,7 +1206,12 @@ func skylinePruning(ds *logicalop.DataSource, prop *property.PhysicalProperty) [ if candidates[i].path.StoreType == kv.TiFlash { continue } - result := compareCandidates(ds.SCtx(), ds.StatisticTable, prop, candidates[i], currentCandidate) + var result int + currentMissingStats := false + result, currentMissingStats = compareCandidates(ds.SCtx(), ds.StatisticTable, ds.TableInfo, prop, candidates[i], currentCandidate, preferRange) + if currentMissingStats { + idxMissingStats = true // Ensure that we track idxMissingStats across all iterations + } if result == 1 { pruned = true // We can break here because the current candidate cannot prune others anymore. @@ -1202,28 +1231,23 @@ func skylinePruning(ds *logicalop.DataSource, prop *property.PhysicalProperty) [ fixcontrol.Fix52869, false, ) - // tidb_opt_prefer_range_scan is the master switch to control index preferencing - preferRange := ds.SCtx().GetSessionVars().GetAllowPreferRangeScan() && - (preferMerge || (ds.TableStats.HistColl.Pseudo || ds.TableStats.RowCount < 1)) + if preferRange { + // Override preferRange with the following limitations to scope + preferRange = preferMerge || idxMissingStats || ds.TableStats.HistColl.Pseudo || ds.TableStats.RowCount < 1 + } if preferRange && len(candidates) > 1 { - // If a candidate path is TiFlash-path or forced-path or MV index, we just keep them. For other candidate paths, if there exists - // any range scan path, we remove full scan paths and keep range scan paths. + // If a candidate path is TiFlash-path or forced-path or MV index or global index, we just keep them. For other + // candidate paths, if there exists any range scan path, we remove full scan paths and keep range scan paths. 
preferredPaths := make([]*candidatePath, 0, len(candidates)) var hasRangeScanPath bool for _, c := range candidates { - if c.path.Forced || c.path.StoreType == kv.TiFlash || (c.path.Index != nil && c.path.Index.MVIndex) { + if c.path.Forced || c.path.StoreType == kv.TiFlash || (c.path.Index != nil && (c.path.Index.Global || c.path.Index.MVIndex)) { preferredPaths = append(preferredPaths, c) continue } - var unsignedIntHandle bool - if c.path.IsIntHandlePath && ds.TableInfo.PKIsHandle { - if pkColInfo := ds.TableInfo.GetPkColInfo(); pkColInfo != nil { - unsignedIntHandle = mysql.HasUnsignedFlag(pkColInfo.GetFlag()) - } - } - if !ranger.HasFullRange(c.path.Ranges, unsignedIntHandle) { + if !c.path.IsFullScanRange(ds.TableInfo) { // Preference plans with equals/IN predicates or where there is more filtering in the index than against the table - indexFilters := c.path.EqCondCount > 0 || c.path.EqOrInCondCount > 0 || len(c.path.TableFilters) < len(c.path.IndexFilters) + indexFilters := c.path.EqOrInCondCount > 0 || len(c.path.TableFilters) < len(c.path.IndexFilters) if preferMerge || (indexFilters && (prop.IsSortItemEmpty() || c.isMatchProp)) { preferredPaths = append(preferredPaths, c) hasRangeScanPath = true @@ -1421,6 +1445,10 @@ func findBestTask4LogicalDataSource(lp base.LogicalPlan, prop *property.Physical if ds.PreferStoreType&h.PreferTiFlash != 0 && path.StoreType == kv.TiKV { continue } + // prefer tikv, while current table path is tiflash, skip it. 
+ if ds.PreferStoreType&h.PreferTiKV != 0 && path.StoreType == kv.TiFlash { + continue + } idxMergeTask, err := convertToIndexMergeScan(ds, prop, candidate, opt) if err != nil { return nil, 0, err From 06cbd70390d86a544bd875400a00896d6e58ba33 Mon Sep 17 00:00:00 2001 From: tpp Date: Thu, 23 Jan 2025 17:01:01 -0800 Subject: [PATCH 10/23] re-add code change after conflict resolution --- pkg/planner/core/find_best_task.go | 33 +++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index d5bcc8a703b65..ebda30b3ba48c 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -783,16 +783,29 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI // This rule is empirical but not always correct. // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. - if lhs.path.CountAfterAccess > 100 && rhs.path.CountAfterAccess > 100 && // to prevent some extreme cases, e.g. 
0.01 : 10 - len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && // not IndexMerge since its row count estimation is not accurate enough - prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count - threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) - if threshold > 0 { // set it to 0 to disable this rule - if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold { - return -1, false - } - if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold { - return 1, false + if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { // not IndexMerge since its row count estimation is not accurate enough + lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 + if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { + lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess + rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess + } + if globalResult >= 0 && sum >= 0 && lhsCorrRatio < rhsCorrRatio { + return 1, false + } + if globalResult <= 0 && sum <= 0 && rhsCorrRatio < lhsCorrRatio { + return -1, false + } + + if lhs.path.CountAfterAccess > 100 && rhs.path.CountAfterAccess > 100 && // to prevent some extreme cases, e.g. 
0.01 : 10 + prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count + threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) + if threshold > 0 { // set it to 0 to disable this rule + if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold { + return -1, false + } + if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold { + return 1, false + } } } } From 1096071f8f517ccd205b64f34cdab6195a3ffdad Mon Sep 17 00:00:00 2001 From: tpp Date: Thu, 23 Jan 2025 17:40:11 -0800 Subject: [PATCH 11/23] revision1 --- pkg/planner/core/find_best_task.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index ebda30b3ba48c..5d9329c809bc2 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -781,21 +781,20 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI } } - // This rule is empirical but not always correct. - // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. 
if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { // not IndexMerge since its row count estimation is not accurate enough lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess } - if globalResult >= 0 && sum >= 0 && lhsCorrRatio < rhsCorrRatio { + if globalResult >= 0 && sum >= 0 && !lhsFullScan && lhsCorrRatio < rhsCorrRatio { return 1, false } - if globalResult <= 0 && sum <= 0 && rhsCorrRatio < lhsCorrRatio { + if globalResult <= 0 && sum <= 0 && !rhsFullScan && rhsCorrRatio < lhsCorrRatio { return -1, false } - + // This rule is empirical but not always correct. + // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. if lhs.path.CountAfterAccess > 100 && rhs.path.CountAfterAccess > 100 && // to prevent some extreme cases, e.g. 
0.01 : 10 prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) From a87a38953deaf572430c0dfb0193686f5a113c55 Mon Sep 17 00:00:00 2001 From: tpp Date: Thu, 23 Jan 2025 18:53:13 -0800 Subject: [PATCH 12/23] revision2 --- pkg/planner/core/find_best_task.go | 55 ++++++++++++++++-------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index 5d9329c809bc2..af61f39a6560f 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -708,6 +708,21 @@ func compareGlobalIndex(lhs, rhs *candidatePath) int { return compareBool(lhs.path.Index.Global, rhs.path.Index.Global) } +func compareCorrRatio(lhs, rhs *candidatePath) (int, float64) { + lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 + if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { + lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess + rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess + } + if lhsCorrRatio > 0 { + return 1, lhsCorrRatio + } + if rhsCorrRatio > 0 { + return -1, rhsCorrRatio + } + return 0, 0 +} + // compareCandidates is the core of skyline pruning, which is used to decide which candidate path is better. // The first return value is 1 if lhs is better, -1 if rhs is better, 0 if they are equivalent or not comparable. // The 2nd return value indicates whether the "better path" is missing statistics or not. 
@@ -752,7 +767,9 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI matchResult, globalResult := compareBool(lhs.isMatchProp, rhs.isMatchProp), compareGlobalIndex(lhs, rhs) accessResult, comparable1 := util.CompareCol2Len(lhs.accessCondsColMap, rhs.accessCondsColMap) scanResult, comparable2 := compareIndexBack(lhs, rhs) - sum := accessResult + scanResult + matchResult + globalResult + // corrResult returns the left vs right comparison as a boolean, but also the actual ratio - which will be used in future + corrResult, _ := compareCorrRatio(lhs, rhs) + sum := accessResult + scanResult + matchResult + globalResult + corrResult // First rules apply when an index doesn't have statistics and another object (index or table) has statistics if (lhsPseudo || rhsPseudo) && !tablePseudo && !lhsFullScan && !rhsFullScan { // At least one index doesn't have statistics @@ -781,30 +798,18 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI } } - if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 { // not IndexMerge since its row count estimation is not accurate enough - lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 - if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { - lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess - rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess - } - if globalResult >= 0 && sum >= 0 && !lhsFullScan && lhsCorrRatio < rhsCorrRatio { - return 1, false - } - if globalResult <= 0 && sum <= 0 && !rhsFullScan && rhsCorrRatio < lhsCorrRatio { - return -1, false - } - // This rule is empirical but not always correct. - // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. - if lhs.path.CountAfterAccess > 100 && rhs.path.CountAfterAccess > 100 && // to prevent some extreme cases, e.g. 
0.01 : 10 - prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count - threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) - if threshold > 0 { // set it to 0 to disable this rule - if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold { - return -1, false - } - if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold { - return 1, false - } + // This rule is empirical but not always correct. + // If x's range row count is significantly lower than y's, for example, 1000 times, we think x is better. + if len(lhs.path.PartialIndexPaths) == 0 && len(rhs.path.PartialIndexPaths) == 0 && // not IndexMerge since its row count estimation is not accurate enough + lhs.path.CountAfterAccess > 100 && rhs.path.CountAfterAccess > 100 && // to prevent some extreme cases, e.g. 0.01 : 10 + prop.ExpectedCnt == math.MaxFloat64 { // Limit may affect access row count + threshold := float64(fixcontrol.GetIntWithDefault(sctx.GetSessionVars().OptimizerFixControl, fixcontrol.Fix45132, 1000)) + if threshold > 0 { // set it to 0 to disable this rule + if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold && corrResult <= 0 { + return -1, false + } + if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold && corrResult >= 0 { + return 1, false } } } From b183768ae7b4abe8f780108a5c4bd55efd91f59f Mon Sep 17 00:00:00 2001 From: tpp Date: Thu, 23 Jan 2025 19:26:01 -0800 Subject: [PATCH 13/23] testcase1 --- tests/integrationtest/r/imdbload.result | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/integrationtest/r/imdbload.result b/tests/integrationtest/r/imdbload.result index 3dc0c532d4ccb..787c49b2e81eb 100644 --- a/tests/integrationtest/r/imdbload.result +++ b/tests/integrationtest/r/imdbload.result @@ -276,9 +276,9 @@ load stats 's/imdbload_stats/movie_info.json'; load stats 's/imdbload_stats/cast_info.json'; explain 
select * from char_name where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436')); id estRows task access object operator info -TableReader_7 2.00 root data:Selection_6 -└─Selection_6 2.00 cop[tikv] or(and(eq(imdbload.char_name.imdb_index, "I"), lt(imdbload.char_name.surname_pcode, "E436")), and(eq(imdbload.char_name.imdb_index, "L"), lt(imdbload.char_name.surname_pcode, "E436"))) - └─TableFullScan_5 4314864.00 cop[tikv] table:char_name keep order:false +IndexLookUp_10 2.00 root +├─IndexRangeScan_8(Build) 2.00 cop[tikv] table:char_name, index:itest2(imdb_index, surname_pcode, name_pcode_nf) range:["I" -inf,"I" "E436"), ["L" -inf,"L" "E436"), keep order:false +└─TableRowIDScan_9(Probe) 2.00 cop[tikv] table:char_name keep order:false explain select * from char_name use index (itest2) where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436')); id estRows task access object operator info IndexLookUp_7 2.00 root @@ -350,18 +350,20 @@ CE_trace explain select * from keyword where ((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers')); id estRows task access object operator info -TableReader_7 901.00 root data:Selection_6 -└─Selection_6 901.00 cop[tikv] eq(imdbload.keyword.phonetic_code, "R1652"), gt(imdbload.keyword.keyword, "ecg-monitor"), lt(imdbload.keyword.keyword, "killers") - └─TableFullScan_5 236627.00 cop[tikv] table:keyword keep order:false +IndexLookUp_11 901.00 root +├─IndexRangeScan_8(Build) 901.00 cop[tikv] table:keyword, index:itest(phonetic_code, keyword) range:("R1652" "ecg-monitor","R1652" "killers"), keep order:false +└─Selection_10(Probe) 901.00 cop[tikv] gt(imdbload.keyword.keyword, "ecg-monitor"), lt(imdbload.keyword.keyword, "killers") + └─TableRowIDScan_9 901.00 cop[tikv] table:keyword keep order:false trace plan target = 'estimation' select * from keyword where ((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and 
keyword < 'killers')); CE_trace [{"table_name":"keyword","type":"Column Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":236627},{"table_name":"keyword","type":"Column Stats-Range","expr":"((keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":44075},{"table_name":"keyword","type":"Index Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Index Stats-Range","expr":"((keyword >= 'ecg-m' and keyword <= 'kille'))","row_count":44036},{"table_name":"keyword","type":"Index Stats-Range","expr":"((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":901},{"table_name":"keyword","type":"Table Stats-Expression-CNF","expr":"`and`(`eq`(imdbload.keyword.phonetic_code, 'R1652'), `and`(`gt`(imdbload.keyword.keyword, 'ecg-monitor'), `lt`(imdbload.keyword.keyword, 'killers')))","row_count":901}] explain select * from cast_info where (nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: Florida'); id estRows task access object operator info -TableReader_7 144633.00 root data:Selection_6 -└─Selection_6 144633.00 cop[tikv] eq(imdbload.cast_info.person_role_id, 2), ge(imdbload.cast_info.note, "(key set pa: Florida"), isnull(imdbload.cast_info.nr_order) - └─TableFullScan_5 63475835.00 cop[tikv] table:cast_info keep order:false +IndexLookUp_11 144633.00 root +├─IndexRangeScan_8(Build) 144633.00 cop[tikv] table:cast_info, index:itest2(nr_order, person_role_id, note) range:[NULL 2 "(key set pa: Florida",NULL 2 +inf], keep order:false +└─Selection_10(Probe) 144633.00 cop[tikv] ge(imdbload.cast_info.note, "(key set pa: Florida") + └─TableRowIDScan_9 144633.00 cop[tikv] table:cast_info keep order:false trace plan target = 'estimation' select * from cast_info where (nr_order is null) and (person_role_id = 2) and (note >= '(key 
set pa: Florida'); CE_trace [{"table_name":"cast_info","type":"Column Stats-Point","expr":"((nr_order is null))","row_count":45995275},{"table_name":"cast_info","type":"Column Stats-Point","expr":"((person_role_id = 2))","row_count":2089611},{"table_name":"cast_info","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":63475835},{"table_name":"cast_info","type":"Column Stats-Range","expr":"((note >= '(key set pa: Florida' and true))","row_count":14934328},{"table_name":"cast_info","type":"Index Stats-Point","expr":"((person_role_id = 2))","row_count":2089611},{"table_name":"cast_info","type":"Index Stats-Range","expr":"((nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: Florida' and true))","row_count":144633},{"table_name":"cast_info","type":"Table Stats-Expression-CNF","expr":"`and`(`isnull`(imdbload.cast_info.nr_order), `and`(`eq`(imdbload.cast_info.person_role_id, 2), `ge`(imdbload.cast_info.note, '(key set pa: Florida')))","row_count":144633},{"table_name":"cast_info","type":"Table Stats-Expression-CNF","expr":"`eq`(imdbload.cast_info.person_role_id, 2)","row_count":2089611}] From 301d616532ceea01abbd1446e7347391fa504aac Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 24 Jan 2025 04:30:56 -0800 Subject: [PATCH 14/23] testcase2 --- pkg/planner/core/find_best_task.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index af61f39a6560f..bf04046a0eeae 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -767,8 +767,11 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI matchResult, globalResult := compareBool(lhs.isMatchProp, rhs.isMatchProp), compareGlobalIndex(lhs, rhs) accessResult, comparable1 := util.CompareCol2Len(lhs.accessCondsColMap, rhs.accessCondsColMap) scanResult, comparable2 := compareIndexBack(lhs, rhs) - // 
corrResult returns the left vs right comparison as a boolean, but also the actual ratio - which will be used in future - corrResult, _ := compareCorrRatio(lhs, rhs) + corrResult := 0 + if lhsPseudo == rhsPseudo { + // corrResult returns the left vs right comparison as a boolean, but also the actual ratio - which will be used in future + corrResult, _ = compareCorrRatio(lhs, rhs) + } sum := accessResult + scanResult + matchResult + globalResult + corrResult // First rules apply when an index doesn't have statistics and another object (index or table) has statistics From 5751476bd7616e10e40d01dae9583a060d5fd5aa Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 24 Jan 2025 17:37:04 -0800 Subject: [PATCH 15/23] code fix1 --- pkg/planner/core/find_best_task.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index bf04046a0eeae..d6f5793ddd4a8 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -714,10 +714,10 @@ func compareCorrRatio(lhs, rhs *candidatePath) (int, float64) { lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess } - if lhsCorrRatio > 0 { + if lhsCorrRatio < rhsCorrRatio { return 1, lhsCorrRatio } - if rhsCorrRatio > 0 { + if rhsCorrRatio < lhsCorrRatio { return -1, rhsCorrRatio } return 0, 0 From 8e94e2fa3bf489c207e8e49593f4481fe69430bc Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 24 Jan 2025 18:26:05 -0800 Subject: [PATCH 16/23] code fix2 --- pkg/planner/cardinality/row_count_index.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index 78dea02c59d52..a50c50b100d53 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -529,7 +529,7 @@ func expBackoffEstimation(sctx 
planctx.PlanContext, idx *statistics.Index, coll idxLowBound /= 0.9 } // corrsel is the selectivity of the most filtering column - corrsel = min(idxLowBound, singleColumnEstResults[0]) + corrsel = max(idxLowBound, singleColumnEstResults[0]) minTwoCol := min(singleColumnEstResults[0], singleColumnEstResults[1], idxLowBound) multTwoCol := singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]) if l == 2 { From 31f6ede157b96101d315dc9410f2d88be4c3d5b9 Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 24 Jan 2025 19:04:59 -0800 Subject: [PATCH 17/23] testcase2 --- pkg/planner/core/find_best_task.go | 2 +- .../r/planner/cardinality/selectivity.result | 12 ++++++------ .../t/planner/cardinality/selectivity.test | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index d6f5793ddd4a8..1b9c0a88c49fb 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -768,7 +768,7 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI accessResult, comparable1 := util.CompareCol2Len(lhs.accessCondsColMap, rhs.accessCondsColMap) scanResult, comparable2 := compareIndexBack(lhs, rhs) corrResult := 0 - if lhsPseudo == rhsPseudo { + if lhsPseudo == rhsPseudo && !lhsFullScan && !rhsFullScan { // corrResult returns the left vs right comparison as a boolean, but also the actual ratio - which will be used in future corrResult, _ = compareCorrRatio(lhs, rhs) } diff --git a/tests/integrationtest/r/planner/cardinality/selectivity.result b/tests/integrationtest/r/planner/cardinality/selectivity.result index 89bcbb5ce9da3..c8ef8f137ee0f 100644 --- a/tests/integrationtest/r/planner/cardinality/selectivity.result +++ b/tests/integrationtest/r/planner/cardinality/selectivity.result @@ -1223,15 +1223,15 @@ insert into t values ('tw', 0); insert into t values ('tw', 0); insert into t values ('tw', 0); analyze table t all columns; -explain select * 
from t where a = 'tw' and b < 0; +explain format='brief' select * from t where a = 'tw' and b < 0; id estRows task access object operator info -IndexReader_6 1.00 root index:IndexRangeScan_5 -└─IndexRangeScan_5 1.00 cop[tikv] table:t, index:idx(a, b) range:["tw" -inf,"tw" 0), keep order:false +IndexReader 1.00 root index:IndexRangeScan +└─IndexRangeScan 1.00 cop[tikv] table:t, index:idx(a, b) range:["tw" -inf,"tw" 0), keep order:false set @@tidb_opt_fix_control = '47400:on'; -explain select * from t where a = 'tw' and b < 0; +explain format='brief' select * from t where a = 'tw' and b < 0; id estRows task access object operator info -IndexReader_6 0.00 root index:IndexRangeScan_5 -└─IndexRangeScan_5 0.00 cop[tikv] table:t, index:idx(a, b) range:["tw" -inf,"tw" 0), keep order:false +IndexReader 0.00 root index:IndexRangeScan +└─IndexRangeScan 0.00 cop[tikv] table:t, index:idx(a, b) range:["tw" -inf,"tw" 0), keep order:false set @@tidb_opt_fix_control = '47400:off'; drop table if exists t; create table t(id int auto_increment, kid int, pid int, primary key(id), key(kid, pid)); diff --git a/tests/integrationtest/t/planner/cardinality/selectivity.test b/tests/integrationtest/t/planner/cardinality/selectivity.test index b865738ef56bb..a953f45a821e8 100644 --- a/tests/integrationtest/t/planner/cardinality/selectivity.test +++ b/tests/integrationtest/t/planner/cardinality/selectivity.test @@ -646,9 +646,9 @@ insert into t values ('tw', 0); insert into t values ('tw', 0); insert into t values ('tw', 0); analyze table t all columns; -explain select * from t where a = 'tw' and b < 0; +explain format='brief' select * from t where a = 'tw' and b < 0; set @@tidb_opt_fix_control = '47400:on'; -explain select * from t where a = 'tw' and b < 0; +explain format='brief' select * from t where a = 'tw' and b < 0; set @@tidb_opt_fix_control = '47400:off'; # TestSelectCombinedLowBound From ed70dac45a69c98dba42af67b3b526f8e8ac0547 Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 24 Jan 2025 
19:46:12 -0800 Subject: [PATCH 18/23] testcase3 --- pkg/planner/core/find_best_task.go | 2 ++ .../r/planner/cardinality/selectivity.result | 12 ++++++------ .../t/planner/cardinality/selectivity.test | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index 1b9c0a88c49fb..8bf955c1eb356 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -768,6 +768,8 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI accessResult, comparable1 := util.CompareCol2Len(lhs.accessCondsColMap, rhs.accessCondsColMap) scanResult, comparable2 := compareIndexBack(lhs, rhs) corrResult := 0 + // corrResult focuses on comparing competing indexes with statistics - potentially those index estimates may have an + // implied correlation. Thus - exclude if validity of statistics differs between lhs & rhs or if these are full scans if lhsPseudo == rhsPseudo && !lhsFullScan && !rhsFullScan { // corrResult returns the left vs right comparison as a boolean, but also the actual ratio - which will be used in future corrResult, _ = compareCorrRatio(lhs, rhs) diff --git a/tests/integrationtest/r/planner/cardinality/selectivity.result b/tests/integrationtest/r/planner/cardinality/selectivity.result index c8ef8f137ee0f..89bcbb5ce9da3 100644 --- a/tests/integrationtest/r/planner/cardinality/selectivity.result +++ b/tests/integrationtest/r/planner/cardinality/selectivity.result @@ -1223,15 +1223,15 @@ insert into t values ('tw', 0); insert into t values ('tw', 0); insert into t values ('tw', 0); analyze table t all columns; -explain format='brief' select * from t where a = 'tw' and b < 0; +explain select * from t where a = 'tw' and b < 0; id estRows task access object operator info -IndexReader 1.00 root index:IndexRangeScan -└─IndexRangeScan 1.00 cop[tikv] table:t, index:idx(a, b) range:["tw" -inf,"tw" 0), keep order:false +IndexReader_6 1.00 root 
index:IndexRangeScan_5 +└─IndexRangeScan_5 1.00 cop[tikv] table:t, index:idx(a, b) range:["tw" -inf,"tw" 0), keep order:false set @@tidb_opt_fix_control = '47400:on'; -explain format='brief' select * from t where a = 'tw' and b < 0; +explain select * from t where a = 'tw' and b < 0; id estRows task access object operator info -IndexReader 0.00 root index:IndexRangeScan -└─IndexRangeScan 0.00 cop[tikv] table:t, index:idx(a, b) range:["tw" -inf,"tw" 0), keep order:false +IndexReader_6 0.00 root index:IndexRangeScan_5 +└─IndexRangeScan_5 0.00 cop[tikv] table:t, index:idx(a, b) range:["tw" -inf,"tw" 0), keep order:false set @@tidb_opt_fix_control = '47400:off'; drop table if exists t; create table t(id int auto_increment, kid int, pid int, primary key(id), key(kid, pid)); diff --git a/tests/integrationtest/t/planner/cardinality/selectivity.test b/tests/integrationtest/t/planner/cardinality/selectivity.test index a953f45a821e8..b865738ef56bb 100644 --- a/tests/integrationtest/t/planner/cardinality/selectivity.test +++ b/tests/integrationtest/t/planner/cardinality/selectivity.test @@ -646,9 +646,9 @@ insert into t values ('tw', 0); insert into t values ('tw', 0); insert into t values ('tw', 0); analyze table t all columns; -explain format='brief' select * from t where a = 'tw' and b < 0; +explain select * from t where a = 'tw' and b < 0; set @@tidb_opt_fix_control = '47400:on'; -explain format='brief' select * from t where a = 'tw' and b < 0; +explain select * from t where a = 'tw' and b < 0; set @@tidb_opt_fix_control = '47400:off'; # TestSelectCombinedLowBound From 5e870f3510708fc14c7f5dee880f0cd9a34599ba Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 19 Dec 2024 10:34:00 +0800 Subject: [PATCH 19/23] compact restore: use closure to initial snapshot restore checkpoint (#58146) close pingcap/tidb#58237 --- br/pkg/restore/snap_client/client.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/br/pkg/restore/snap_client/client.go 
b/br/pkg/restore/snap_client/client.go index 9d03c080356f5..6048f2b8dd191 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -78,7 +78,10 @@ const minBatchDdlSize = 1 type SnapClient struct { restorer restore.SstRestorer +<<<<<<< HEAD importer *SnapFileImporter +======= +>>>>>>> 659e3e73dd (compact restore: use closure to initial snapshot restore checkpoint (#58146)) // Use a closure to lazy load checkpoint runner getRestorerFn func(*checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer // Tool clients used by SnapClient @@ -588,7 +591,11 @@ func (rc *SnapClient) initClients(ctx context.Context, backend *backuppb.Storage } // Raw/Txn restore are not support checkpoint for now rc.getRestorerFn = func(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { +<<<<<<< HEAD return restore.NewSimpleSstRestorer(ctx, rc.importer, rc.workerPool, nil) +======= + return restore.NewSimpleSstRestorer(ctx, fileImporter, rc.workerPool, nil) +>>>>>>> 659e3e73dd (compact restore: use closure to initial snapshot restore checkpoint (#58146)) } } else { // or create a fileImporter with the cluster API version @@ -598,7 +605,11 @@ func (rc *SnapClient) initClients(ctx context.Context, backend *backuppb.Storage return errors.Trace(err) } rc.getRestorerFn = func(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { +<<<<<<< HEAD return restore.NewMultiTablesRestorer(ctx, rc.importer, rc.workerPool, checkpointRunner) +======= + return restore.NewMultiTablesRestorer(ctx, fileImporter, rc.workerPool, checkpointRunner) +>>>>>>> 659e3e73dd (compact restore: use closure to initial snapshot restore checkpoint (#58146)) } } return nil From f8b5ddadc2ab482bd57a200188b5eb5b5f556e4a Mon Sep 17 00:00:00 2001 From: tpp Date: Mon, 27 Jan 2025 16:43:35 -0800 Subject: 
[PATCH 20/23] rebase3 --- pkg/expression/function_traits.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/expression/function_traits.go b/pkg/expression/function_traits.go index 978b8eba79eaf..3e81380bc4a67 100644 --- a/pkg/expression/function_traits.go +++ b/pkg/expression/function_traits.go @@ -99,6 +99,10 @@ var IllegalFunctions4GeneratedColumns = map[string]struct{}{ ast.CurrentUser: {}, ast.Curtime: {}, ast.Database: {}, +<<<<<<< HEAD +======= + ast.Encrypt: {}, +>>>>>>> 77866d1f46 (expression: Update generated column function restrictions (#58308)) ast.FoundRows: {}, ast.GetLock: {}, ast.GetVar: {}, From c985de6cc6491270d9c1a709ea628b66b9002300 Mon Sep 17 00:00:00 2001 From: tpp Date: Mon, 27 Jan 2025 17:38:38 -0800 Subject: [PATCH 21/23] rebase4 --- br/pkg/restore/snap_client/client.go | 11 ----------- pkg/expression/function_traits.go | 4 ---- 2 files changed, 15 deletions(-) diff --git a/br/pkg/restore/snap_client/client.go b/br/pkg/restore/snap_client/client.go index 6048f2b8dd191..9d03c080356f5 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -78,10 +78,7 @@ const minBatchDdlSize = 1 type SnapClient struct { restorer restore.SstRestorer -<<<<<<< HEAD importer *SnapFileImporter -======= ->>>>>>> 659e3e73dd (compact restore: use closure to initial snapshot restore checkpoint (#58146)) // Use a closure to lazy load checkpoint runner getRestorerFn func(*checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer // Tool clients used by SnapClient @@ -591,11 +588,7 @@ func (rc *SnapClient) initClients(ctx context.Context, backend *backuppb.Storage } // Raw/Txn restore are not support checkpoint for now rc.getRestorerFn = func(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { -<<<<<<< HEAD return restore.NewSimpleSstRestorer(ctx, rc.importer, rc.workerPool, nil) -======= - return 
restore.NewSimpleSstRestorer(ctx, fileImporter, rc.workerPool, nil) ->>>>>>> 659e3e73dd (compact restore: use closure to initial snapshot restore checkpoint (#58146)) } } else { // or create a fileImporter with the cluster API version @@ -605,11 +598,7 @@ func (rc *SnapClient) initClients(ctx context.Context, backend *backuppb.Storage return errors.Trace(err) } rc.getRestorerFn = func(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { -<<<<<<< HEAD return restore.NewMultiTablesRestorer(ctx, rc.importer, rc.workerPool, checkpointRunner) -======= - return restore.NewMultiTablesRestorer(ctx, fileImporter, rc.workerPool, checkpointRunner) ->>>>>>> 659e3e73dd (compact restore: use closure to initial snapshot restore checkpoint (#58146)) } } return nil diff --git a/pkg/expression/function_traits.go b/pkg/expression/function_traits.go index 3e81380bc4a67..978b8eba79eaf 100644 --- a/pkg/expression/function_traits.go +++ b/pkg/expression/function_traits.go @@ -99,10 +99,6 @@ var IllegalFunctions4GeneratedColumns = map[string]struct{}{ ast.CurrentUser: {}, ast.Curtime: {}, ast.Database: {}, -<<<<<<< HEAD -======= - ast.Encrypt: {}, ->>>>>>> 77866d1f46 (expression: Update generated column function restrictions (#58308)) ast.FoundRows: {}, ast.GetLock: {}, ast.GetVar: {}, From c01fbbe830c4da125f16a725b1d0d06525f723e4 Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 31 Jan 2025 13:36:35 -0800 Subject: [PATCH 22/23] updates after unit test --- pkg/planner/cardinality/selectivity.go | 4 +-- pkg/planner/core/find_best_task.go | 42 +++++++++++++++++++++----- pkg/planner/core/stats.go | 3 ++ 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/pkg/planner/cardinality/selectivity.go b/pkg/planner/cardinality/selectivity.go index 4a42857b1029b..11a6146582b5b 100644 --- a/pkg/planner/cardinality/selectivity.go +++ b/pkg/planner/cardinality/selectivity.go @@ -546,8 +546,8 @@ type StatsNode struct { // 
Selectivity indicates the Selectivity of this column/index. Selectivity float64 // CorrSelectivity indicates the Selectivity of this column/index with correlated column. - // That is - it is the selectivity assuming the most filtering column only, and all other - // columns are uncorrelated. + // That is - it is the selectivity assuming the most filtering index column only, and all other + // columns are correlated with this column. CorrSelectivity float64 // numCols is the number of columns contained in the index or column(which is always 1). numCols int diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index 8fd569e1d1fad..0d5862af84602 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -710,15 +710,42 @@ func compareGlobalIndex(lhs, rhs *candidatePath) int { func compareCorrRatio(lhs, rhs *candidatePath) (int, float64) { lhsCorrRatio, rhsCorrRatio := 0.0, 0.0 + // CorrCountAfterAccess tracks the "CountAfterAccess" only including the most selective index column, thus + // lhs/rhsCorrRatio represents the "risk" of the CountAfterAccess value - lower value means less risk that + // we do NOT know about actual correlation between indexed columns if lhs.path.CorrCountAfterAccess > 0 || rhs.path.CorrCountAfterAccess > 0 { lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess } + // rhs has lower index selectivity and lower risk + if rhs.path.CountAfterAccess < lhs.path.CountAfterAccess && rhsCorrRatio < lhsCorrRatio { + return -1, lhsCorrRatio + } + // lhs has lower risk if lhsCorrRatio < rhsCorrRatio { - return 1, lhsCorrRatio + // And lhs has lower index selectivity + if lhs.path.CountAfterAccess < rhs.path.CountAfterAccess { + return 1, lhsCorrRatio + } + // Add 10% of the difference between correlated and actual and compare + rhsAdjustCount := rhs.path.CountAfterAccess + 
((rhs.path.CorrCountAfterAccess - rhs.path.CountAfterAccess) * 0.1) + if (lhs.path.CountAfterAccess < 10 || lhs.path.CountAfterAccess < (rhs.path.CountAfterAccess*10)) && + lhs.path.CorrCountAfterAccess < rhsAdjustCount { + return 1, lhsCorrRatio + } } + // rhs has lower risk if rhsCorrRatio < lhsCorrRatio { - return -1, rhsCorrRatio + // And rhs has lower index selectivity + if rhs.path.CountAfterAccess < lhs.path.CountAfterAccess { + return -1, rhsCorrRatio + } + // Add 10% of the difference between correlated and actual and compare + lhsAdjustCount := lhs.path.CountAfterAccess + ((lhs.path.CorrCountAfterAccess - lhs.path.CountAfterAccess) * 0.1) + if (rhs.path.CountAfterAccess < 10 || rhs.path.CountAfterAccess < (lhs.path.CountAfterAccess*10)) && + rhs.path.CorrCountAfterAccess < lhsAdjustCount { + return -1, rhsCorrRatio + } } return 0, 0 } @@ -819,23 +846,24 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI } } - // Below compares the two candidate paths on four dimensions: + // Below compares the two candidate paths on multiple dimensions: // (1): the set of columns that occurred in the access condition, // (2): does it require a double scan, // (3): whether or not it matches the physical property, // (4): it's a global index path or not. + // (5): whether its correlation ratio indicates that it has high risk in its index scan estimate // If `x` is not worse than `y` at all factors, // and there exists one factor that `x` is better than `y`, then `x` is better than `y`. - if !comparable1 { + if !comparable1 && sum == 0 { return 0, false // No winner (0). Do not return the pseudo result } - if !comparable2 { + if !comparable2 && sum == 0 { return 0, false // No winner (0). 
Do not return the pseudo result } - if accessResult >= 0 && scanResult >= 0 && matchResult >= 0 && globalResult >= 0 && sum > 0 { + if accessResult >= 0 && scanResult >= 0 && matchResult >= 0 && globalResult >= 0 && corrResult >= 0 && sum > 0 { return 1, lhsPseudo // left wins - also return whether it has statistics (pseudo) or not } - if accessResult <= 0 && scanResult <= 0 && matchResult <= 0 && globalResult <= 0 && sum < 0 { + if accessResult <= 0 && scanResult <= 0 && matchResult <= 0 && globalResult <= 0 && corrResult <= 0 && sum < 0 { return -1, rhsPseudo // right wins - also return whether it has statistics (pseudo) or not } return 0, false // No winner (0). Do not return the pseudo result diff --git a/pkg/planner/core/stats.go b/pkg/planner/core/stats.go index 15bf9252d738a..2fad34d84f185 100644 --- a/pkg/planner/core/stats.go +++ b/pkg/planner/core/stats.go @@ -405,6 +405,9 @@ func detachCondAndBuildRangeForPath( } } path.CountAfterAccess, path.CorrCountAfterAccess, err = cardinality.GetRowCountByIndexRanges(sctx, histColl, path.Index.ID, path.Ranges) + if path.CorrCountAfterAccess == 0 { + path.CorrCountAfterAccess = path.CountAfterAccess + } return err } From ef79ba4a682fedeb28c0e4ebc8d59743ed6d742e Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 31 Jan 2025 14:55:42 -0800 Subject: [PATCH 23/23] testcase1 --- pkg/planner/core/find_best_task.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pkg/planner/core/find_best_task.go b/pkg/planner/core/find_best_task.go index 0d5862af84602..7d9826b7965cd 100644 --- a/pkg/planner/core/find_best_task.go +++ b/pkg/planner/core/find_best_task.go @@ -717,12 +717,8 @@ func compareCorrRatio(lhs, rhs *candidatePath) (int, float64) { lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess } - // rhs has lower index selectivity and lower risk - if rhs.path.CountAfterAccess < lhs.path.CountAfterAccess && 
rhsCorrRatio < lhsCorrRatio { - return -1, lhsCorrRatio - } // lhs has lower risk - if lhsCorrRatio < rhsCorrRatio { + if lhsCorrRatio < rhsCorrRatio && len(lhs.path.TableFilters) <= len(rhs.path.TableFilters) { // And lhs has lower index selectivity if lhs.path.CountAfterAccess < rhs.path.CountAfterAccess { return 1, lhsCorrRatio @@ -735,7 +731,7 @@ func compareCorrRatio(lhs, rhs *candidatePath) (int, float64) { } } // rhs has lower risk - if rhsCorrRatio < lhsCorrRatio { + if rhsCorrRatio < lhsCorrRatio && len(rhs.path.TableFilters) <= len(lhs.path.TableFilters) { // And rhs has lower index selectivity if rhs.path.CountAfterAccess < lhs.path.CountAfterAccess { return -1, rhsCorrRatio