Skip to content

Commit

Permalink
metrics: fix wrong metrics for balance region (tikv#6969)
Browse files Browse the repository at this point in the history
close tikv#6970

Signed-off-by: bufferflies <[email protected]>

Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com>
  • Loading branch information
bufferflies and ti-chi-bot[bot] authored Aug 23, 2023
1 parent de5f53e commit ebceb83
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 90 deletions.
8 changes: 4 additions & 4 deletions metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -7091,14 +7091,14 @@
"steppedLine": false,
"targets": [
{
"expr": "-sum(delta(pd_scheduler_balance_leader{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store-out\", instance=\"$instance\", type=\"move-leader\"}[1m])) by (store)",
"expr": "-sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=\"$instance\",type=\"balance-leader-scheduler\"}[1m])) by (source)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "store-{{source}}",
"refId": "A"
},
{
"expr": "sum(delta(pd_scheduler_balance_leader{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store-in\", instance=\"$instance\", type=\"move-leader\"}[1m])) by (store)",
"expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=\"$instance\",type=\"balance-leader-scheduler\"}[1m])) by (target)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "store-{{target}}",
Expand Down Expand Up @@ -7195,14 +7195,14 @@
"steppedLine": false,
"targets": [
{
"expr": "-sum(delta(pd_scheduler_balance_region{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store-out\", instance=\"$instance\", type=\"move-peer\"}[1m])) by (store)",
"expr": "-sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=\"$instance\",type=\"balance-region-scheduler\"}[1m])) by (source)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "store-{{source}}",
"refId": "A"
},
{
"expr": "sum(delta(pd_scheduler_balance_region{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store-in\", instance=\"$instance\", type=\"move-peer\"}[1m])) by (store)",
"expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=\"$instance\",type=\"balance-region-scheduler\"}[1m])) by (target)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "store-{{target}}",
Expand Down
13 changes: 0 additions & 13 deletions pkg/schedule/schedulers/balance_leader.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (

"github.com/gorilla/mux"
"github.com/pingcap/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/tikv/pd/pkg/core"
"github.com/tikv/pd/pkg/core/constant"
"github.com/tikv/pd/pkg/errs"
Expand Down Expand Up @@ -162,7 +161,6 @@ type balanceLeaderScheduler struct {
conf *balanceLeaderSchedulerConfig
handler http.Handler
filters []filter.Filter
counter *prometheus.CounterVec
filterCounter *filter.Counter
}

Expand All @@ -176,7 +174,6 @@ func newBalanceLeaderScheduler(opController *operator.Controller, conf *balanceL
name: BalanceLeaderName,
conf: conf,
handler: newBalanceLeaderHandler(conf),
counter: balanceLeaderCounter,
filterCounter: filter.NewCounter(filter.BalanceLeader.String()),
}
for _, option := range options {
Expand All @@ -196,13 +193,6 @@ func (l *balanceLeaderScheduler) ServeHTTP(w http.ResponseWriter, r *http.Reques
// BalanceLeaderCreateOption is used to create a scheduler with an option.
type BalanceLeaderCreateOption func(s *balanceLeaderScheduler)

// WithBalanceLeaderCounter sets the counter for the scheduler.
func WithBalanceLeaderCounter(counter *prometheus.CounterVec) BalanceLeaderCreateOption {
return func(s *balanceLeaderScheduler) {
s.counter = counter
}
}

// WithBalanceLeaderName sets the name for the scheduler.
func WithBalanceLeaderName(name string) BalanceLeaderCreateOption {
return func(s *balanceLeaderScheduler) {
Expand Down Expand Up @@ -544,9 +534,6 @@ func (l *balanceLeaderScheduler) createOperator(solver *solver, collector *plan.
)
op.FinishedCounters = append(op.FinishedCounters,
balanceDirectionCounter.WithLabelValues(l.GetName(), solver.SourceMetricLabel(), solver.TargetMetricLabel()),
// TODO: pre-allocate gauge metrics
l.counter.WithLabelValues("move-leader", solver.SourceMetricLabel()+"-out"),
l.counter.WithLabelValues("move-leader", solver.TargetMetricLabel()+"-in"),
)
op.AdditionalInfos["sourceScore"] = strconv.FormatFloat(solver.sourceScore, 'f', 2, 64)
op.AdditionalInfos["targetScore"] = strconv.FormatFloat(solver.targetScore, 'f', 2, 64)
Expand Down
29 changes: 8 additions & 21 deletions pkg/schedule/schedulers/balance_region.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (

"github.com/pingcap/kvproto/pkg/metapb"
"github.com/pingcap/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/tikv/pd/pkg/core"
"github.com/tikv/pd/pkg/core/constant"
sche "github.com/tikv/pd/pkg/schedule/core"
Expand All @@ -39,14 +38,14 @@ const (

var (
// WithLabelValues is a heavy operation, define variable to avoid call it every time.
balanceRegionScheduleCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "schedule")
balanceRegionNoRegionCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "no-region")
balanceRegionHotCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "region-hot")
balanceRegionNoLeaderCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "no-leader")
balanceRegionNewOpCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "new-operator")
balanceRegionSkipCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "skip")
balanceRegionCreateOpFailCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "create-operator-fail")
balanceRegionNoReplacementCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "no-replacement")
balanceRegionScheduleCounter = schedulerCounter.WithLabelValues(BalanceRegionName, "schedule")
balanceRegionNoRegionCounter = schedulerCounter.WithLabelValues(BalanceRegionName, "no-region")
balanceRegionHotCounter = schedulerCounter.WithLabelValues(BalanceRegionName, "region-hot")
balanceRegionNoLeaderCounter = schedulerCounter.WithLabelValues(BalanceRegionName, "no-leader")
balanceRegionNewOpCounter = schedulerCounter.WithLabelValues(BalanceRegionName, "new-operator")
balanceRegionSkipCounter = schedulerCounter.WithLabelValues(BalanceRegionName, "skip")
balanceRegionCreateOpFailCounter = schedulerCounter.WithLabelValues(BalanceRegionName, "create-operator-fail")
balanceRegionNoReplacementCounter = schedulerCounter.WithLabelValues(BalanceRegionName, "no-replacement")
)

type balanceRegionSchedulerConfig struct {
Expand All @@ -59,7 +58,6 @@ type balanceRegionScheduler struct {
*retryQuota
conf *balanceRegionSchedulerConfig
filters []filter.Filter
counter *prometheus.CounterVec
filterCounter *filter.Counter
}

Expand All @@ -71,7 +69,6 @@ func newBalanceRegionScheduler(opController *operator.Controller, conf *balanceR
BaseScheduler: base,
retryQuota: newRetryQuota(),
conf: conf,
counter: balanceRegionCounter,
filterCounter: filter.NewCounter(filter.BalanceRegion.String()),
}
for _, setOption := range opts {
Expand All @@ -87,13 +84,6 @@ func newBalanceRegionScheduler(opController *operator.Controller, conf *balanceR
// BalanceRegionCreateOption is used to create a scheduler with an option.
type BalanceRegionCreateOption func(s *balanceRegionScheduler)

// WithBalanceRegionCounter sets the counter for the scheduler.
func WithBalanceRegionCounter(counter *prometheus.CounterVec) BalanceRegionCreateOption {
return func(s *balanceRegionScheduler) {
s.counter = counter
}
}

// WithBalanceRegionName sets the name for the scheduler.
func WithBalanceRegionName(name string) BalanceRegionCreateOption {
return func(s *balanceRegionScheduler) {
Expand Down Expand Up @@ -286,9 +276,6 @@ func (s *balanceRegionScheduler) transferPeer(solver *solver, collector *plan.Co
targetLabel := strconv.FormatUint(targetID, 10)
op.FinishedCounters = append(op.FinishedCounters,
balanceDirectionCounter.WithLabelValues(s.GetName(), sourceLabel, targetLabel),
// TODO: pre-allocate gauge metrics
s.counter.WithLabelValues("move-peer", sourceLabel+"-out"),
s.counter.WithLabelValues("move-peer", targetLabel+"-in"),
)
op.AdditionalInfos["sourceScore"] = strconv.FormatFloat(solver.sourceScore, 'f', 2, 64)
op.AdditionalInfos["targetScore"] = strconv.FormatFloat(solver.targetScore, 'f', 2, 64)
Expand Down
2 changes: 1 addition & 1 deletion pkg/schedule/schedulers/balance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ func (suite *balanceLeaderRangeSchedulerTestSuite) TestSingleRangeBalance() {
suite.NotEmpty(ops)
suite.Len(ops, 1)
suite.Len(ops[0].Counters, 1)
suite.Len(ops[0].FinishedCounters, 3)
suite.Len(ops[0].FinishedCounters, 1)
lb, err = CreateScheduler(BalanceLeaderType, suite.oc, storage.NewStorageWithMemoryBackend(), ConfigSliceDecoder(BalanceLeaderType, []string{"h", "n"}))
suite.NoError(err)
ops, _ = lb.Schedule(suite.tc, false)
Expand Down
5 changes: 2 additions & 3 deletions pkg/schedule/schedulers/hot_region.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,7 @@ var (
writeSkipKeyDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "write-skip-key-uniform-store")
readSkipQueryDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "read-skip-query-uniform-store")
writeSkipQueryDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "write-skip-query-uniform-store")

pendingOpFails = schedulerStatus.WithLabelValues(HotRegionName, "pending_op_fails")
pendingOpFailsStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "pending-op-fails")
)

type baseHotScheduler struct {
Expand Down Expand Up @@ -307,7 +306,7 @@ func (h *hotScheduler) tryAddPendingInfluence(op *operator.Operator, srcStore []
regionID := op.RegionID()
_, ok := h.regionPendings[regionID]
if ok {
pendingOpFails.Inc()
pendingOpFailsStoreCounter.Inc()
return false
}

Expand Down
47 changes: 1 addition & 46 deletions pkg/schedule/schedulers/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,6 @@ var (
Help: "Counter of scheduler events.",
}, []string{"type", "name"})

schedulerStatus = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "pd",
Subsystem: "scheduler",
Name: "inner_status",
Help: "Inner status of the scheduler.",
}, []string{"type", "name"})

// TODO: pre-allocate gauge metrics
opInfluenceStatus = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Expand All @@ -59,14 +51,6 @@ var (
Help: "Store status for schedule",
}, []string{"scheduler"})

balanceLeaderCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "pd",
Subsystem: "scheduler",
Name: "balance_leader",
Help: "Counter of balance leader scheduler.",
}, []string{"type", "store"})

balanceWitnessCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "pd",
Expand All @@ -75,14 +59,6 @@ var (
Help: "Counter of balance witness scheduler.",
}, []string{"type", "store"})

balanceRegionCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "pd",
Subsystem: "scheduler",
Name: "balance_region",
Help: "Counter of balance region scheduler.",
}, []string{"type", "store"})

// TODO: pre-allocate counter metrics
hotSchedulerResultCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Expand All @@ -109,22 +85,6 @@ var (
Help: "Counter of hot region scheduler.",
}, []string{"type", "rw", "store", "direction", "dim"})

scatterRangeLeaderCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "pd",
Subsystem: "scheduler",
Name: "scatter_range_leader",
Help: "Counter of scatter range leader scheduler.",
}, []string{"type", "store"})

scatterRangeRegionCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "pd",
Subsystem: "scheduler",
Name: "scatter_range_region",
Help: "Counter of scatter range region scheduler.",
}, []string{"type", "store"})

hotPendingStatus = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "pd",
Expand Down Expand Up @@ -163,7 +123,7 @@ var (
Namespace: "pd",
Subsystem: "scheduler",
Name: "store_slow_trend_misc",
Help: "Store trend internal uncatelogued values",
Help: "Store trend internal uncatalogued values",
}, []string{"type"})

// HotPendingSum is the sum of pending influence in hot region scheduler.
Expand All @@ -179,15 +139,10 @@ var (
func init() {
prometheus.MustRegister(schedulerStatusGauge)
prometheus.MustRegister(schedulerCounter)
prometheus.MustRegister(schedulerStatus)
prometheus.MustRegister(balanceLeaderCounter)
prometheus.MustRegister(balanceRegionCounter)
prometheus.MustRegister(balanceWitnessCounter)
prometheus.MustRegister(hotSchedulerResultCounter)
prometheus.MustRegister(hotDirectionCounter)
prometheus.MustRegister(balanceDirectionCounter)
prometheus.MustRegister(scatterRangeLeaderCounter)
prometheus.MustRegister(scatterRangeRegionCounter)
prometheus.MustRegister(opInfluenceStatus)
prometheus.MustRegister(tolerantResourceStatus)
prometheus.MustRegister(hotPendingStatus)
Expand Down
2 changes: 0 additions & 2 deletions pkg/schedule/schedulers/scatter_range.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,11 @@ func newScatterRangeScheduler(opController *operator.Controller, config *scatter
opController,
&balanceLeaderSchedulerConfig{Ranges: []core.KeyRange{core.NewKeyRange("", "")}},
WithBalanceLeaderName("scatter-range-leader"),
WithBalanceLeaderCounter(scatterRangeLeaderCounter),
),
balanceRegion: newBalanceRegionScheduler(
opController,
&balanceRegionSchedulerConfig{Ranges: []core.KeyRange{core.NewKeyRange("", "")}},
WithBalanceRegionName("scatter-range-region"),
WithBalanceRegionCounter(scatterRangeRegionCounter),
),
}
return scheduler
Expand Down

0 comments on commit ebceb83

Please sign in to comment.