Skip to content

Commit

Permalink
This is an automated cherry-pick of tikv#6831
Browse files Browse the repository at this point in the history
close tikv#6559

Signed-off-by: ti-chi-bot <[email protected]>
  • Loading branch information
nolouch authored and ti-chi-bot committed Jul 26, 2023
1 parent bd94a98 commit d9c8ffb
Show file tree
Hide file tree
Showing 12 changed files with 369 additions and 20 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ static: install-tools
@ echo "gofmt ..."
@ gofmt -s -l -d $(PACKAGE_DIRECTORIES) 2>&1 | awk '{ print } END { if (NR > 0) { exit 1 } }'
@ echo "golangci-lint ..."
@ golangci-lint run --verbose $(PACKAGE_DIRECTORIES)
@ golangci-lint run --verbose $(PACKAGE_DIRECTORIES) --allow-parallel-runners
@ echo "revive ..."
@ revive -formatter friendly -config revive.toml $(PACKAGES)

Expand Down
7 changes: 7 additions & 0 deletions client/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,15 @@ install-tools:

static: install-tools
@ gofmt -s -l -d . 2>&1 | awk '{ print } END { if (NR > 0) { exit 1 } }'
<<<<<<< HEAD
@ golangci-lint run -c ../.golangci.yml ./...
@ revive -formatter friendly -config ../revive.toml .
=======
@ echo "golangci-lint ..."
@ golangci-lint run -c ../.golangci.yml --verbose ./... --allow-parallel-runners
@ echo "revive ..."
@ revive -formatter friendly -config ../revive.toml ./...
>>>>>>> f916e90eb (rule_checker: can replace unhealthPeer with orphanPeer (#6831))

tidy:
@ go mod tidy
Expand Down
28 changes: 17 additions & 11 deletions server/api/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,13 @@ func (suite *transferRegionOperatorTestSuite) TestTransferRegionWithPlacementRul
regionURL := fmt.Sprintf("%s/operators/%d", suite.urlPrefix, region.GetId())
operator := mustReadURL(re, regionURL)
suite.Contains(operator, "operator not found")

convertStepsToStr := func(steps []string) string {
stepStrs := make([]string, len(steps))
for i := range steps {
stepStrs[i] = fmt.Sprintf("%d:{%s}", i, steps[i])
}
return strings.Join(stepStrs, ", ")
}
testCases := []struct {
name string
placementRuleEnable bool
Expand All @@ -231,25 +237,25 @@ func (suite *transferRegionOperatorTestSuite) TestTransferRegionWithPlacementRul
placementRuleEnable: false,
input: []byte(`{"name":"transfer-region", "region_id": 1, "to_store_ids": [2, 3]}`),
expectedError: nil,
expectSteps: strings.Join([]string{
expectSteps: convertStepsToStr([]string{
pdoperator.AddLearner{ToStore: 3, PeerID: 1}.String(),
pdoperator.PromoteLearner{ToStore: 3, PeerID: 1}.String(),
pdoperator.TransferLeader{FromStore: 1, ToStore: 2}.String(),
pdoperator.RemovePeer{FromStore: 1, PeerID: 1}.String(),
}, ", "),
}),
},
{
name: "placement rule disable with peer role",
placementRuleEnable: false,
input: []byte(`{"name":"transfer-region", "region_id": 1, "to_store_ids": [2, 3], "peer_roles":["follower", "leader"]}`),
expectedError: nil,
expectSteps: strings.Join([]string{
expectSteps: convertStepsToStr([]string{
pdoperator.AddLearner{ToStore: 3, PeerID: 2}.String(),
pdoperator.PromoteLearner{ToStore: 3, PeerID: 2}.String(),
pdoperator.TransferLeader{FromStore: 1, ToStore: 2}.String(),
pdoperator.RemovePeer{FromStore: 1, PeerID: 2}.String(),
pdoperator.TransferLeader{FromStore: 2, ToStore: 3}.String(),
}, ", "),
}),
},
{
name: "default placement rule without peer role",
Expand All @@ -262,13 +268,13 @@ func (suite *transferRegionOperatorTestSuite) TestTransferRegionWithPlacementRul
name: "default placement rule with peer role",
placementRuleEnable: true,
input: []byte(`{"name":"transfer-region", "region_id": 1, "to_store_ids": [2, 3], "peer_roles":["follower", "leader"]}`),
expectSteps: strings.Join([]string{
expectSteps: convertStepsToStr([]string{
pdoperator.AddLearner{ToStore: 3, PeerID: 3}.String(),
pdoperator.PromoteLearner{ToStore: 3, PeerID: 3}.String(),
pdoperator.TransferLeader{FromStore: 1, ToStore: 2}.String(),
pdoperator.RemovePeer{FromStore: 1, PeerID: 1}.String(),
pdoperator.TransferLeader{FromStore: 2, ToStore: 3}.String(),
}, ", "),
}),
},
{
name: "default placement rule with invalid input",
Expand Down Expand Up @@ -323,12 +329,12 @@ func (suite *transferRegionOperatorTestSuite) TestTransferRegionWithPlacementRul
},
input: []byte(`{"name":"transfer-region", "region_id": 1, "to_store_ids": [2, 3], "peer_roles":["follower", "leader"]}`),
expectedError: nil,
expectSteps: strings.Join([]string{
expectSteps: convertStepsToStr([]string{
pdoperator.AddLearner{ToStore: 3, PeerID: 5}.String(),
pdoperator.PromoteLearner{ToStore: 3, PeerID: 5}.String(),
pdoperator.TransferLeader{FromStore: 1, ToStore: 3}.String(),
pdoperator.RemovePeer{FromStore: 1, PeerID: 1}.String(),
}, ", "),
}),
},
{
name: "customized placement rule with valid peer role2",
Expand Down Expand Up @@ -363,12 +369,12 @@ func (suite *transferRegionOperatorTestSuite) TestTransferRegionWithPlacementRul
},
input: []byte(`{"name":"transfer-region", "region_id": 1, "to_store_ids": [2, 3], "peer_roles":["leader", "follower"]}`),
expectedError: nil,
expectSteps: strings.Join([]string{
expectSteps: convertStepsToStr([]string{
pdoperator.AddLearner{ToStore: 3, PeerID: 6}.String(),
pdoperator.PromoteLearner{ToStore: 3, PeerID: 6}.String(),
pdoperator.TransferLeader{FromStore: 1, ToStore: 2}.String(),
pdoperator.RemovePeer{FromStore: 1, PeerID: 1}.String(),
}, ", "),
}),
},
}
for _, testCase := range testCases {
Expand Down
89 changes: 85 additions & 4 deletions server/schedule/checker/rule_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,38 @@ var (
errPeerCannotBeWitness = errors.New("peer cannot be witness")
errNoNewLeader = errors.New("no new leader")
errRegionNoLeader = errors.New("region no leader")
<<<<<<< HEAD:server/schedule/checker/rule_checker.go
=======
// WithLabelValues is a heavy operation, define variable to avoid call it every time.
ruleCheckerCounter = checkerCounter.WithLabelValues(ruleChecker, "check")
ruleCheckerPausedCounter = checkerCounter.WithLabelValues(ruleChecker, "paused")
ruleCheckerRegionNoLeaderCounter = checkerCounter.WithLabelValues(ruleChecker, "region-no-leader")
ruleCheckerGetCacheCounter = checkerCounter.WithLabelValues(ruleChecker, "get-cache")
ruleCheckerNeedSplitCounter = checkerCounter.WithLabelValues(ruleChecker, "need-split")
ruleCheckerSetCacheCounter = checkerCounter.WithLabelValues(ruleChecker, "set-cache")
ruleCheckerReplaceDownCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-down")
ruleCheckerPromoteWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "promote-witness")
ruleCheckerReplaceOfflineCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-offline")
ruleCheckerAddRulePeerCounter = checkerCounter.WithLabelValues(ruleChecker, "add-rule-peer")
ruleCheckerNoStoreAddCounter = checkerCounter.WithLabelValues(ruleChecker, "no-store-add")
ruleCheckerNoStoreReplaceCounter = checkerCounter.WithLabelValues(ruleChecker, "no-store-replace")
ruleCheckerFixPeerRoleCounter = checkerCounter.WithLabelValues(ruleChecker, "fix-peer-role")
ruleCheckerFixLeaderRoleCounter = checkerCounter.WithLabelValues(ruleChecker, "fix-leader-role")
ruleCheckerNotAllowLeaderCounter = checkerCounter.WithLabelValues(ruleChecker, "not-allow-leader")
ruleCheckerFixFollowerRoleCounter = checkerCounter.WithLabelValues(ruleChecker, "fix-follower-role")
ruleCheckerNoNewLeaderCounter = checkerCounter.WithLabelValues(ruleChecker, "no-new-leader")
ruleCheckerDemoteVoterRoleCounter = checkerCounter.WithLabelValues(ruleChecker, "demote-voter-role")
ruleCheckerRecentlyPromoteToNonWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "recently-promote-to-non-witness")
ruleCheckerCancelSwitchToWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "cancel-switch-to-witness")
ruleCheckerSetVoterWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "set-voter-witness")
ruleCheckerSetLearnerWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "set-learner-witness")
ruleCheckerSetVoterNonWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "set-voter-non-witness")
ruleCheckerSetLearnerNonWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "set-learner-non-witness")
ruleCheckerMoveToBetterLocationCounter = checkerCounter.WithLabelValues(ruleChecker, "move-to-better-location")
ruleCheckerSkipRemoveOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "skip-remove-orphan-peer")
ruleCheckerRemoveOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "remove-orphan-peer")
ruleCheckerReplaceOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-orphan-peer")
>>>>>>> f916e90eb (rule_checker: can replace unhealthPeer with orphanPeer (#6831)):pkg/schedule/checker/rule_checker.go
)

const maxPendingListLen = 100000
Expand Down Expand Up @@ -390,14 +422,15 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg
if len(fit.OrphanPeers) == 0 {
return nil, nil
}
var pinDownPeer *metapb.Peer
isUnhealthyPeer := func(id uint64) bool {
for _, pendingPeer := range region.GetPendingPeers() {
if pendingPeer.GetId() == id {
for _, downPeer := range region.GetDownPeers() {
if downPeer.Peer.GetId() == id {
return true
}
}
for _, downPeer := range region.GetDownPeers() {
if downPeer.Peer.GetId() == id {
for _, pendingPeer := range region.GetPendingPeers() {
if pendingPeer.GetId() == id {
return true
}
}
Expand All @@ -414,16 +447,56 @@ loopFits:
}
for _, p := range rf.Peers {
if isUnhealthyPeer(p.GetId()) {
// make sure is down peer.
if region.GetDownPeer(p.GetId()) != nil {
pinDownPeer = p
}
hasUnhealthyFit = true
break loopFits
}
}
}

// If hasUnhealthyFit is false, it is safe to delete the OrphanPeer.
if !hasUnhealthyFit {
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, fit.OrphanPeers[0].StoreId)
}

// try to use orphan peers to replace unhealthy down peers.
for _, orphanPeer := range fit.OrphanPeers {
if pinDownPeer != nil {
// make sure the orphan peer is healthy.
if isUnhealthyPeer(orphanPeer.GetId()) {
continue
}
// no consider witness in this path.
if pinDownPeer.GetIsWitness() || orphanPeer.GetIsWitness() {
continue
}
// down peer's store should be down.
if !c.isStoreDownTimeHitMaxDownTime(pinDownPeer.GetStoreId()) {
continue
}
// check if down peer can replace with orphan peer.
dstStore := c.cluster.GetStore(orphanPeer.GetStoreId())
if fit.Replace(pinDownPeer.GetStoreId(), dstStore) {
destRole := pinDownPeer.GetRole()
orphanPeerRole := orphanPeer.GetRole()
ruleCheckerReplaceOrphanPeerCounter.Inc()
switch {
case orphanPeerRole == metapb.PeerRole_Learner && destRole == metapb.PeerRole_Voter:
return operator.CreatePromoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer)
case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner:
return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer)
default:
// destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now.
// destRole never be leader, so we not consider it.
}
}
}
}

// If hasUnhealthyFit is true, try to remove unhealthy orphan peers only if number of OrphanPeers is >= 2.
// Ref https://github.com/tikv/pd/issues/4045
if len(fit.OrphanPeers) >= 2 {
Expand Down Expand Up @@ -462,7 +535,15 @@ func (c *RuleChecker) isDownPeer(region *core.RegionInfo, peer *metapb.Peer) boo

func (c *RuleChecker) isStoreDownTimeHitMaxDownTime(storeID uint64) bool {
store := c.cluster.GetStore(storeID)
<<<<<<< HEAD:server/schedule/checker/rule_checker.go
return store.DownTime() >= c.cluster.GetOpts().GetMaxStoreDownTime()
=======
if store == nil {
log.Warn("lost the store, maybe you are recovering the PD cluster", zap.Uint64("store-id", storeID))
return false
}
return store.DownTime() >= c.cluster.GetCheckerConfig().GetMaxStoreDownTime()
>>>>>>> f916e90eb (rule_checker: can replace unhealthPeer with orphanPeer (#6831)):pkg/schedule/checker/rule_checker.go
}

func (c *RuleChecker) isOfflinePeer(peer *metapb.Peer) bool {
Expand Down
127 changes: 126 additions & 1 deletion server/schedule/checker/rule_checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,6 @@ func (suite *ruleCheckerTestSuite) TestFixRuleWitness() {
op := suite.rc.Check(suite.cluster.GetRegion(1))
suite.NotNil(op)
suite.Equal("add-rule-peer", op.Desc())
fmt.Println(op)
suite.Equal(uint64(3), op.Step(0).(operator.AddLearner).ToStore)
suite.True(op.Step(0).(operator.AddLearner).IsWitness)
}
Expand Down Expand Up @@ -685,6 +684,132 @@ func (suite *ruleCheckerTestSuite) TestPriorityFixOrphanPeer() {
suite.Equal("remove-orphan-peer", op.Desc())
}

func (suite *ruleCheckerTestSuite) TestPriorityFitHealthWithDifferentRole1() {
suite.cluster.SetEnableUseJointConsensus(true)
suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"})
suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"})
suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"})
suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"})
suite.cluster.AddRegionWithLearner(1, 1, []uint64{2, 3}, []uint64{4})
r1 := suite.cluster.GetRegion(1)
suite.cluster.GetStore(3).GetMeta().LastHeartbeat = time.Now().Add(-31 * time.Minute).UnixNano()

// set peer3 to pending and down
r1 = r1.Clone(core.WithPendingPeers([]*metapb.Peer{r1.GetPeer(3)}))
r1 = r1.Clone(core.WithDownPeers([]*pdpb.PeerStats{
{
Peer: r1.GetStorePeer(3),
DownSeconds: 30000,
},
}))
suite.cluster.PutRegion(r1)

op := suite.rc.Check(suite.cluster.GetRegion(1))
suite.Equal(uint64(3), op.Step(0).(operator.ChangePeerV2Enter).DemoteVoters[0].ToStore)
suite.Equal(uint64(4), op.Step(0).(operator.ChangePeerV2Enter).PromoteLearners[0].ToStore)
suite.Equal(uint64(3), op.Step(1).(operator.ChangePeerV2Leave).DemoteVoters[0].ToStore)
suite.Equal(uint64(4), op.Step(1).(operator.ChangePeerV2Leave).PromoteLearners[0].ToStore)
suite.Equal("replace-down-peer-with-orphan-peer", op.Desc())

// set peer3 only pending
r1 = r1.Clone(core.WithDownPeers(nil))
suite.cluster.PutRegion(r1)
op = suite.rc.Check(suite.cluster.GetRegion(1))
suite.Nil(op)
}

func (suite *ruleCheckerTestSuite) TestPriorityFitHealthWithDifferentRole2() {
suite.cluster.SetEnableUseJointConsensus(true)
suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"})
suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"})
suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"})
suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"})
suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"})
suite.cluster.AddLeaderRegion(1, 1, 2, 3, 4, 5)
r1 := suite.cluster.GetRegion(1)

// set peer3 to pending and down, and peer 3 to learner, and store 3 is down
suite.cluster.GetStore(3).GetMeta().LastHeartbeat = time.Now().Add(-31 * time.Minute).UnixNano()
r1 = r1.Clone(core.WithLearners([]*metapb.Peer{r1.GetPeer(3)}))
r1 = r1.Clone(
core.WithPendingPeers([]*metapb.Peer{r1.GetPeer(3)}),
core.WithDownPeers([]*pdpb.PeerStats{
{
Peer: r1.GetStorePeer(3),
DownSeconds: 30000,
},
}),
)
suite.cluster.PutRegion(r1)

// default and test group => 3 voter + 1 learner
err := suite.ruleManager.SetRule(&placement.Rule{
GroupID: "test",
ID: "10",
Role: placement.Learner,
Count: 1,
})
suite.NoError(err)

op := suite.rc.Check(suite.cluster.GetRegion(1))
suite.Equal(uint64(5), op.Step(0).(operator.ChangePeerV2Enter).DemoteVoters[0].ToStore)
suite.Equal(uint64(3), op.Step(1).(operator.RemovePeer).FromStore)
suite.Equal("replace-down-peer-with-orphan-peer", op.Desc())
}

func (suite *ruleCheckerTestSuite) TestPriorityFitHealthPeersAndTiFlash() {
suite.cluster.SetEnableUseJointConsensus(true)
suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"})
suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"})
suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"})
suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4", "engine": "tiflash"})
suite.cluster.AddRegionWithLearner(1, 1, []uint64{2, 3}, []uint64{4})
rule := &placement.Rule{
GroupID: "pd",
ID: "test",
Role: placement.Voter,
Count: 3,
}
rule2 := &placement.Rule{
GroupID: "pd",
ID: "test2",
Role: placement.Learner,
Count: 1,
LabelConstraints: []placement.LabelConstraint{
{
Key: "engine",
Op: placement.In,
Values: []string{"tiflash"},
},
},
}
suite.ruleManager.SetRule(rule)
suite.ruleManager.SetRule(rule2)
suite.ruleManager.DeleteRule("pd", "default")

r1 := suite.cluster.GetRegion(1)
// set peer3 to pending and down
r1 = r1.Clone(core.WithPendingPeers([]*metapb.Peer{r1.GetPeer(3)}))
r1 = r1.Clone(core.WithDownPeers([]*pdpb.PeerStats{
{
Peer: r1.GetStorePeer(3),
DownSeconds: 30000,
},
}))
suite.cluster.PutRegion(r1)
suite.cluster.GetStore(3).GetMeta().LastHeartbeat = time.Now().Add(-31 * time.Minute).UnixNano()

op := suite.rc.Check(suite.cluster.GetRegion(1))
// should not promote tiflash peer
suite.Nil(op)

// scale a node, can replace the down peer
suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"})
op = suite.rc.Check(suite.cluster.GetRegion(1))
suite.NotNil(op)
suite.Equal("fast-replace-rule-down-peer", op.Desc())
}

func (suite *ruleCheckerTestSuite) TestIssue3293() {
suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"})
suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host1"})
Expand Down
Loading

0 comments on commit d9c8ffb

Please sign in to comment.