From 52389b04f21726b54117ee29acf62923480ccbde Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 30 May 2024 15:07:21 +0800 Subject: [PATCH 01/17] simulator: make store,region,replica configurable in cases (#8215) ref tikv/pd#8135 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/utils/configutil/configutil.go | 7 +++ tools/pd-simulator/main.go | 34 +++++------ .../pd-simulator/simulator/cases/add_nodes.go | 49 +++++++-------- .../simulator/cases/add_nodes_dynamic.go | 60 +++++++++---------- .../simulator/cases/balance_leader.go | 42 +++++++------ .../simulator/cases/balance_region.go | 45 +++++++------- tools/pd-simulator/simulator/cases/cases.go | 42 +++++-------- .../simulator/cases/delete_nodes.go | 55 +++++++++-------- .../cases/diagnose_label_isolation.go | 7 ++- .../simulator/cases/diagnose_rule.go | 5 +- .../pd-simulator/simulator/cases/hot_read.go | 32 +++++----- .../pd-simulator/simulator/cases/hot_write.go | 33 +++++----- .../simulator/cases/import_data.go | 33 +++++----- .../simulator/cases/makeup_down_replica.go | 55 +++++++---------- .../simulator/cases/region_merge.go | 41 ++++++------- .../simulator/cases/region_split.go | 25 ++++---- tools/pd-simulator/simulator/client.go | 5 +- .../simulator/{ => config}/config.go | 23 ++++--- tools/pd-simulator/simulator/conn.go | 3 +- tools/pd-simulator/simulator/drive.go | 11 ++-- tools/pd-simulator/simulator/node.go | 5 +- tools/pd-simulator/simulator/raft.go | 5 +- .../simulator/simutil/case_config.go | 34 ----------- tools/pd-simulator/simulator/task.go | 2 +- 24 files changed, 303 insertions(+), 350 deletions(-) rename tools/pd-simulator/simulator/{ => config}/config.go (85%) delete mode 100644 tools/pd-simulator/simulator/simutil/case_config.go diff --git a/pkg/utils/configutil/configutil.go b/pkg/utils/configutil/configutil.go index 2e7c74d9f8c..086f74ff842 100644 --- a/pkg/utils/configutil/configutil.go +++ b/pkg/utils/configutil/configutil.go @@ -171,3 +171,10 @@ func AdjustPath(p *string) { *p = absPath } } + +// AdjustBool adjusts the value of a bool variable. 
+func AdjustBool(v *bool, defValue bool) { + if !*v { + *v = defValue + } +} diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 73f4a0bba12..04de914f5f0 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -38,21 +38,19 @@ import ( "github.com/tikv/pd/tools/pd-analysis/analysis" "github.com/tikv/pd/tools/pd-simulator/simulator" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) var ( - pdAddr = flag.String("pd-endpoints", "", "pd address") - configFile = flag.String("config", "conf/simconfig.toml", "config file") - caseName = flag.String("case", "", "case name") - serverLogLevel = flag.String("serverLog", "info", "pd server log level") - simLogLevel = flag.String("simLog", "info", "simulator log level") - simLogFile = flag.String("log-file", "", "simulator log file") - regionNum = flag.Int("regionNum", 0, "regionNum of one store") - storeNum = flag.Int("storeNum", 0, "storeNum") - enableTransferRegionCounter = flag.Bool("enableTransferRegionCounter", false, "enableTransferRegionCounter") - statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") + pdAddr = flag.String("pd-endpoints", "", "pd address") + configFile = flag.String("config", "conf/simconfig.toml", "config file") + caseName = flag.String("case", "", "case name") + serverLogLevel = flag.String("serverLog", "info", "pd server log level") + simLogLevel = flag.String("simLog", "info", "simulator log level") + simLogFile = flag.String("log-file", "", "simulator log file") + statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") ) func main() { @@ -63,14 +61,12 @@ func main() { flag.Parse() simutil.InitLogger(*simLogLevel, *simLogFile) - simutil.InitCaseConfig(*storeNum, *regionNum, *enableTransferRegionCounter) statistics.Denoising = false - if simutil.CaseConfigure.EnableTransferRegionCounter { - analysis.GetTransferCounter().Init(simutil.CaseConfigure.StoreNum, simutil.CaseConfigure.RegionNum) - } - schedulers.Register() // register schedulers, which is needed by simConfig.Adjust - simConfig := simulator.NewSimConfig(*serverLogLevel) + simConfig := sc.NewSimConfig(*serverLogLevel) + if simConfig.EnableTransferRegionCounter { + analysis.GetTransferCounter().Init(simConfig.TotalStore, simConfig.TotalRegion) + } var meta toml.MetaData var err error if *configFile != "" { @@ -97,7 +93,7 @@ func main() { } } -func run(simCase string, simConfig *simulator.SimConfig) { +func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { go runHTTPServer() simStart(*pdAddr, simCase, simConfig) @@ -136,7 +132,7 @@ func runHTTPServer() { } // NewSingleServer creates a pd server for simulator. 
-func NewSingleServer(ctx context.Context, simConfig *simulator.SimConfig) (*server.Server, testutil.CleanupFunc) { +func NewSingleServer(ctx context.Context, simConfig *sc.SimConfig) (*server.Server, testutil.CleanupFunc) { err := logutil.SetupLogger(simConfig.ServerConfig.Log, &simConfig.ServerConfig.Logger, &simConfig.ServerConfig.LogProps) if err == nil { log.ReplaceGlobals(simConfig.ServerConfig.Logger, simConfig.ServerConfig.LogProps) @@ -161,7 +157,7 @@ func cleanServer(cfg *config.Config) { os.RemoveAll(cfg.DataDir) } -func simStart(pdAddr string, simCase string, simConfig *simulator.SimConfig, clean ...testutil.CleanupFunc) { +func simStart(pdAddr string, simCase string, simConfig *sc.SimConfig, clean ...testutil.CleanupFunc) { start := time.Now() driver, err := simulator.NewDriver(pdAddr, simCase, simConfig) if err != nil { diff --git a/tools/pd-simulator/simulator/cases/add_nodes.go b/tools/pd-simulator/simulator/cases/add_nodes.go index 241b34a9473..5c73fe9764c 100644 --- a/tools/pd-simulator/simulator/cases/add_nodes.go +++ b/tools/pd-simulator/simulator/cases/add_nodes.go @@ -15,35 +15,35 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newAddNodes() *Case { +func newAddNodes(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyRatio := rand.Float64() // the ratio of noEmpty store to total store - noEmptyStoreNum := getNoEmptyStoreNum(storeNum, noEmptyRatio) + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+1)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+2)%noEmptyStoreNum + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%noEmptyStoreNum + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -54,21 +54,18 @@ func newAddNodes() *Case { }) } - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - leaderCounts := make([]int, 0, storeNum) - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - regionCount := regions.GetStoreRegionCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum, threshold) + peerCount := regions.GetStoreRegionCount(uint64(i)) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } + if !isUniform(peerCount, totalRegion*replica/totalStore) { + return false + } } - - 
simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go index 59b0b54e1ca..aa585b48923 100644 --- a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go +++ b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go @@ -15,24 +15,22 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newAddNodesDynamic() *Case { +func newAddNodesDynamic(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyRatio := rand.Float64() // the ratio of noEmpty store to total store - noEmptyStoreNum := getNoEmptyStoreNum(storeNum, noEmptyRatio) + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - for i := 1; i <= int(noEmptyStoreNum); i++ { + for i := 0; i < noEmptyStoreNum; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, @@ -40,15 +38,17 @@ func newAddNodesDynamic() *Case { } var ids []uint64 - for i := 1; i <= storeNum-int(noEmptyStoreNum); i++ { + for i := 0; i < totalStore-noEmptyStoreNum; i++ { ids = append(ids, IDAllocator.nextID()) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+1)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+2)%noEmptyStoreNum + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%noEmptyStoreNum + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -59,11 +59,11 @@ func newAddNodesDynamic() *Case { }) } - numNodes := int(noEmptyStoreNum) + currentStoreCount := noEmptyStoreNum e := &AddNodesDescriptor{} e.Step = func(tick int64) uint64 { - if tick%100 == 0 && numNodes < storeNum { - numNodes++ + if tick%100 == 0 && currentStoreCount < totalStore { + currentStoreCount++ nodeID := ids[0] ids = append(ids[:0], ids[1:]...) 
return nodeID @@ -72,21 +72,21 @@ func newAddNodesDynamic() *Case { } simCase.Events = []EventDescriptor{e} - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := numNodes == storeNum - leaderCounts := make([]int, 0, numNodes) - regionCounts := make([]int, 0, numNodes) - for i := 1; i <= numNodes; i++ { + if currentStoreCount != totalStore { + return false + } + for i := 1; i <= currentStoreCount; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - regionCount := regions.GetStoreRegionCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum, threshold) + peerCount := regions.GetStoreRegionCount(uint64(i)) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } + if !isUniform(peerCount, totalRegion*replica/totalStore) { + return false + } } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/balance_leader.go b/tools/pd-simulator/simulator/cases/balance_leader.go index bbc7ce97f68..c5315f85d8e 100644 --- a/tools/pd-simulator/simulator/cases/balance_leader.go +++ b/tools/pd-simulator/simulator/cases/balance_leader.go @@ -18,28 +18,35 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newBalanceLeader() *Case { +func newBalanceLeader(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeNum)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%(storeNum-1)) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%(storeNum-1)) + 1}, + leaderStoreID := simCase.Stores[totalStore-1].ID + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: leaderStoreID, + }) + for j := 1; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%(totalStore-1) + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -50,17 +57,14 @@ func newBalanceLeader() *Case { }) } - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - leaderCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - res = res && isUniform(leaderCount, regionNum/3, threshold) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } } - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts)) - return res + return 
true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/balance_region.go b/tools/pd-simulator/simulator/cases/balance_region.go index 3b0c46f1670..a559a335c97 100644 --- a/tools/pd-simulator/simulator/cases/balance_region.go +++ b/tools/pd-simulator/simulator/cases/balance_region.go @@ -19,21 +19,18 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRedundantBalanceRegion() *Case { +func newRedundantBalanceRegion(config *sc.SimConfig) *Case { var simCase Case - storeNum := simutil.CaseConfigure.StoreNum - regionNum := simutil.CaseConfigure.RegionNum - if storeNum == 0 || regionNum == 0 { - storeNum, regionNum = 6, 4000 - } + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) - for i := 0; i < storeNum; i++ { + for i := 0; i < totalStore; i++ { s := &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, @@ -44,11 +41,13 @@ func newRedundantBalanceRegion() *Case { simCase.Stores = append(simCase.Stores, s) } - for i := 0; i < regionNum; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i%storeNum + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -57,30 +56,26 @@ func newRedundantBalanceRegion() *Case { }) } - storesLastUpdateTime := make([]int64, storeNum+1) - storeLastAvailable := make([]uint64, storeNum+1) + storesLastUpdateTime := make([]int64, totalStore+1) + storeLastAvailable := make([]uint64, totalStore+1) simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { - res := true curTime := time.Now().Unix() - storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { available := stats[i].GetAvailable() - storesAvailable = append(storesAvailable, available) if curTime-storesLastUpdateTime[i] > 60 { if storeLastAvailable[i] != available { - res = false + return false } if stats[i].ToCompactionSize != 0 { - res = false + return false } storesLastUpdateTime[i] = curTime storeLastAvailable[i] = available } else { - res = false + return false } } - simutil.Logger.Info("current counts", zap.Uint64s("storesAvailable", storesAvailable)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index 0a8967a8d86..f2e79a81924 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -15,12 +15,14 @@ package cases import ( + "math/rand" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) // Store is used to simulate tikv. 
@@ -86,7 +88,7 @@ func (a *idAllocator) GetID() uint64 { var IDAllocator idAllocator // CaseMap is a mapping of the cases to the their corresponding initialize functions. -var CaseMap = map[string]func() *Case{ +var CaseMap = map[string]func(*config.SimConfig) *Case{ "balance-leader": newBalanceLeader, "redundant-balance-region": newRedundantBalanceRegion, "add-nodes": newAddNodes, @@ -106,43 +108,27 @@ var CaseMap = map[string]func() *Case{ } // NewCase creates a new case. -func NewCase(name string) *Case { +func NewCase(name string, simConfig *config.SimConfig) *Case { if f, ok := CaseMap[name]; ok { - return f() + return f(simConfig) } return nil } -func leaderAndRegionIsUniform(leaderCount, regionCount, regionNum int, threshold float64) bool { - return isUniform(leaderCount, regionNum/3, threshold) && isUniform(regionCount, regionNum, threshold) -} - -func isUniform(count, meanCount int, threshold float64) bool { +func isUniform(count, meanCount int) bool { + threshold := 0.05 maxCount := int((1.0 + threshold) * float64(meanCount)) minCount := int((1.0 - threshold) * float64(meanCount)) return minCount <= count && count <= maxCount } -func getStoreNum() int { - storeNum := simutil.CaseConfigure.StoreNum - if storeNum < 3 { - simutil.Logger.Fatal("store num should be larger than or equal to 3") - } - return storeNum -} - -func getRegionNum() int { - regionNum := simutil.CaseConfigure.RegionNum - if regionNum <= 0 { - simutil.Logger.Fatal("region num should be larger than 0") +func getNoEmptyStoreNum(storeNum int, replica int) int { + noEmptyStoreNum := rand.Intn(storeNum) + if noEmptyStoreNum < replica { + return replica } - return regionNum -} - -func getNoEmptyStoreNum(storeNum int, noEmptyRatio float64) uint64 { - noEmptyStoreNum := uint64(float64(storeNum) * noEmptyRatio) - if noEmptyStoreNum < 3 || noEmptyStoreNum == uint64(storeNum) { - noEmptyStoreNum = 3 + if noEmptyStoreNum == storeNum { + return storeNum - 1 } return noEmptyStoreNum } diff --git a/tools/pd-simulator/simulator/cases/delete_nodes.go b/tools/pd-simulator/simulator/cases/delete_nodes.go index 4ba8e5064a4..80650cf109d 100644 --- a/tools/pd-simulator/simulator/cases/delete_nodes.go +++ b/tools/pd-simulator/simulator/cases/delete_nodes.go @@ -20,28 +20,31 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newDeleteNodes() *Case { +func newDeleteNodes(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyStoreNum := storeNum - 1 - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := totalStore - 1 + for i := 1; i <= totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum) + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: 
IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -57,12 +60,12 @@ func newDeleteNodes() *Case { ids = append(ids, store.ID) } - numNodes := storeNum + currentStoreCount := totalStore e := &DeleteNodesDescriptor{} e.Step = func(tick int64) uint64 { - if numNodes > noEmptyStoreNum && tick%100 == 0 { - idx := rand.Intn(numNodes) - numNodes-- + if currentStoreCount > noEmptyStoreNum && tick%100 == 0 { + idx := rand.Intn(currentStoreCount) + currentStoreCount-- nodeID := ids[idx] ids = append(ids[:idx], ids[idx+1:]...) return nodeID @@ -71,21 +74,21 @@ func newDeleteNodes() *Case { } simCase.Events = []EventDescriptor{e} - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := numNodes == noEmptyStoreNum - leaderCounts := make([]int, 0, numNodes) - regionCounts := make([]int, 0, numNodes) + if currentStoreCount != noEmptyStoreNum { + return false + } for _, i := range ids { leaderCount := regions.GetStoreLeaderCount(i) - regionCount := regions.GetStoreRegionCount(i) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum*storeNum/noEmptyStoreNum, threshold) + peerCount := regions.GetStoreRegionCount(i) + if !isUniform(leaderCount, totalRegion/noEmptyStoreNum) { + return false + } + if !isUniform(peerCount, totalRegion*replica/noEmptyStoreNum) { + return false + } } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go index 7fa50e56197..09037136608 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go +++ b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go @@ -21,12 +21,13 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newLabelNotMatch1() *Case { +func newLabelNotMatch1(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"host"} @@ -88,7 +89,7 @@ func newLabelNotMatch1() *Case { return &simCase } -func newLabelIsolation1() *Case { +func newLabelIsolation1(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"host"} @@ -154,7 +155,7 @@ func newLabelIsolation1() *Case { return &simCase } -func newLabelIsolation2() *Case { +func newLabelIsolation2(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"dc", "zone", "host"} diff --git a/tools/pd-simulator/simulator/cases/diagnose_rule.go b/tools/pd-simulator/simulator/cases/diagnose_rule.go index 15c5942d810..5d34e051071 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_rule.go +++ b/tools/pd-simulator/simulator/cases/diagnose_rule.go @@ -21,12 +21,13 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newRule1() *Case { +func 
newRule1(_ *sc.SimConfig) *Case { var simCase Case simCase.Rules = make([]*placement.Rule, 0) @@ -126,7 +127,7 @@ func newRule1() *Case { return &simCase } -func newRule2() *Case { +func newRule2(_ *sc.SimConfig) *Case { var simCase Case simCase.Rules = make([]*placement.Rule, 0) diff --git a/tools/pd-simulator/simulator/cases/hot_read.go b/tools/pd-simulator/simulator/cases/hot_read.go index d4ec6831d95..50ad08d6011 100644 --- a/tools/pd-simulator/simulator/cases/hot_read.go +++ b/tools/pd-simulator/simulator/cases/hot_read.go @@ -15,35 +15,34 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newHotRead() *Case { +func newHotRead(config *sc.SimConfig) *Case { var simCase Case - - storeNum, regionNum := getStoreNum(), getRegionNum() + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) // Initialize the cluster - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -56,7 +55,7 @@ func newHotRead() *Case { // Events description // select regions on store 1 as hot read regions. - selectRegionNum := 4 * storeNum + selectRegionNum := 4 * totalStore readFlow := make(map[uint64]int64, selectRegionNum) for _, r := range simCase.Regions { if r.Leader.GetStoreId() == 1 { @@ -73,12 +72,11 @@ func newHotRead() *Case { simCase.Events = []EventDescriptor{e} // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, storeNum) + leaderCount := make([]int, totalStore) for id := range readFlow { leaderStore := regions.GetRegion(id).GetLeader().GetStoreId() leaderCount[int(leaderStore-1)]++ } - simutil.Logger.Info("current hot region counts", zap.Reflect("hot-region", leaderCount)) // check count diff < 2. 
var min, max int diff --git a/tools/pd-simulator/simulator/cases/hot_write.go b/tools/pd-simulator/simulator/cases/hot_write.go index 8428afa75b5..a30afd1a8ec 100644 --- a/tools/pd-simulator/simulator/cases/hot_write.go +++ b/tools/pd-simulator/simulator/cases/hot_write.go @@ -15,34 +15,34 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newHotWrite() *Case { +func newHotWrite(config *sc.SimConfig) *Case { var simCase Case + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) - storeNum, regionNum := getStoreNum(), getRegionNum() // Initialize the cluster - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -55,7 +55,7 @@ func newHotWrite() *Case { // Events description // select regions on store 1 as hot write regions. - selectStoreNum := storeNum + selectStoreNum := totalStore writeFlow := make(map[uint64]int64, selectStoreNum) for _, r := range simCase.Regions { if r.Leader.GetStoreId() == 1 { @@ -74,8 +74,8 @@ func newHotWrite() *Case { // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, storeNum) - peerCount := make([]int, storeNum) + leaderCount := make([]int, totalStore) + peerCount := make([]int, totalStore) for id := range writeFlow { region := regions.GetRegion(id) leaderCount[int(region.GetLeader().GetStoreId()-1)]++ @@ -83,7 +83,6 @@ func newHotWrite() *Case { peerCount[int(p.GetStoreId()-1)]++ } } - simutil.Logger.Info("current hot region counts", zap.Reflect("leader", leaderCount), zap.Reflect("peer", peerCount)) // check count diff <= 2. 
var minLeader, maxLeader, minPeer, maxPeer int diff --git a/tools/pd-simulator/simulator/cases/import_data.go b/tools/pd-simulator/simulator/cases/import_data.go index 6cf3b79a736..b9f448a6cf6 100644 --- a/tools/pd-simulator/simulator/cases/import_data.go +++ b/tools/pd-simulator/simulator/cases/import_data.go @@ -17,7 +17,6 @@ package cases import ( "bytes" "fmt" - "math/rand" "os" "github.com/docker/go-units" @@ -26,27 +25,33 @@ import ( "github.com/pingcap/log" "github.com/tikv/pd/pkg/codec" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newImportData() *Case { +func newImportData(config *sc.SimConfig) *Case { var simCase Case + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + // Initialize the cluster - for i := 1; i <= 10; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < getRegionNum(); i++ { - storeIDs := rand.Perm(10) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -65,7 +70,7 @@ func newImportData() *Case { table12 := string(codec.EncodeBytes(codec.GenerateTableKey(12))) table13 := string(codec.EncodeBytes(codec.GenerateTableKey(13))) e.Step = func(tick int64) map[string]int64 { - if tick > int64(getRegionNum())/10 { + if tick > int64(totalRegion)/10 { return nil } return map[string]int64{ @@ -141,14 +146,14 @@ func newImportData() *Case { if dev > 0.02 { simutil.Logger.Warn("Not balanced, change scheduler or store limit", zap.Float64("dev score", dev)) } - if checkCount > uint64(getRegionNum())/5 { + if checkCount > uint64(totalRegion)/5 { isEnd = true - } else if checkCount > uint64(getRegionNum())/10 { + } else if checkCount > uint64(totalRegion)/10 { isEnd = dev < 0.01 } if isEnd { - renderPlot("new_region.html", newRegionCount, int(checkCount), 0, getRegionNum()/10) - renderPlot("all_region.html", allRegionCount, int(checkCount), 28*getRegionNum()/100, getRegionNum()/3) + renderPlot("new_region.html", newRegionCount, int(checkCount), 0, totalRegion/10) + renderPlot("all_region.html", allRegionCount, int(checkCount), 28*totalRegion/100, totalRegion/3) } return isEnd } diff --git a/tools/pd-simulator/simulator/cases/makeup_down_replica.go b/tools/pd-simulator/simulator/cases/makeup_down_replica.go index 86c9b4cac1d..28de9577cfc 100644 --- a/tools/pd-simulator/simulator/cases/makeup_down_replica.go +++ b/tools/pd-simulator/simulator/cases/makeup_down_replica.go @@ -18,27 +18,31 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newMakeupDownReplicas() *Case { +func 
newMakeupDownReplicas(config *sc.SimConfig) *Case {
 	var simCase Case
-	storeNum, regionNum := getStoreNum(), getRegionNum()
-	noEmptyStoreNum := storeNum - 1
-	for i := 1; i <= storeNum; i++ {
+	totalStore := config.TotalStore
+	totalRegion := config.TotalRegion
+	replica := int(config.ServerConfig.Replication.MaxReplicas)
+
+	noEmptyStoreNum := totalStore - 1
+	for i := 0; i < totalStore; i++ {
 		simCase.Stores = append(simCase.Stores, &Store{
 			ID:     IDAllocator.nextID(),
 			Status: metapb.StoreState_Up,
 		})
 	}
 
-	for i := 0; i < storeNum*regionNum/3; i++ {
-		peers := []*metapb.Peer{
-			{Id: IDAllocator.nextID(), StoreId: uint64((i)%storeNum) + 1},
-			{Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum) + 1},
-			{Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum) + 1},
+	for i := 0; i < totalRegion; i++ {
+		peers := make([]*metapb.Peer, 0, replica)
+		for j := 0; j < replica; j++ {
+			peers = append(peers, &metapb.Peer{
+				Id:      IDAllocator.nextID(),
+				StoreId: uint64((i+j)%totalStore + 1),
+			})
 		}
 		simCase.Regions = append(simCase.Regions, Region{
 			ID:     IDAllocator.nextID(),
@@ -49,7 +53,7 @@ func newMakeupDownReplicas() *Case {
 		})
 	}
 
-	numNodes := storeNum
+	numNodes := totalStore
 	down := false
 	e := &DeleteNodesDescriptor{}
 	e.Step = func(tick int64) uint64 {
@@ -65,31 +69,16 @@ func newMakeupDownReplicas() *Case {
 	simCase.Events = []EventDescriptor{e}
 
 	simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool {
-		sum := 0
-		regionCounts := make([]int, 0, storeNum)
-		for i := 1; i <= storeNum; i++ {
-			regionCount := regions.GetStoreRegionCount(uint64(i))
-			regionCounts = append(regionCounts, regionCount)
-			sum += regionCount
-		}
-		simutil.Logger.Info("current region counts", zap.Ints("region", regionCounts))
-
-		if down && sum < storeNum*regionNum {
-			// only need to print once
-			down = false
-			simutil.Logger.Error("making up replicas don't start immediately")
+		if !down {
 			return false
 		}
-
-		res := true
-		threshold := 0.05
-		for index, regionCount := range regionCounts {
-			if index == 0 { // storeId == 1
-				continue
+		for i := 2; i <= totalStore; i++ { // skip store 1, which the delete event above takes down
+			peerCount := regions.GetStoreRegionCount(uint64(i))
+			if !isUniform(peerCount, replica*totalRegion/noEmptyStoreNum) {
+				return false
 			}
-			res = res && isUniform(regionCount, storeNum*regionNum/noEmptyStoreNum, threshold)
 		}
-		return res
+		return true
 	}
 	return &simCase
 }
diff --git a/tools/pd-simulator/simulator/cases/region_merge.go b/tools/pd-simulator/simulator/cases/region_merge.go
index 3d5d57f804f..953b0e309e1 100644
--- a/tools/pd-simulator/simulator/cases/region_merge.go
+++ b/tools/pd-simulator/simulator/cases/region_merge.go
@@ -15,33 +15,33 @@ package cases
 
 import (
-	"math/rand"
-
 	"github.com/docker/go-units"
 	"github.com/pingcap/kvproto/pkg/metapb"
 	"github.com/tikv/pd/pkg/core"
+	sc "github.com/tikv/pd/tools/pd-simulator/simulator/config"
 	"github.com/tikv/pd/tools/pd-simulator/simulator/info"
-	"github.com/tikv/pd/tools/pd-simulator/simulator/simutil"
-	"go.uber.org/zap"
 )
 
-func newRegionMerge() *Case {
+func newRegionMerge(config *sc.SimConfig) *Case {
 	var simCase Case
-	// Initialize the cluster
-	storeNum, regionNum := getStoreNum(), getRegionNum()
-	for i := 1; i <= storeNum; i++ {
+	totalStore := config.TotalStore
+	totalRegion := config.TotalRegion
+	replica := int(config.ServerConfig.Replication.MaxReplicas)
+
+	for i := 0; i < totalStore; i++ {
 		simCase.Stores = append(simCase.Stores, &Store{
 			ID:     IDAllocator.nextID(),
 			Status: metapb.StoreState_Up,
 		})
 	}
 
-	for i := 0; i < storeNum*regionNum/3; i++ {
-		storeIDs := 
rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -52,18 +52,13 @@ func newRegionMerge() *Case { }) } // Checker description - threshold := 0.05 mergeRatio := 4 // when max-merge-region-size is 20, per region will reach 40MB simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - sum := 0 - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - sum += regionCount + currentPeerCount := 0 + for i := 1; i <= totalStore; i++ { + currentPeerCount += regions.GetStoreRegionCount(uint64(i)) } - simutil.Logger.Info("current counts", zap.Ints("region", regionCounts), zap.Int64("average region size", regions.GetAverageRegionSize())) - return isUniform(sum, storeNum*regionNum/mergeRatio, threshold) + return isUniform(currentPeerCount, totalRegion*replica/mergeRatio) } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/region_split.go b/tools/pd-simulator/simulator/cases/region_split.go index b85cd319494..7b712f4dc48 100644 --- a/tools/pd-simulator/simulator/cases/region_split.go +++ b/tools/pd-simulator/simulator/cases/region_split.go @@ -18,16 +18,15 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRegionSplit() *Case { +func newRegionSplit(config *sc.SimConfig) *Case { var simCase Case - // Initialize the cluster - storeNum := getStoreNum() - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: uint64(i), Status: metapb.StoreState_Up, @@ -57,15 +56,13 @@ func newRegionSplit() *Case { // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - res = res && regionCount > 5 + for i := 1; i <= totalStore; i++ { + peerCount := regions.GetStoreRegionCount(uint64(i)) + if peerCount < 5 { + return false + } } - simutil.Logger.Info("current counts", zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 808c991e97f..50ed57995df 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -30,6 +30,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" "google.golang.org/grpc" @@ -45,7 
+46,7 @@ type Client interface { PutStore(ctx context.Context, store *metapb.Store) error StoreHeartbeat(ctx context.Context, stats *pdpb.StoreStats) error RegionHeartbeat(ctx context.Context, region *core.RegionInfo) error - PutPDConfig(*PDConfig) error + PutPDConfig(*sc.PDConfig) error Close() } @@ -316,7 +317,7 @@ func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { return nil } -func (c *client) PutPDConfig(config *PDConfig) error { +func (c *client) PutPDConfig(config *sc.PDConfig) error { if len(config.PlacementRules) > 0 { path := fmt.Sprintf("%s/%s/config/rules/batch", c.url, httpPrefix) ruleOps := make([]*placement.RuleOp, 0) diff --git a/tools/pd-simulator/simulator/config.go b/tools/pd-simulator/simulator/config/config.go similarity index 85% rename from tools/pd-simulator/simulator/config.go rename to tools/pd-simulator/simulator/config/config.go index 4f197fb83c2..01bf8199ab4 100644 --- a/tools/pd-simulator/simulator/config.go +++ b/tools/pd-simulator/simulator/config/config.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package simulator +package config import ( "fmt" @@ -31,8 +31,11 @@ import ( ) const ( - // tick - defaultSimTickInterval = 100 * time.Millisecond + // simulator + defaultSimTickInterval = 100 * time.Millisecond + defaultTotalStore = 3 + defaultTotalRegion = 1000 + defaultEnableTransferRegionCounter = false // store defaultStoreIOMBPerSecond = 40 defaultStoreHeartbeat = 10 * time.Second @@ -53,9 +56,12 @@ const ( // SimConfig is the simulator configuration. type SimConfig struct { - // tick - CaseName string `toml:"case-name"` - SimTickInterval typeutil.Duration `toml:"sim-tick-interval"` + // Simulator + CaseName string `toml:"case-name"` + TotalStore int `toml:"total-store"` + TotalRegion int `toml:"total-region"` + EnableTransferRegionCounter bool `toml:"enable-transfer-region-counter"` + SimTickInterval typeutil.Duration `toml:"sim-tick-interval"` // store StoreIOMBPerSecond int64 `toml:"store-io-per-second"` StoreVersion string `toml:"store-version"` @@ -99,6 +105,9 @@ func NewSimConfig(serverLogLevel string) *SimConfig { // Adjust is used to adjust configurations func (sc *SimConfig) Adjust(meta *toml.MetaData) error { configutil.AdjustDuration(&sc.SimTickInterval, defaultSimTickInterval) + configutil.AdjustInt(&sc.TotalStore, defaultTotalStore) + configutil.AdjustInt(&sc.TotalRegion, defaultTotalRegion) + configutil.AdjustBool(&sc.EnableTransferRegionCounter, defaultEnableTransferRegionCounter) configutil.AdjustInt64(&sc.StoreIOMBPerSecond, defaultStoreIOMBPerSecond) configutil.AdjustString(&sc.StoreVersion, versioninfo.PDReleaseVersion) configutil.AdjustDuration(&sc.RaftStore.RegionHeartBeatInterval, defaultRegionHeartbeat) @@ -118,7 +127,7 @@ func (sc *SimConfig) Adjust(meta *toml.MetaData) error { return sc.ServerConfig.Adjust(meta, false) } -func (sc *SimConfig) speed() uint64 { +func (sc *SimConfig) Speed() uint64 { return uint64(time.Second / sc.SimTickInterval.Duration) } diff --git a/tools/pd-simulator/simulator/conn.go b/tools/pd-simulator/simulator/conn.go index 588fec246d4..b95b33ee63d 100644 --- a/tools/pd-simulator/simulator/conn.go +++ b/tools/pd-simulator/simulator/conn.go @@ -17,6 +17,7 @@ package simulator import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" ) // Connection records the information of connection among nodes. 
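For reference, the new knobs above map onto the simulator's TOML file (conf/simconfig.toml by default, per main.go). The sketch below is illustrative only: the key names come from the SimConfig struct tags in this patch, while the values are made up (the patch's own defaults are 3 stores, 1000 regions, and a disabled transfer-region counter):

    # Sketch of conf/simconfig.toml using the keys introduced in this patch.
    case-name = "balance-leader"
    sim-tick-interval = "100ms"
    total-store = 6                        # formerly the -storeNum flag
    total-region = 4000                    # formerly the -regionNum flag
    enable-transfer-region-counter = true  # formerly -enableTransferRegionCounter
    store-io-per-second = 40
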
@@ -26,7 +27,7 @@ type Connection struct { } // NewConnection creates nodes according to the configuration and returns the connection among nodes. -func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *SimConfig) (*Connection, error) { +func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *config.SimConfig) (*Connection, error) { conn := &Connection{ pdAddr: pdAddr, Nodes: make(map[uint64]*Node), diff --git a/tools/pd-simulator/simulator/drive.go b/tools/pd-simulator/simulator/drive.go index c7f64324c19..3d2bce74675 100644 --- a/tools/pd-simulator/simulator/drive.go +++ b/tools/pd-simulator/simulator/drive.go @@ -26,6 +26,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.etcd.io/etcd/clientv3" @@ -42,17 +43,17 @@ type Driver struct { eventRunner *EventRunner raftEngine *RaftEngine conn *Connection - simConfig *SimConfig - pdConfig *PDConfig + simConfig *config.SimConfig + pdConfig *config.PDConfig } // NewDriver returns a driver. -func NewDriver(pdAddr string, caseName string, simConfig *SimConfig) (*Driver, error) { - simCase := cases.NewCase(caseName) +func NewDriver(pdAddr string, caseName string, simConfig *config.SimConfig) (*Driver, error) { + simCase := cases.NewCase(caseName, simConfig) if simCase == nil { return nil, errors.Errorf("failed to create case %s", caseName) } - pdConfig := &PDConfig{} + pdConfig := &config.PDConfig{} pdConfig.PlacementRules = simCase.Rules pdConfig.LocationLabels = simCase.Labels return &Driver{ diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index 68a10a8638e..883b5d4474b 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -27,6 +27,7 @@ import ( "github.com/tikv/pd/pkg/ratelimit" "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" @@ -57,7 +58,7 @@ type Node struct { } // NewNode returns a Node. 
-func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { +func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error) { ctx, cancel := context.WithCancel(context.Background()) store := &metapb.Store{ Id: s.ID, @@ -93,7 +94,7 @@ func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { cancel() return nil, err } - ratio := config.speed() + ratio := config.Speed() speed := config.StoreIOMBPerSecond * units.MiB * int64(ratio) return &Node{ Store: store, diff --git a/tools/pd-simulator/simulator/raft.go b/tools/pd-simulator/simulator/raft.go index fccf75781d3..d416f69ff80 100644 --- a/tools/pd-simulator/simulator/raft.go +++ b/tools/pd-simulator/simulator/raft.go @@ -22,6 +22,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) @@ -34,12 +35,12 @@ type RaftEngine struct { regionChange map[uint64][]uint64 regionSplitSize int64 regionSplitKeys int64 - storeConfig *SimConfig + storeConfig *config.SimConfig useTiDBEncodedKey bool } // NewRaftEngine creates the initialized raft with the configuration. -func NewRaftEngine(conf *cases.Case, conn *Connection, storeConfig *SimConfig) *RaftEngine { +func NewRaftEngine(conf *cases.Case, conn *Connection, storeConfig *config.SimConfig) *RaftEngine { r := &RaftEngine{ regionsInfo: core.NewRegionsInfo(), conn: conn, diff --git a/tools/pd-simulator/simulator/simutil/case_config.go b/tools/pd-simulator/simulator/simutil/case_config.go deleted file mode 100644 index a34035c15aa..00000000000 --- a/tools/pd-simulator/simulator/simutil/case_config.go +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2019 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package simutil - -// CaseConfig is to save flags -type CaseConfig struct { - StoreNum int - RegionNum int - EnableTransferRegionCounter bool -} - -// CaseConfigure is an global instance for CaseConfig -var CaseConfigure *CaseConfig - -// InitCaseConfig is to init caseConfigure -func InitCaseConfig(storeNum, regionNum int, enableTransferRegionCounter bool) { - CaseConfigure = &CaseConfig{ - StoreNum: storeNum, - RegionNum: regionNum, - EnableTransferRegionCounter: enableTransferRegionCounter, - } -} diff --git a/tools/pd-simulator/simulator/task.go b/tools/pd-simulator/simulator/task.go index a19854b53ba..c0bfa1e691b 100644 --- a/tools/pd-simulator/simulator/task.go +++ b/tools/pd-simulator/simulator/task.go @@ -415,7 +415,7 @@ func (a *addPeer) tick(engine *RaftEngine, region *core.RegionInfo) (newRegion * pendingPeers := append(region.GetPendingPeers(), a.peer) return region.Clone(core.WithAddPeer(a.peer), core.WithIncConfVer(), core.WithPendingPeers(pendingPeers)), false } - speed := engine.storeConfig.speed() + speed := engine.storeConfig.Speed() // Step 2: Process Snapshot if !processSnapshot(sendNode, a.sendingStat, speed) { return nil, false From 71490f72b4c57a70f4f5b4e3486018859f85189c Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 30 May 2024 16:46:21 +0800 Subject: [PATCH 02/17] pkg/member: Fixing residual counts in campaign times (#8226) close tikv/pd#8225 Signed-off-by: husharp --- pkg/election/leadership.go | 15 +++++++------- pkg/election/leadership_test.go | 33 ++++++++++++++++++++++++++++++ pkg/member/member.go | 3 ++- tests/server/member/member_test.go | 10 +++++++-- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/pkg/election/leadership.go b/pkg/election/leadership.go index 02f519dbc75..3ee413818a5 100644 --- a/pkg/election/leadership.go +++ b/pkg/election/leadership.go @@ -34,11 +34,12 @@ import ( ) const ( - defaultCampaignTimesSlot = 10 - watchLoopUnhealthyTimeout = 60 * time.Second - campaignTimesRecordTimeout = 5 * time.Minute + defaultCampaignTimesSlot = 10 + watchLoopUnhealthyTimeout = 60 * time.Second ) +var campaignTimesRecordTimeout = 5 * time.Minute + // GetLeader gets the corresponding leader from etcd by given leaderPath (as the key). func GetLeader(c *clientv3.Client, leaderPath string) (*pdpb.Member, int64, error) { leader := &pdpb.Member{} @@ -114,6 +115,7 @@ func (ls *Leadership) GetLeaderKey() string { } // GetCampaignTimesNum is used to get the campaign times of the leader within `campaignTimesRecordTimeout`. +// Need to make sure `AddCampaignTimes` is called before this function. func (ls *Leadership) GetCampaignTimesNum() int { if ls == nil { return 0 @@ -129,8 +131,8 @@ func (ls *Leadership) ResetCampaignTimes() { ls.campaignTimes = make([]time.Time, 0, defaultCampaignTimesSlot) } -// addCampaignTimes is used to add the campaign times of the leader. -func (ls *Leadership) addCampaignTimes() { +// AddCampaignTimes is used to add the campaign times of the leader. 
+func (ls *Leadership) AddCampaignTimes() { if ls == nil { return } @@ -138,7 +140,7 @@ func (ls *Leadership) addCampaignTimes() { if time.Since(ls.campaignTimes[i]) > campaignTimesRecordTimeout { // remove the time which is more than `campaignTimesRecordTimeout` // array is sorted by time - ls.campaignTimes = ls.campaignTimes[i:] + ls.campaignTimes = ls.campaignTimes[i+1:] break } } @@ -148,7 +150,6 @@ func (ls *Leadership) addCampaignTimes() { // Campaign is used to campaign the leader with given lease and returns a leadership func (ls *Leadership) Campaign(leaseTimeout int64, leaderData string, cmps ...clientv3.Cmp) error { - ls.addCampaignTimes() ls.leaderValue = leaderData // Create a new lease to campaign newLease := &lease{ diff --git a/pkg/election/leadership_test.go b/pkg/election/leadership_test.go index 1fde4ddeba7..40f0bcbee23 100644 --- a/pkg/election/leadership_test.go +++ b/pkg/election/leadership_test.go @@ -262,3 +262,36 @@ func TestRequestProgress(t *testing.T) { checkWatcherRequestProgress(false) checkWatcherRequestProgress(true) } + +func TestCampaignTimes(t *testing.T) { + re := require.New(t) + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() + leadership := NewLeadership(client, "test_leader", "test_leader") + + // all the campaign times are within the timeout. + campaignTimesRecordTimeout = 10 * time.Second + defer func() { + campaignTimesRecordTimeout = 5 * time.Minute + }() + for i := 0; i < 3; i++ { + leadership.AddCampaignTimes() + time.Sleep(100 * time.Millisecond) + } + re.Equal(3, leadership.GetCampaignTimesNum()) + + // only the last 2 records are valid. + campaignTimesRecordTimeout = 200 * time.Millisecond + for i := 0; i < 3; i++ { + leadership.AddCampaignTimes() + time.Sleep(100 * time.Millisecond) + } + re.Equal(2, leadership.GetCampaignTimesNum()) + + time.Sleep(200 * time.Millisecond) + // need to wait for the next addCampaignTimes to update the campaign time. + re.Equal(2, leadership.GetCampaignTimesNum()) + // check campaign leader frequency. + leadership.AddCampaignTimes() + re.Equal(1, leadership.GetCampaignTimesNum()) +} diff --git a/pkg/member/member.go b/pkg/member/member.go index af504d83963..bbf46d8f167 100644 --- a/pkg/member/member.go +++ b/pkg/member/member.go @@ -182,11 +182,12 @@ func (m *EmbeddedEtcdMember) GetLastLeaderUpdatedTime() time.Time { // and make it become a PD leader. // leader should be changed when campaign leader frequently. func (m *EmbeddedEtcdMember) CampaignLeader(ctx context.Context, leaseTimeout int64) error { + m.leadership.AddCampaignTimes() failpoint.Inject("skipCampaignLeaderCheck", func() { failpoint.Return(m.leadership.Campaign(leaseTimeout, m.MemberValue())) }) - if m.leadership.GetCampaignTimesNum() >= campaignLeaderFrequencyTimes { + if m.leadership.GetCampaignTimesNum() > campaignLeaderFrequencyTimes { if err := m.ResignEtcdLeader(ctx, m.Name(), ""); err != nil { return err } diff --git a/tests/server/member/member_test.go b/tests/server/member/member_test.go index c581eb39390..edff14a3b98 100644 --- a/tests/server/member/member_test.go +++ b/tests/server/member/member_test.go @@ -328,20 +328,26 @@ func TestCampaignLeaderFrequently(t *testing.T) { re := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - cluster, err := tests.NewTestCluster(ctx, 5) + cluster, err := tests.NewTestCluster(ctx, 3) defer cluster.Destroy() re.NoError(err) err = cluster.RunInitialServers() re.NoError(err) + // the 1st time campaign leader. 
cluster.WaitLeader() leader := cluster.GetLeader() re.NotEmpty(cluster.GetLeader()) - for i := 0; i < 3; i++ { + // need to prevent 3 times(including the above 1st time) campaign leader in 5 min. + for i := 0; i < 2; i++ { cluster.GetLeaderServer().ResetPDLeader() cluster.WaitLeader() + re.Equal(leader, cluster.GetLeader()) } + // check for the 4th time. + cluster.GetLeaderServer().ResetPDLeader() + cluster.WaitLeader() // PD leader should be different from before because etcd leader changed. re.NotEmpty(cluster.GetLeader()) re.NotEqual(leader, cluster.GetLeader()) From 632cda452a5284d272330d02278ed4882355a7aa Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 30 May 2024 17:25:22 +0800 Subject: [PATCH 03/17] api/middleware: avoid redirecting when the leader remains unchanged (#8228) ref tikv/pd#7300 Avoid redirecting when the leader remains unchanged. Signed-off-by: JmPotato Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/utils/apiutil/serverapi/middleware.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go index 18dd2f52155..1cd3d5b53d6 100755 --- a/pkg/utils/apiutil/serverapi/middleware.go +++ b/pkg/utils/apiutil/serverapi/middleware.go @@ -208,10 +208,16 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http w.Header().Add(apiutil.XForwardedToMicroServiceHeader, "true") } else if name := r.Header.Get(apiutil.PDRedirectorHeader); len(name) == 0 { leader := h.waitForLeader(r) + // The leader has not been elected yet. if leader == nil { http.Error(w, "no leader", http.StatusServiceUnavailable) return } + // If the leader is the current server now, we can handle the request directly. + if h.s.GetMember().IsLeader() || leader.GetName() == h.s.Name() { + next(w, r) + return + } clientUrls = leader.GetClientUrls() r.Header.Set(apiutil.PDRedirectorHeader, h.s.Name()) } else { From 19c9852decda4cb49a2319b453c4f01c6a26014f Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Fri, 31 May 2024 12:28:22 +0800 Subject: [PATCH 04/17] tools: support triggering an event through HTTP API (#5677) close tikv/pd#5451, ref tikv/pd#5468 Signed-off-by: Ryan Leung --- tools/pd-simulator/main.go | 30 +---- .../pd-simulator/simulator/cases/add_nodes.go | 71 ------------ .../simulator/cases/add_nodes_dynamic.go | 92 --------------- .../simulator/cases/balance_leader.go | 9 +- .../simulator/cases/balance_region.go | 7 +- tools/pd-simulator/simulator/cases/cases.go | 16 --- .../simulator/cases/delete_nodes.go | 94 ---------------- .../pd-simulator/simulator/cases/hot_read.go | 7 +- .../pd-simulator/simulator/cases/hot_write.go | 7 +- .../simulator/cases/makeup_down_replica.go | 7 +- .../simulator/cases/region_merge.go | 7 +- tools/pd-simulator/simulator/conn.go | 10 ++ tools/pd-simulator/simulator/drive.go | 59 +++++++--- tools/pd-simulator/simulator/event.go | 106 ++++++++++++------ tools/pd-simulator/simulator/simutil/id.go | 39 +++++++ 15 files changed, 190 insertions(+), 371 deletions(-) delete mode 100644 tools/pd-simulator/simulator/cases/add_nodes.go delete mode 100644 tools/pd-simulator/simulator/cases/add_nodes_dynamic.go delete mode 100644 tools/pd-simulator/simulator/cases/delete_nodes.go create mode 100644 tools/pd-simulator/simulator/simutil/id.go diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 04de914f5f0..45b3ecd75c9 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -17,8 +17,6 @@ package 
main import ( "context" "fmt" - "net/http" - "net/http/pprof" "os" "os/signal" "syscall" @@ -26,7 +24,6 @@ import ( "github.com/BurntSushi/toml" "github.com/pingcap/log" - "github.com/prometheus/client_golang/prometheus/promhttp" flag "github.com/spf13/pflag" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics" @@ -95,8 +92,7 @@ func main() { func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { - go runHTTPServer() - simStart(*pdAddr, simCase, simConfig) + simStart(*pdAddr, *statusAddress, simCase, simConfig) } else { local, clean := NewSingleServer(context.Background(), simConfig) err := local.Run() @@ -109,28 +105,10 @@ func run(simCase string, simConfig *sc.SimConfig) { } time.Sleep(100 * time.Millisecond) } - simStart(local.GetAddr(), simCase, simConfig, clean) + simStart(local.GetAddr(), "", simCase, simConfig, clean) } } -func runHTTPServer() { - http.Handle("/metrics", promhttp.Handler()) - // profile API - http.HandleFunc("/pprof/profile", pprof.Profile) - http.HandleFunc("/pprof/trace", pprof.Trace) - http.HandleFunc("/pprof/symbol", pprof.Symbol) - http.Handle("/pprof/heap", pprof.Handler("heap")) - http.Handle("/pprof/mutex", pprof.Handler("mutex")) - http.Handle("/pprof/allocs", pprof.Handler("allocs")) - http.Handle("/pprof/block", pprof.Handler("block")) - http.Handle("/pprof/goroutine", pprof.Handler("goroutine")) - server := &http.Server{ - Addr: *statusAddress, - ReadHeaderTimeout: 3 * time.Second, - } - server.ListenAndServe() -} - // NewSingleServer creates a pd server for simulator. func NewSingleServer(ctx context.Context, simConfig *sc.SimConfig) (*server.Server, testutil.CleanupFunc) { err := logutil.SetupLogger(simConfig.ServerConfig.Log, &simConfig.ServerConfig.Logger, &simConfig.ServerConfig.LogProps) @@ -157,9 +135,9 @@ func cleanServer(cfg *config.Config) { os.RemoveAll(cfg.DataDir) } -func simStart(pdAddr string, simCase string, simConfig *sc.SimConfig, clean ...testutil.CleanupFunc) { +func simStart(pdAddr, statusAddress string, simCase string, simConfig *sc.SimConfig, clean ...testutil.CleanupFunc) { start := time.Now() - driver, err := simulator.NewDriver(pdAddr, simCase, simConfig) + driver, err := simulator.NewDriver(pdAddr, statusAddress, simCase, simConfig) if err != nil { simutil.Logger.Fatal("create driver error", zap.Error(err)) } diff --git a/tools/pd-simulator/simulator/cases/add_nodes.go b/tools/pd-simulator/simulator/cases/add_nodes.go deleted file mode 100644 index 5c73fe9764c..00000000000 --- a/tools/pd-simulator/simulator/cases/add_nodes.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2017 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package cases - -import ( - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" -) - -func newAddNodes(config *sc.SimConfig) *Case { - var simCase Case - - totalStore := config.TotalStore - totalRegion := config.TotalRegion - replica := int(config.ServerConfig.Replication.MaxReplicas) - noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - - for i := 0; i < totalStore; i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - for i := 0; i < totalRegion; i++ { - peers := make([]*metapb.Peer, 0, replica) - for j := 0; j < replica; j++ { - peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), - StoreId: uint64((i+j)%noEmptyStoreNum + 1), - }) - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - for i := 1; i <= totalStore; i++ { - leaderCount := regions.GetStoreLeaderCount(uint64(i)) - peerCount := regions.GetStoreRegionCount(uint64(i)) - if !isUniform(leaderCount, totalRegion/totalStore) { - return false - } - if !isUniform(peerCount, totalRegion*replica/totalStore) { - return false - } - } - return true - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go deleted file mode 100644 index aa585b48923..00000000000 --- a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2018 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package cases - -import ( - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" -) - -func newAddNodesDynamic(config *sc.SimConfig) *Case { - var simCase Case - - totalStore := config.TotalStore - totalRegion := config.TotalRegion - replica := int(config.ServerConfig.Replication.MaxReplicas) - noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - - for i := 0; i < noEmptyStoreNum; i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - var ids []uint64 - for i := 0; i < totalStore-noEmptyStoreNum; i++ { - ids = append(ids, IDAllocator.nextID()) - } - - for i := 0; i < totalRegion; i++ { - peers := make([]*metapb.Peer, 0, replica) - for j := 0; j < replica; j++ { - peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), - StoreId: uint64((i+j)%noEmptyStoreNum + 1), - }) - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - currentStoreCount := noEmptyStoreNum - e := &AddNodesDescriptor{} - e.Step = func(tick int64) uint64 { - if tick%100 == 0 && currentStoreCount < totalStore { - currentStoreCount++ - nodeID := ids[0] - ids = append(ids[:0], ids[1:]...) - return nodeID - } - return 0 - } - simCase.Events = []EventDescriptor{e} - - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - if currentStoreCount != totalStore { - return false - } - for i := 1; i <= currentStoreCount; i++ { - leaderCount := regions.GetStoreLeaderCount(uint64(i)) - peerCount := regions.GetStoreRegionCount(uint64(i)) - if !isUniform(leaderCount, totalRegion/totalStore) { - return false - } - if !isUniform(peerCount, totalRegion*replica/totalStore) { - return false - } - } - return true - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/balance_leader.go b/tools/pd-simulator/simulator/cases/balance_leader.go index c5315f85d8e..fd9028bc91a 100644 --- a/tools/pd-simulator/simulator/cases/balance_leader.go +++ b/tools/pd-simulator/simulator/cases/balance_leader.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newBalanceLeader(config *sc.SimConfig) *Case { @@ -30,7 +31,7 @@ func newBalanceLeader(config *sc.SimConfig) *Case { replica := int(config.ServerConfig.Replication.MaxReplicas) for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -39,17 +40,17 @@ func newBalanceLeader(config *sc.SimConfig) *Case { for i := 0; i < totalRegion; i++ { peers := make([]*metapb.Peer, 0, replica) peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: leaderStoreID, }) for j := 1; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%(totalStore-1) + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, diff --git 
a/tools/pd-simulator/simulator/cases/balance_region.go b/tools/pd-simulator/simulator/cases/balance_region.go index a559a335c97..82a7ac2d704 100644 --- a/tools/pd-simulator/simulator/cases/balance_region.go +++ b/tools/pd-simulator/simulator/cases/balance_region.go @@ -21,6 +21,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newRedundantBalanceRegion(config *sc.SimConfig) *Case { @@ -32,7 +33,7 @@ func newRedundantBalanceRegion(config *sc.SimConfig) *Case { for i := 0; i < totalStore; i++ { s := &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, } if i%2 == 1 { @@ -45,12 +46,12 @@ func newRedundantBalanceRegion(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], }) diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index f2e79a81924..00b5404669f 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -15,8 +15,6 @@ package cases import ( - "math/rand" - "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" @@ -91,9 +89,6 @@ var IDAllocator idAllocator var CaseMap = map[string]func(*config.SimConfig) *Case{ "balance-leader": newBalanceLeader, "redundant-balance-region": newRedundantBalanceRegion, - "add-nodes": newAddNodes, - "add-nodes-dynamic": newAddNodesDynamic, - "delete-nodes": newDeleteNodes, "region-split": newRegionSplit, "region-merge": newRegionMerge, "hot-read": newHotRead, @@ -121,14 +116,3 @@ func isUniform(count, meanCount int) bool { minCount := int((1.0 - threshold) * float64(meanCount)) return minCount <= count && count <= maxCount } - -func getNoEmptyStoreNum(storeNum int, replica int) int { - noEmptyStoreNum := rand.Intn(storeNum) - if noEmptyStoreNum < replica { - return replica - } - if noEmptyStoreNum == storeNum { - return storeNum - 1 - } - return noEmptyStoreNum -} diff --git a/tools/pd-simulator/simulator/cases/delete_nodes.go b/tools/pd-simulator/simulator/cases/delete_nodes.go deleted file mode 100644 index 80650cf109d..00000000000 --- a/tools/pd-simulator/simulator/cases/delete_nodes.go +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2018 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package cases - -import ( - "math/rand" - - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" -) - -func newDeleteNodes(config *sc.SimConfig) *Case { - var simCase Case - - totalStore := config.TotalStore - totalRegion := config.TotalRegion - replica := int(config.ServerConfig.Replication.MaxReplicas) - noEmptyStoreNum := totalStore - 1 - for i := 1; i <= totalStore; i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - for i := 0; i < totalRegion; i++ { - peers := make([]*metapb.Peer, 0, replica) - for j := 0; j < replica; j++ { - peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), - StoreId: uint64((i+j)%totalStore + 1), - }) - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - ids := make([]uint64, 0, len(simCase.Stores)) - for _, store := range simCase.Stores { - ids = append(ids, store.ID) - } - - currentStoreCount := totalStore - e := &DeleteNodesDescriptor{} - e.Step = func(tick int64) uint64 { - if currentStoreCount > noEmptyStoreNum && tick%100 == 0 { - idx := rand.Intn(currentStoreCount) - currentStoreCount-- - nodeID := ids[idx] - ids = append(ids[:idx], ids[idx+1:]...) - return nodeID - } - return 0 - } - simCase.Events = []EventDescriptor{e} - - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - if currentStoreCount != noEmptyStoreNum { - return false - } - for _, i := range ids { - leaderCount := regions.GetStoreLeaderCount(i) - peerCount := regions.GetStoreRegionCount(i) - if !isUniform(leaderCount, totalRegion/noEmptyStoreNum) { - return false - } - if !isUniform(peerCount, totalRegion*replica/noEmptyStoreNum) { - return false - } - } - return true - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/hot_read.go b/tools/pd-simulator/simulator/cases/hot_read.go index 50ad08d6011..d154886b0a4 100644 --- a/tools/pd-simulator/simulator/cases/hot_read.go +++ b/tools/pd-simulator/simulator/cases/hot_read.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newHotRead(config *sc.SimConfig) *Case { @@ -31,7 +32,7 @@ func newHotRead(config *sc.SimConfig) *Case { // Initialize the cluster for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -40,12 +41,12 @@ func newHotRead(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, diff --git a/tools/pd-simulator/simulator/cases/hot_write.go b/tools/pd-simulator/simulator/cases/hot_write.go index a30afd1a8ec..e73ca6f3ce3 100644 --- a/tools/pd-simulator/simulator/cases/hot_write.go +++ b/tools/pd-simulator/simulator/cases/hot_write.go @@ -20,6 +20,7 
@@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newHotWrite(config *sc.SimConfig) *Case { @@ -31,7 +32,7 @@ func newHotWrite(config *sc.SimConfig) *Case { // Initialize the cluster for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -40,12 +41,12 @@ func newHotWrite(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, diff --git a/tools/pd-simulator/simulator/cases/makeup_down_replica.go b/tools/pd-simulator/simulator/cases/makeup_down_replica.go index 28de9577cfc..a5ee63e71a0 100644 --- a/tools/pd-simulator/simulator/cases/makeup_down_replica.go +++ b/tools/pd-simulator/simulator/cases/makeup_down_replica.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newMakeupDownReplicas(config *sc.SimConfig) *Case { @@ -31,7 +32,7 @@ func newMakeupDownReplicas(config *sc.SimConfig) *Case { noEmptyStoreNum := totalStore - 1 for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -40,12 +41,12 @@ func newMakeupDownReplicas(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, diff --git a/tools/pd-simulator/simulator/cases/region_merge.go b/tools/pd-simulator/simulator/cases/region_merge.go index 953b0e309e1..8097565d1a7 100644 --- a/tools/pd-simulator/simulator/cases/region_merge.go +++ b/tools/pd-simulator/simulator/cases/region_merge.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newRegionMerge(config *sc.SimConfig) *Case { @@ -30,7 +31,7 @@ func newRegionMerge(config *sc.SimConfig) *Case { for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -39,12 +40,12 @@ func newRegionMerge(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: 
peers[0], Size: 10 * units.MiB, diff --git a/tools/pd-simulator/simulator/conn.go b/tools/pd-simulator/simulator/conn.go index b95b33ee63d..4be8a2b76dc 100644 --- a/tools/pd-simulator/simulator/conn.go +++ b/tools/pd-simulator/simulator/conn.go @@ -52,3 +52,13 @@ func (c *Connection) nodeHealth(storeID uint64) bool { return n.GetNodeState() == metapb.NodeState_Preparing || n.GetNodeState() == metapb.NodeState_Serving } + +func (c *Connection) getNodes() []*Node { + var nodes []*Node + for _, n := range c.Nodes { + if n.GetNodeState() != metapb.NodeState_Removed { + nodes = append(nodes, n) + } + } + return nodes +} diff --git a/tools/pd-simulator/simulator/drive.go b/tools/pd-simulator/simulator/drive.go index 3d2bce74675..700dd58f87a 100644 --- a/tools/pd-simulator/simulator/drive.go +++ b/tools/pd-simulator/simulator/drive.go @@ -16,6 +16,8 @@ package simulator import ( "context" + "net/http" + "net/http/pprof" "path" "strconv" "sync" @@ -23,6 +25,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" @@ -35,20 +38,21 @@ import ( // Driver promotes the cluster status change. type Driver struct { - wg sync.WaitGroup - pdAddr string - simCase *cases.Case - client Client - tickCount int64 - eventRunner *EventRunner - raftEngine *RaftEngine - conn *Connection - simConfig *config.SimConfig - pdConfig *config.PDConfig + wg sync.WaitGroup + pdAddr string + statusAddress string + simCase *cases.Case + client Client + tickCount int64 + eventRunner *EventRunner + raftEngine *RaftEngine + conn *Connection + simConfig *config.SimConfig + pdConfig *config.PDConfig } // NewDriver returns a driver. -func NewDriver(pdAddr string, caseName string, simConfig *config.SimConfig) (*Driver, error) { +func NewDriver(pdAddr, statusAddress, caseName string, simConfig *config.SimConfig) (*Driver, error) { simCase := cases.NewCase(caseName, simConfig) if simCase == nil { return nil, errors.Errorf("failed to create case %s", caseName) @@ -57,10 +61,11 @@ func NewDriver(pdAddr string, caseName string, simConfig *config.SimConfig) (*Dr pdConfig.PlacementRules = simCase.Rules pdConfig.LocationLabels = simCase.Labels return &Driver{ - pdAddr: pdAddr, - simCase: simCase, - simConfig: simConfig, - pdConfig: pdConfig, + pdAddr: pdAddr, + statusAddress: statusAddress, + simCase: simCase, + simConfig: simConfig, + pdConfig: pdConfig, }, nil } @@ -77,6 +82,9 @@ func (d *Driver) Prepare() error { d.updateNodeAvailable() + if d.statusAddress != "" { + go d.runHTTPServer() + } // Bootstrap. store, region, err := d.GetBootstrapInfo(d.raftEngine) if err != nil { @@ -95,7 +103,7 @@ func (d *Driver) Prepare() error { // Setup alloc id. // TODO: This is a hack way. Once we have reset alloc ID API, we need to replace it. 
-	maxID := cases.IDAllocator.GetID()
+	maxID := simutil.IDAllocator.GetID()
 	requestTimeout := 10 * time.Second
 	etcdTimeout := 3 * time.Second
 	etcdClient, err := clientv3.New(clientv3.Config{
@@ -123,7 +131,7 @@ func (d *Driver) Prepare() error {
 			return errors.WithStack(err)
 		}
 		if id > maxID {
-			cases.IDAllocator.ResetID()
+			simutil.IDAllocator.ResetID()
 			break
 		}
 	}
@@ -226,3 +234,20 @@ func (d *Driver) updateNodeAvailable() {
 		}
 	}
 }
+
+func (d *Driver) runHTTPServer() {
+	http.Handle("/metrics", promhttp.Handler())
+	// profile API
+	http.HandleFunc("/pprof/profile", pprof.Profile)
+	http.HandleFunc("/pprof/trace", pprof.Trace)
+	http.HandleFunc("/pprof/symbol", pprof.Symbol)
+	http.Handle("/pprof/heap", pprof.Handler("heap"))
+	http.Handle("/pprof/mutex", pprof.Handler("mutex"))
+	http.Handle("/pprof/allocs", pprof.Handler("allocs"))
+	http.Handle("/pprof/block", pprof.Handler("block"))
+	http.Handle("/pprof/goroutine", pprof.Handler("goroutine"))
+	eventHandler := newEventHandler(d.eventRunner)
+	http.HandleFunc("/event", eventHandler.createEvent)
+	// nolint
+	http.ListenAndServe(d.statusAddress, nil)
+}
diff --git a/tools/pd-simulator/simulator/event.go b/tools/pd-simulator/simulator/event.go
index 04ad10a0db8..8be8f89d759 100644
--- a/tools/pd-simulator/simulator/event.go
+++ b/tools/pd-simulator/simulator/event.go
@@ -15,6 +15,12 @@ package simulator
 
 import (
+	"context"
+	"fmt"
+	"math/rand"
+	"net/http"
+	"sync"
+
 	"github.com/pingcap/kvproto/pkg/metapb"
 	"github.com/pingcap/kvproto/pkg/pdpb"
 	"github.com/tikv/pd/pkg/core"
@@ -30,6 +36,7 @@ type Event interface {
 
 // EventRunner includes all events.
 type EventRunner struct {
+	sync.RWMutex
 	events     []Event
 	raftEngine *RaftEngine
 }
@@ -46,6 +53,33 @@ func NewEventRunner(events []cases.EventDescriptor, raftEngine *RaftEngine) *Eve
 	return er
 }
 
+type eventHandler struct {
+	er *EventRunner
+}
+
+func newEventHandler(er *EventRunner) *eventHandler {
+	return &eventHandler{
+		er: er,
+	}
+}
+
+func (e *eventHandler) createEvent(w http.ResponseWriter, r *http.Request) {
+	event := r.URL.Query().Get("event")
+	if len(event) < 1 {
+		fmt.Fprintf(w, "no event given")
+		return
+	}
+	switch event {
+	case "add-node":
+		e.er.addEvent(&AddNode{})
+		return
+	case "down-node":
+		e.er.addEvent(&DownNode{})
+		return
+	default:
+	}
+}
+
 func parserEvent(e cases.EventDescriptor) Event {
 	switch t := e.(type) {
 	case *cases.WriteFlowOnSpotDescriptor:
@@ -54,16 +88,20 @@ func parserEvent(e cases.EventDescriptor) Event {
 		return &WriteFlowOnRegion{descriptor: t}
 	case *cases.ReadFlowOnRegionDescriptor:
 		return &ReadFlowOnRegion{descriptor: t}
-	case *cases.AddNodesDescriptor:
-		return &AddNodes{descriptor: t}
-	case *cases.DeleteNodesDescriptor:
-		return &DeleteNodes{descriptor: t}
 	}
 	return nil
 }
 
+func (er *EventRunner) addEvent(e Event) {
+	er.Lock()
+	defer er.Unlock()
+	er.events = append(er.events, e)
+}
+
 // Tick ticks the event run
 func (er *EventRunner) Tick(tickCount int64) {
+	er.Lock()
+	defer er.Unlock()
 	var finishedIndex int
 	for i, e := range er.events {
 		isFinished := e.Run(er.raftEngine, tickCount)
@@ -126,24 +164,18 @@ func (e *ReadFlowOnRegion) Run(raft *RaftEngine, tickCount int64) bool {
 	return false
 }
 
-// AddNodes adds nodes.
-type AddNodes struct {
-	descriptor *cases.AddNodesDescriptor
-}
+// AddNode adds a node.
+type AddNode struct{}
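+
+// For illustration only (not part of this patch): once the simulator is
+// started with a status address, these events can be triggered over HTTP.
+// A hypothetical invocation, assuming the status address is 127.0.0.1:20180:
+//
+//	curl "http://127.0.0.1:20180/event?event=add-node"
+//	curl "http://127.0.0.1:20180/event?event=down-node"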
 
 // Run implements the event interface.
-func (e *AddNodes) Run(raft *RaftEngine, tickCount int64) bool {
-	id := e.descriptor.Step(tickCount)
-	if id == 0 {
-		return false
-	}
-
-	if _, ok := raft.conn.Nodes[id]; ok {
-		simutil.Logger.Info("node has already existed", zap.Uint64("node-id", id))
+func (*AddNode) Run(raft *RaftEngine, _ int64) bool {
+	config := raft.storeConfig
+	nodes := raft.conn.getNodes()
+	id, err := nodes[0].client.AllocID(context.TODO())
+	if err != nil {
+		simutil.Logger.Error("alloc node id failed", zap.Error(err))
 		return false
 	}
-
-	config := raft.storeConfig
 	s := &cases.Store{
 		ID:     id,
 		Status: metapb.StoreState_Up,
@@ -152,49 +184,51 @@ func (e *AddNodes) Run(raft *RaftEngine, tickCount int64) bool {
 	}
 	n, err := NewNode(s, raft.conn.pdAddr, config)
 	if err != nil {
-		simutil.Logger.Error("add node failed", zap.Uint64("node-id", id), zap.Error(err))
+		simutil.Logger.Error("create node failed", zap.Error(err))
 		return false
 	}
-	raft.conn.Nodes[id] = n
+
+	raft.conn.Nodes[s.ID] = n
 	n.raftEngine = raft
 	err = n.Start()
 	if err != nil {
-		simutil.Logger.Error("start node failed", zap.Uint64("node-id", id), zap.Error(err))
+		delete(raft.conn.Nodes, s.ID)
+		simutil.Logger.Error("start node failed", zap.Uint64("node-id", s.ID), zap.Error(err))
+		return false
 	}
-	return false
+	return true
 }
 
-// DeleteNodes deletes nodes.
-type DeleteNodes struct {
-	descriptor *cases.DeleteNodesDescriptor
-}
+// DownNode stops a random node.
+type DownNode struct{}
 
 // Run implements the event interface.
-func (e *DeleteNodes) Run(raft *RaftEngine, tickCount int64) bool {
-	id := e.descriptor.Step(tickCount)
-	if id == 0 {
+func (*DownNode) Run(raft *RaftEngine, _ int64) bool {
+	nodes := raft.conn.getNodes()
+	if len(nodes) == 0 {
+		simutil.Logger.Error("cannot find any node")
 		return false
 	}
-
-	node := raft.conn.Nodes[id]
+	i := rand.Intn(len(nodes))
+	node := nodes[i]
 	if node == nil {
-		simutil.Logger.Error("node is not existed", zap.Uint64("node-id", id))
+		simutil.Logger.Error("node does not exist", zap.Uint64("node-id", node.Id))
 		return false
 	}
-	delete(raft.conn.Nodes, id)
+	delete(raft.conn.Nodes, node.Id)
 	node.Stop()
 
 	regions := raft.GetRegions()
 	for _, region := range regions {
 		storeIDs := region.GetStoreIDs()
-		if _, ok := storeIDs[id]; ok {
+		if _, ok := storeIDs[node.Id]; ok {
 			downPeer := &pdpb.PeerStats{
-				Peer:        region.GetStorePeer(id),
+				Peer:        region.GetStorePeer(node.Id),
 				DownSeconds: 24 * 60 * 60,
 			}
 			region = region.Clone(core.WithDownPeers(append(region.GetDownPeers(), downPeer)))
 			raft.SetRegion(region)
 		}
 	}
-	return false
+	return true
 }
diff --git a/tools/pd-simulator/simulator/simutil/id.go b/tools/pd-simulator/simulator/simutil/id.go
new file mode 100644
index 00000000000..8badddff3f1
--- /dev/null
+++ b/tools/pd-simulator/simulator/simutil/id.go
@@ -0,0 +1,39 @@
+// Copyright 2024 TiKV Project Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package simutil
+
+// idAllocator is used to allocate unique IDs.
+type idAllocator struct {
+	id uint64
+}
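+
+// For illustration only (not part of this patch): the allocator hands out
+// strictly increasing IDs, so a hypothetical caller would see
+//
+//	simutil.IDAllocator.NextID() // 1
+//	simutil.IDAllocator.NextID() // 2
+//
+// until ResetID sets the counter back to 0.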
+
+// NextID gets the next unique ID.
+func (a *idAllocator) NextID() uint64 {
+	a.id++
+	return a.id
+}
+
+// ResetID resets the IDAllocator.
+func (a *idAllocator) ResetID() {
+	a.id = 0
+}
+
+// GetID gets the current ID.
+func (a *idAllocator) GetID() uint64 {
+	return a.id
+}
+
+// IDAllocator is used to allocate unique IDs.
+var IDAllocator idAllocator

From 199b01792159e5d8e83ef419a5053401e998bb0e Mon Sep 17 00:00:00 2001
From: JmPotato
Date: Fri, 31 May 2024 16:29:52 +0800
Subject: [PATCH 05/17] client/retry: only return the latest error in backoffer (#8227)

ref tikv/pd#8142

Returning historical errors accumulated across retries caused the client's
retry logic to fail, and since we currently do not need to obtain all errors
during retries, this PR removes `multierr` from the backoffer and adds tests
to ensure the correctness of the retry logic.

Signed-off-by: JmPotato
Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com>
---
 client/go.mod                                 |  2 +-
 client/http/client.go                         |  5 +-
 client/retry/backoff.go                       | 45 ++++++--------
 client/retry/backoff_test.go                  | 62 +++++++++++++++----
 tests/integrations/client/http_client_test.go | 50 +++++++++++++--
 5 files changed, 119 insertions(+), 45 deletions(-)

diff --git a/client/go.mod b/client/go.mod
index 89799796521..6baa2f112f4 100644
--- a/client/go.mod
+++ b/client/go.mod
@@ -16,7 +16,6 @@ require (
 	github.com/stretchr/testify v1.8.2
 	go.uber.org/atomic v1.10.0
 	go.uber.org/goleak v1.1.11
-	go.uber.org/multierr v1.11.0
 	go.uber.org/zap v1.24.0
 	golang.org/x/exp v0.0.0-20230711005742-c3f37128e5a4
 	google.golang.org/grpc v1.62.1
@@ -34,6 +33,7 @@ require (
 	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/common v0.46.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
+	go.uber.org/multierr v1.11.0 // indirect
 	golang.org/x/net v0.23.0 // indirect
 	golang.org/x/sys v0.18.0 // indirect
 	golang.org/x/text v0.14.0 // indirect
diff --git a/client/http/client.go b/client/http/client.go
index 30144ebe2c5..7b34193c2a4 100644
--- a/client/http/client.go
+++ b/client/http/client.go
@@ -153,10 +153,11 @@ func (ci *clientInner) requestWithRetry(
 	}
 	// Copy a new backoffer for each request.
 	bo := *reqInfo.bo
-	// Backoffer also needs to check the status code to determine whether to retry.
+	// Set the retryable checker for the backoffer if it's not set.
 	bo.SetRetryableChecker(func(err error) bool {
+		// Backoffer also needs to check the status code to determine whether to retry.
 		return err != nil && !noNeedRetry(statusCode)
-	})
+	}, false)
 	return bo.Exec(ctx, execFunc)
 }
 
diff --git a/client/retry/backoff.go b/client/retry/backoff.go
index 580e466badb..9161ad0fea1 100644
--- a/client/retry/backoff.go
+++ b/client/retry/backoff.go
@@ -24,12 +24,9 @@ import (
 	"github.com/pingcap/errors"
 	"github.com/pingcap/failpoint"
 	"github.com/pingcap/log"
-	"go.uber.org/multierr"
 	"go.uber.org/zap"
 )
 
-const maxRecordErrorCount = 20
-
 // Option is used to customize the backoffer.
 type Option func(*Backoffer)
 
@@ -50,7 +47,7 @@ type Backoffer struct {
 	// total defines the max total time duration cost in retrying. If it's 0, it means infinite retry until success.
 	total time.Duration
 	// retryableChecker is used to check if the error is retryable.
-	// By default, all errors are retryable.
+	// If it's not set, it will always retry unconditionally no matter what the error is.
 	retryableChecker func(err error) bool
 	// logInterval defines the log interval for retrying.
logInterval time.Duration @@ -69,28 +66,22 @@ func (bo *Backoffer) Exec( ) error { defer bo.resetBackoff() var ( - allErrors error - err error - after *time.Timer + err error + after *time.Timer ) fnName := getFunctionName(fn) for { err = fn() bo.attempt++ - if bo.attempt < maxRecordErrorCount { - // multierr.Append will ignore nil error. - allErrors = multierr.Append(allErrors, err) - } - if !bo.isRetryable(err) { + if err == nil || !bo.isRetryable(err) { break } currentInterval := bo.nextInterval() bo.nextLogTime += currentInterval - if err != nil { - if bo.logInterval > 0 && bo.nextLogTime >= bo.logInterval { - bo.nextLogTime %= bo.logInterval - log.Warn("call PD API failed and retrying", zap.String("api", fnName), zap.Int("retry-time", bo.attempt), zap.Error(err)) - } + if bo.logInterval > 0 && bo.nextLogTime >= bo.logInterval { + bo.nextLogTime %= bo.logInterval + log.Warn("[pd.backoffer] exec fn failed and retrying", + zap.String("fn-name", fnName), zap.Int("retry-time", bo.attempt), zap.Error(err)) } if after == nil { after = time.NewTimer(currentInterval) @@ -100,7 +91,7 @@ func (bo *Backoffer) Exec( select { case <-ctx.Done(): after.Stop() - return multierr.Append(allErrors, errors.Trace(ctx.Err())) + return errors.Trace(ctx.Err()) case <-after.C: failpoint.Inject("backOffExecute", func() { testBackOffExecuteFlag = true @@ -115,7 +106,7 @@ func (bo *Backoffer) Exec( } } } - return allErrors + return err } // InitialBackoffer make the initial state for retrying. @@ -132,12 +123,9 @@ func InitialBackoffer(base, max, total time.Duration, opts ...Option) *Backoffer total = base } bo := &Backoffer{ - base: base, - max: max, - total: total, - retryableChecker: func(err error) bool { - return err != nil - }, + base: base, + max: max, + total: total, next: base, currentTotal: 0, attempt: 0, @@ -148,8 +136,11 @@ func InitialBackoffer(base, max, total time.Duration, opts ...Option) *Backoffer return bo } -// SetRetryableChecker sets the retryable checker. -func (bo *Backoffer) SetRetryableChecker(checker func(err error) bool) { +// SetRetryableChecker sets the retryable checker, `overwrite` flag is used to indicate whether to overwrite the existing checker. +func (bo *Backoffer) SetRetryableChecker(checker func(err error) bool, overwrite bool) { + if !overwrite && bo.retryableChecker != nil { + return + } bo.retryableChecker = checker } diff --git a/client/retry/backoff_test.go b/client/retry/backoff_test.go index 8df06b75f94..22d487b1885 100644 --- a/client/retry/backoff_test.go +++ b/client/retry/backoff_test.go @@ -18,6 +18,7 @@ import ( "bytes" "context" "errors" + "fmt" "testing" "time" @@ -87,24 +88,64 @@ func TestBackoffer(t *testing.T) { return expectedErr }) re.InDelta(total, time.Since(start), float64(250*time.Millisecond)) - re.ErrorContains(err, "test; test; test; test") + re.ErrorContains(err, "test") re.ErrorIs(err, expectedErr) re.Equal(4, execCount) re.True(isBackofferReset(bo)) - // Test the retryable checker. + // Test the error returned. 
execCount = 0 - bo = InitialBackoffer(base, max, total) - bo.SetRetryableChecker(func(error) bool { - return execCount < 2 + err = bo.Exec(ctx, func() error { + execCount++ + return fmt.Errorf("test %d", execCount) }) + re.Error(err) + re.Equal("test 4", err.Error()) + re.Equal(4, execCount) + re.True(isBackofferReset(bo)) + execCount = 0 err = bo.Exec(ctx, func() error { + if execCount == 1 { + return nil + } execCount++ - return nil + return expectedErr }) + re.Equal(1, execCount) re.NoError(err) + re.True(isBackofferReset(bo)) + + // Test the retryable checker. + execCount = 0 + bo = InitialBackoffer(base, max, total) + retryableChecker := func(error) bool { + return execCount < 2 + } + bo.SetRetryableChecker(retryableChecker, false) + execFunc := func() error { + execCount++ + return expectedErr + } + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) + re.Equal(2, execCount) + re.True(isBackofferReset(bo)) + // Test the retryable checker with overwrite. + execCount = 0 + retryableChecker = func(error) bool { + return execCount < 4 + } + bo.SetRetryableChecker(retryableChecker, false) + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) re.Equal(2, execCount) re.True(isBackofferReset(bo)) + execCount = 0 + bo.SetRetryableChecker(retryableChecker, true) + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) + re.Equal(4, execCount) + re.True(isBackofferReset(bo)) } func isBackofferReset(bo *Backoffer) bool { @@ -129,21 +170,20 @@ func TestBackofferWithLog(t *testing.T) { // 10 + 20 + 40 + 80(log) + 100(log) * 9 >= 1000, so log ten times. re.Len(ms, 10) // 10 + 20 + 40 + 80 + 100 * 9, 13 times retry. - rfc := `["call PD API failed and retrying"] [api=testFn] [retry-time=13] [error=test]` + rfc := `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=13] [error=test]` re.Contains(ms[len(ms)-1], rfc) // 10 + 20 + 40 + 80(log), 4 times retry. 
- rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=4] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=4] [error=test]` re.Contains(ms[0], rfc) - bo.resetBackoff() err = bo.Exec(ctx, testFn) re.ErrorIs(err, errTest) ms = lg.Messages() re.Len(ms, 20) - rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=13] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=13] [error=test]` re.Contains(ms[len(ms)-1], rfc) - rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=4] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=4] [error=test]` re.Contains(ms[len1], rfc) } diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index fa109946e4b..9d7e0985940 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -21,6 +21,7 @@ import ( "net/url" "sort" "strings" + "sync" "testing" "time" @@ -531,14 +532,15 @@ func (suite *httpClientTestSuite) TestSchedulers() { defer cancel() schedulers, err := client.GetSchedulers(ctx) re.NoError(err) - re.Empty(schedulers) + const schedulerName = "evict-leader-scheduler" + re.NotContains(schedulers, schedulerName) - err = client.CreateScheduler(ctx, "evict-leader-scheduler", 1) + err = client.CreateScheduler(ctx, schedulerName, 1) re.NoError(err) schedulers, err = client.GetSchedulers(ctx) re.NoError(err) - re.Len(schedulers, 1) - err = client.SetSchedulerDelay(ctx, "evict-leader-scheduler", 100) + re.Contains(schedulers, schedulerName) + err = client.SetSchedulerDelay(ctx, schedulerName, 100) re.NoError(err) err = client.SetSchedulerDelay(ctx, "not-exist", 100) re.ErrorContains(err, "500 Internal Server Error") // TODO: should return friendly error message @@ -757,3 +759,43 @@ func (suite *httpClientTestSuite) TestGetHealthStatus() { re.Equal("pd2", healths[1].Name) re.True(healths[0].Health && healths[1].Health) } + +func (suite *httpClientTestSuite) TestRetryOnLeaderChange() { + re := suite.Require() + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + bo := retry.InitialBackoffer(100*time.Millisecond, time.Second, 0) + client := suite.client.WithBackoffer(bo) + for { + healths, err := client.GetHealthStatus(ctx) + if err != nil && strings.Contains(err.Error(), "context canceled") { + return + } + re.NoError(err) + re.Len(healths, 2) + select { + case <-ctx.Done(): + return + default: + } + } + }() + + leader := suite.cluster.GetLeaderServer() + re.NotNil(leader) + for i := 0; i < 3; i++ { + leader.ResignLeader() + re.NotEmpty(suite.cluster.WaitLeader()) + leader = suite.cluster.GetLeaderServer() + re.NotNil(leader) + } + + // Cancel the context to stop the goroutine. + cancel() + wg.Wait() +} From a929a546a790222299b556e449816e622288a5d1 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 3 Jun 2024 16:28:25 +0800 Subject: [PATCH 06/17] client/http, api/middleware: enhance the retry logic of the HTTP client (#8229) ref tikv/pd#7300 Schedule a member change check when the HTTP status code is 503 or receives a leader/primary change error. 
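For illustration only (not part of this change): a minimal sketch of how a
caller could pair this retry logic with the backoffer from #8227 so that
requests ride through a leader switch. It only uses client APIs exercised by
the tests in this series (`retry.InitialBackoffer`, `WithBackoffer`,
`GetHealthStatus`); the endpoint and the "doc-example" source name are
placeholders.

    package main

    import (
    	"context"
    	"fmt"
    	"time"

    	pdhttp "github.com/tikv/pd/client/http"
    	"github.com/tikv/pd/client/retry"
    )

    func main() {
    	// Back off from 100ms up to 1s per attempt; a total of 0 means the
    	// backoffer keeps retrying until the call succeeds.
    	bo := retry.InitialBackoffer(100*time.Millisecond, time.Second, 0)
    	// A 503 or a leader/primary change error schedules a member change
    	// check, so a retried attempt can be sent to the newly elected leader.
    	cli := pdhttp.NewClient("doc-example", []string{"http://127.0.0.1:2379"}).WithBackoffer(bo)
    	defer cli.Close()

    	healths, err := cli.GetHealthStatus(context.Background())
    	fmt.Println(healths, err)
    }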
Signed-off-by: JmPotato --- client/client.go | 11 ----- client/errs/errno.go | 13 +++-- client/errs/errs.go | 18 +++++++ client/http/client.go | 49 +++++++++++++------ client/http/request_info.go | 11 +++++ client/pd_service_discovery_test.go | 3 +- client/resource_manager_client.go | 7 +-- client/tso_dispatcher.go | 2 +- errors.toml | 10 ++++ pkg/errs/errno.go | 9 ++-- .../apiutil/multiservicesapi/middleware.go | 4 +- pkg/utils/apiutil/serverapi/middleware.go | 4 +- server/apiv2/middlewares/redirector.go | 4 +- tests/integrations/client/client_test.go | 3 +- .../mcs/tso/keyspace_group_manager_test.go | 5 +- tests/server/cluster/cluster_test.go | 2 +- 16 files changed, 99 insertions(+), 56 deletions(-) diff --git a/client/client.go b/client/client.go index 1865fd0866e..1c8ef3cafe8 100644 --- a/client/client.go +++ b/client/client.go @@ -1431,17 +1431,6 @@ func (c *client) scatterRegionsWithOptions(ctx context.Context, regionsID []uint return resp, nil } -// IsLeaderChange will determine whether there is a leader change. -func IsLeaderChange(err error) bool { - if err == errs.ErrClientTSOStreamClosed { - return true - } - errMsg := err.Error() - return strings.Contains(errMsg, errs.NotLeaderErr) || - strings.Contains(errMsg, errs.MismatchLeaderErr) || - strings.Contains(errMsg, errs.NotServedErr) -} - const ( httpSchemePrefix = "http://" httpsSchemePrefix = "https://" diff --git a/client/errs/errno.go b/client/errs/errno.go index 50c136dd5f2..0dbcb4fe147 100644 --- a/client/errs/errno.go +++ b/client/errs/errno.go @@ -20,21 +20,20 @@ import ( "github.com/pingcap/errors" ) +// Note: keep the same as the ones defined on the server side to ensure the client can use them correctly. const ( + // NoLeaderErr indicates there is no leader in the cluster currently. + NoLeaderErr = "no leader" // NotLeaderErr indicates the non-leader member received the requests which should be received by leader. - // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. - NotLeaderErr = "is not leader" + NotLeaderErr = "not leader" // MismatchLeaderErr indicates the non-leader member received the requests which should be received by leader. - // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. MismatchLeaderErr = "mismatch leader id" // NotServedErr indicates an tso node/pod received the requests for the keyspace groups which are not served by it. - // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. NotServedErr = "is not served" // RetryTimeoutErr indicates the server is busy. RetryTimeoutErr = "retry timeout" + // NotPrimaryErr indicates the non-primary member received the requests which should be received by primary. + NotPrimaryErr = "not primary" ) // client errors diff --git a/client/errs/errs.go b/client/errs/errs.go index 47f7c29a467..da333efda4c 100644 --- a/client/errs/errs.go +++ b/client/errs/errs.go @@ -15,11 +15,29 @@ package errs import ( + "strings" + "github.com/pingcap/errors" "go.uber.org/zap" "go.uber.org/zap/zapcore" ) +// IsLeaderChange will determine whether there is a leader/primary change. 
+func IsLeaderChange(err error) bool { + if err == nil { + return false + } + if err == ErrClientTSOStreamClosed { + return true + } + errMsg := err.Error() + return strings.Contains(errMsg, NoLeaderErr) || + strings.Contains(errMsg, NotLeaderErr) || + strings.Contains(errMsg, MismatchLeaderErr) || + strings.Contains(errMsg, NotServedErr) || + strings.Contains(errMsg, NotPrimaryErr) +} + // ZapError is used to make the log output easier. func ZapError(err error, causeError ...error) zap.Field { if err == nil { diff --git a/client/http/client.go b/client/http/client.go index 7b34193c2a4..123ca616422 100644 --- a/client/http/client.go +++ b/client/http/client.go @@ -120,10 +120,25 @@ func (ci *clientInner) requestWithRetry( headerOpts ...HeaderOption, ) error { var ( + serverURL string + isLeader bool statusCode int err error + logFields = append(reqInfo.logFields(), zap.String("source", ci.source)) ) execFunc := func() error { + defer func() { + // If the status code is 503, it indicates that there may be PD leader/follower changes. + // If the error message contains the leader/primary change information, it indicates that there may be PD leader/primary change. + if statusCode == http.StatusServiceUnavailable || errs.IsLeaderChange(err) { + ci.sd.ScheduleCheckMemberChanged() + } + log.Debug("[pd] http request finished", append(logFields, + zap.String("server-url", serverURL), + zap.Bool("is-leader", isLeader), + zap.Int("status-code", statusCode), + zap.Error(err))...) + }() // It will try to send the request to the PD leader first and then try to send the request to the other PD followers. clients := ci.sd.GetAllServiceClients() if len(clients) == 0 { @@ -131,17 +146,21 @@ func (ci *clientInner) requestWithRetry( } skipNum := 0 for _, cli := range clients { - url := cli.GetURL() - if reqInfo.targetURL != "" && reqInfo.targetURL != url { + serverURL = cli.GetURL() + isLeader = cli.IsConnectedToLeader() + if len(reqInfo.targetURL) > 0 && reqInfo.targetURL != serverURL { skipNum++ continue } - statusCode, err = ci.doRequest(ctx, url, reqInfo, headerOpts...) + statusCode, err = ci.doRequest(ctx, serverURL, reqInfo, headerOpts...) if err == nil || noNeedRetry(statusCode) { return err } - log.Debug("[pd] request url failed", - zap.String("source", ci.source), zap.Bool("is-leader", cli.IsConnectedToLeader()), zap.String("url", url), zap.Error(err)) + log.Debug("[pd] http request url failed", append(logFields, + zap.String("server-url", serverURL), + zap.Bool("is-leader", isLeader), + zap.Int("status-code", statusCode), + zap.Error(err))...) } if skipNum == len(clients) { return errs.ErrClientNoTargetMember @@ -169,26 +188,21 @@ func noNeedRetry(statusCode int) bool { func (ci *clientInner) doRequest( ctx context.Context, - url string, reqInfo *requestInfo, + serverURL string, reqInfo *requestInfo, headerOpts ...HeaderOption, ) (int, error) { var ( - source = ci.source callerID = reqInfo.callerID name = reqInfo.name method = reqInfo.method body = reqInfo.body res = reqInfo.res respHandler = reqInfo.respHandler + url = reqInfo.getURL(serverURL) + logFields = append(reqInfo.logFields(), + zap.String("source", ci.source), + zap.String("url", url)) ) - url = reqInfo.getURL(url) - logFields := []zap.Field{ - zap.String("source", source), - zap.String("name", name), - zap.String("url", url), - zap.String("method", method), - zap.String("caller-id", callerID), - } log.Debug("[pd] request the http url", logFields...) 
req, err := http.NewRequestWithContext(ctx, method, url, bytes.NewBuffer(body)) if err != nil { @@ -229,11 +243,14 @@ func (ci *clientInner) doRequest( if readErr != nil { logFields = append(logFields, zap.NamedError("read-body-error", err)) } else { + // API server will return a JSON body containing the detailed error message + // when the status code is not `http.StatusOK` 200. + bs = bytes.TrimSpace(bs) logFields = append(logFields, zap.ByteString("body", bs)) } log.Error("[pd] request failed with a non-200 status", logFields...) - return resp.StatusCode, errors.Errorf("request pd http api failed with status: '%s'", resp.Status) + return resp.StatusCode, errors.Errorf("request pd http api failed with status: '%s', body: '%s'", resp.Status, bs) } if res == nil { diff --git a/client/http/request_info.go b/client/http/request_info.go index 202eab1150f..3fb91c6ca97 100644 --- a/client/http/request_info.go +++ b/client/http/request_info.go @@ -18,6 +18,7 @@ import ( "fmt" "github.com/tikv/pd/client/retry" + "go.uber.org/zap" ) // The following constants are the names of the requests. @@ -157,3 +158,13 @@ func (ri *requestInfo) WithTargetURL(targetURL string) *requestInfo { func (ri *requestInfo) getURL(addr string) string { return fmt.Sprintf("%s%s", addr, ri.uri) } + +func (ri *requestInfo) logFields() []zap.Field { + return []zap.Field{ + zap.String("caller-id", ri.callerID), + zap.String("name", ri.name), + zap.String("uri", ri.uri), + zap.String("method", ri.method), + zap.String("target-url", ri.targetURL), + } +} diff --git a/client/pd_service_discovery_test.go b/client/pd_service_discovery_test.go index f4cde0e1911..44171873b1a 100644 --- a/client/pd_service_discovery_test.go +++ b/client/pd_service_discovery_test.go @@ -29,6 +29,7 @@ import ( "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" + "github.com/tikv/pd/client/errs" "github.com/tikv/pd/client/grpcutil" "github.com/tikv/pd/client/testutil" "google.golang.org/grpc" @@ -205,7 +206,7 @@ func (suite *serviceClientTestSuite) TestServiceClient() { re.NotNil(leaderConn) _, err := pb.NewGreeterClient(followerConn).SayHello(suite.ctx, &pb.HelloRequest{Name: "pd"}) - re.ErrorContains(err, "not leader") + re.ErrorContains(err, errs.NotLeaderErr) resp, err := pb.NewGreeterClient(leaderConn).SayHello(suite.ctx, &pb.HelloRequest{Name: "pd"}) re.NoError(err) re.Equal("Hello pd", resp.GetMessage()) diff --git a/client/resource_manager_client.go b/client/resource_manager_client.go index 872b241cfe7..98b123c0823 100644 --- a/client/resource_manager_client.go +++ b/client/resource_manager_client.go @@ -16,7 +16,6 @@ package pd import ( "context" - "strings" "time" "github.com/gogo/protobuf/proto" @@ -35,10 +34,6 @@ const ( modify actionType = 1 groupSettingsPathPrefix = "resource_group/settings" controllerConfigPathPrefix = "resource_group/controller" - // errNotPrimary is returned when the requested server is not primary. - errNotPrimary = "not primary" - // errNotLeader is returned when the requested server is not pd leader. - errNotLeader = "not leader" ) // GroupSettingsPathPrefixBytes is used to watch or get resource groups. @@ -83,7 +78,7 @@ func (c *client) resourceManagerClient() (rmpb.ResourceManagerClient, error) { // gRPCErrorHandler is used to handle the gRPC error returned by the resource manager service. 
func (c *client) gRPCErrorHandler(err error) { - if strings.Contains(err.Error(), errNotPrimary) || strings.Contains(err.Error(), errNotLeader) { + if errs.IsLeaderChange(err) { c.pdSvcDiscovery.ScheduleCheckMemberChanged() } } diff --git a/client/tso_dispatcher.go b/client/tso_dispatcher.go index d5b52ad6039..0919fd84744 100644 --- a/client/tso_dispatcher.go +++ b/client/tso_dispatcher.go @@ -303,7 +303,7 @@ tsoBatchLoop: cancel() stream = nil // Because ScheduleCheckMemberChanged is asynchronous, if the leader changes, we better call `updateMember` ASAP. - if IsLeaderChange(err) { + if errs.IsLeaderChange(err) { if err := bo.Exec(ctx, svcDiscovery.CheckMemberChanged); err != nil { select { case <-ctx.Done(): diff --git a/errors.toml b/errors.toml index 64101000478..a61c23a6fbd 100644 --- a/errors.toml +++ b/errors.toml @@ -16,11 +16,21 @@ error = ''' redirect failed ''' +["PD:apiutil:ErrRedirectNoLeader"] +error = ''' +redirect finds no leader +''' + ["PD:apiutil:ErrRedirectToNotLeader"] error = ''' redirect to not leader ''' +["PD:apiutil:ErrRedirectToNotPrimary"] +error = ''' +redirect to not primary +''' + ["PD:autoscaling:ErrEmptyMetricsResponse"] error = ''' metrics response from Prometheus is empty diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index 8c3e914531b..1f56a821032 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -195,10 +195,11 @@ var ( // apiutil errors var ( - ErrRedirect = errors.Normalize("redirect failed", errors.RFCCodeText("PD:apiutil:ErrRedirect")) - ErrOptionNotExist = errors.Normalize("the option %s does not exist", errors.RFCCodeText("PD:apiutil:ErrOptionNotExist")) - // ErrRedirectToNotLeader is the error message for redirect to not leader. - ErrRedirectToNotLeader = errors.Normalize("redirect to not leader", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotLeader")) + ErrRedirect = errors.Normalize("redirect failed", errors.RFCCodeText("PD:apiutil:ErrRedirect")) + ErrOptionNotExist = errors.Normalize("the option %s does not exist", errors.RFCCodeText("PD:apiutil:ErrOptionNotExist")) + ErrRedirectNoLeader = errors.Normalize("redirect finds no leader", errors.RFCCodeText("PD:apiutil:ErrRedirectNoLeader")) + ErrRedirectToNotLeader = errors.Normalize("redirect to not leader", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotLeader")) + ErrRedirectToNotPrimary = errors.Normalize("redirect to not primary", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotPrimary")) ) // grpcutil errors diff --git a/pkg/utils/apiutil/multiservicesapi/middleware.go b/pkg/utils/apiutil/multiservicesapi/middleware.go index ed34ecc6afb..4343adcc981 100644 --- a/pkg/utils/apiutil/multiservicesapi/middleware.go +++ b/pkg/utils/apiutil/multiservicesapi/middleware.go @@ -48,8 +48,8 @@ func ServiceRedirector() gin.HandlerFunc { // Prevent more than one redirection. 
if name := c.Request.Header.Get(ServiceRedirectorHeader); len(name) != 0 { - log.Error("redirect but server is not primary", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirect)) - c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirect.FastGenByArgs().Error()) + log.Error("redirect but server is not primary", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirectToNotPrimary)) + c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirectToNotPrimary.FastGenByArgs().Error()) return } diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go index 1cd3d5b53d6..0718702b5a5 100755 --- a/pkg/utils/apiutil/serverapi/middleware.go +++ b/pkg/utils/apiutil/serverapi/middleware.go @@ -210,7 +210,7 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http leader := h.waitForLeader(r) // The leader has not been elected yet. if leader == nil { - http.Error(w, "no leader", http.StatusServiceUnavailable) + http.Error(w, errs.ErrRedirectNoLeader.FastGenByArgs().Error(), http.StatusServiceUnavailable) return } // If the leader is the current server now, we can handle the request directly. @@ -222,7 +222,7 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http r.Header.Set(apiutil.PDRedirectorHeader, h.s.Name()) } else { // Prevent more than one redirection among PD/API servers. - log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirect)) + log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirectToNotLeader)) http.Error(w, errs.ErrRedirectToNotLeader.FastGenByArgs().Error(), http.StatusInternalServerError) return } diff --git a/server/apiv2/middlewares/redirector.go b/server/apiv2/middlewares/redirector.go index 37c06de1585..9c2c4081175 100644 --- a/server/apiv2/middlewares/redirector.go +++ b/server/apiv2/middlewares/redirector.go @@ -43,8 +43,8 @@ func Redirector() gin.HandlerFunc { // Prevent more than one redirection. 
 	if name := c.Request.Header.Get(apiutil.PDRedirectorHeader); len(name) != 0 {
-		log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirect))
-		c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirect.FastGenByArgs().Error())
+		log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirectToNotLeader))
+		c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirectToNotLeader.FastGenByArgs().Error())
 		return
 	}
 
diff --git a/tests/integrations/client/client_test.go b/tests/integrations/client/client_test.go
index dfe7a6980c7..65acd897726 100644
--- a/tests/integrations/client/client_test.go
+++ b/tests/integrations/client/client_test.go
@@ -40,6 +40,7 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/suite"
 	pd "github.com/tikv/pd/client"
+	clierrs "github.com/tikv/pd/client/errs"
 	"github.com/tikv/pd/client/retry"
 	"github.com/tikv/pd/pkg/core"
 	"github.com/tikv/pd/pkg/errs"
@@ -528,7 +529,7 @@ func TestGlobalAndLocalTSO(t *testing.T) {
 	re.NotEmpty(cluster.WaitLeader())
 	_, _, err = cli.GetTS(ctx)
 	re.Error(err)
-	re.True(pd.IsLeaderChange(err))
+	re.True(clierrs.IsLeaderChange(err))
 	_, _, err = cli.GetTS(ctx)
 	re.NoError(err)
 	re.NoError(failpoint.Disable("github.com/tikv/pd/client/skipUpdateMember"))
diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go
index 25d9516bf63..6d861962d9b 100644
--- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go
+++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/suite"
 	pd "github.com/tikv/pd/client"
+	clierrs "github.com/tikv/pd/client/errs"
 	"github.com/tikv/pd/pkg/election"
 	"github.com/tikv/pd/pkg/errs"
 	mcsutils "github.com/tikv/pd/pkg/mcs/utils"
@@ -467,8 +468,8 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) dispatchClient(
 			errMsg := err.Error()
 			// Ignore the errors caused by the split and context cancellation.
 			if strings.Contains(errMsg, "context canceled") ||
-				strings.Contains(errMsg, "not leader") ||
-				strings.Contains(errMsg, "not served") ||
+				strings.Contains(errMsg, clierrs.NotLeaderErr) ||
+				strings.Contains(errMsg, clierrs.NotServedErr) ||
 				strings.Contains(errMsg, "ErrKeyspaceNotAssigned") ||
 				strings.Contains(errMsg, "ErrKeyspaceGroupIsMerging") {
 				continue
diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go
index 07bcf3ee2a1..9e70a52d11d 100644
--- a/tests/server/cluster/cluster_test.go
+++ b/tests/server/cluster/cluster_test.go
@@ -662,7 +662,7 @@ func TestNotLeader(t *testing.T) {
 	grpcStatus, ok := status.FromError(err)
 	re.True(ok)
 	re.Equal(codes.Unavailable, grpcStatus.Code())
-	re.Equal("not leader", grpcStatus.Message())
+	re.ErrorContains(server.ErrNotLeader, grpcStatus.Message())
 }
 
 func TestStoreVersionChange(t *testing.T) {

From fcec1882ec12655b1f1bf31c55f56aaf20dc7dfb Mon Sep 17 00:00:00 2001
From: JmPotato
Date: Tue, 4 Jun 2024 16:04:26 +0800
Subject: [PATCH 07/17] server/join: log detailed info when a join failure member is detected (#8243)

ref tikv/pd#7983

Log the detailed info when a member that failed to join is detected, to help
with troubleshooting.
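For illustration only (not part of this change): with an invented member ID
and placeholder URLs, the new diagnostics would read roughly like

    [ERROR] ["there is an abnormally joined member in the current member list"] [id=12345] [peer-urls="[http://127.0.0.1:2380]"] [client-urls="[http://127.0.0.1:2379]"]

and the returned error becomes "there is a member 12345 that has not joined
successfully" instead of the previous anonymous "there is a member that has
not joined successfully".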
Signed-off-by: JmPotato --- server/join/join.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/server/join/join.go b/server/join/join.go index d1711063313..1319dc08d07 100644 --- a/server/join/join.go +++ b/server/join/join.go @@ -136,7 +136,11 @@ func PrepareJoinCluster(cfg *config.Config) error { existed := false for _, m := range listResp.Members { if len(m.Name) == 0 { - return errors.New("there is a member that has not joined successfully") + log.Error("there is an abnormal joined member in the current member list", + zap.Uint64("id", m.ID), + zap.Strings("peer-urls", m.PeerURLs), + zap.Strings("client-urls", m.ClientURLs)) + return errors.Errorf("there is a member %d that has not joined successfully", m.ID) } if m.Name == cfg.Name { existed = true @@ -184,7 +188,11 @@ func PrepareJoinCluster(cfg *config.Config) error { listSucc = true } if len(n) == 0 { - return errors.New("there is a member that has not joined successfully") + log.Error("there is an abnormal joined member in the current member list", + zap.Uint64("id", memb.ID), + zap.Strings("peer-urls", memb.PeerURLs), + zap.Strings("client-urls", memb.ClientURLs)) + return errors.Errorf("there is a member %d that has not joined successfully", memb.ID) } for _, m := range memb.PeerURLs { pds = append(pds, fmt.Sprintf("%s=%s", n, m)) From 492a8735fd8935a475982a54934800c546524854 Mon Sep 17 00:00:00 2001 From: wuhuizuo Date: Tue, 4 Jun 2024 16:49:25 +0800 Subject: [PATCH 08/17] chore: add prow OWNERS files to control the approvals for critical configuration files (#8218) close tikv/pd#8167 Signed-off-by: wuhuizuo --- OWNERS_ALIASES | 6 ++++++ client/resource_group/controller/OWNERS | 7 +++++++ client/tlsutil/OWNERS | 7 +++++++ conf/OWNERS | 7 +++++++ pkg/encryption/OWNERS | 7 +++++++ pkg/mcs/resourcemanager/server/OWNERS | 7 +++++++ pkg/mcs/scheduling/server/config/OWNERS | 7 +++++++ pkg/mcs/tso/server/OWNERS | 7 +++++++ pkg/schedule/config/OWNERS | 7 +++++++ pkg/schedule/schedulers/OWNERS | 7 +++++++ server/config/OWNERS | 7 +++++++ 11 files changed, 76 insertions(+) create mode 100644 OWNERS_ALIASES create mode 100644 client/resource_group/controller/OWNERS create mode 100644 client/tlsutil/OWNERS create mode 100644 conf/OWNERS create mode 100644 pkg/encryption/OWNERS create mode 100644 pkg/mcs/resourcemanager/server/OWNERS create mode 100644 pkg/mcs/scheduling/server/config/OWNERS create mode 100644 pkg/mcs/tso/server/OWNERS create mode 100644 pkg/schedule/config/OWNERS create mode 100644 pkg/schedule/schedulers/OWNERS create mode 100644 server/config/OWNERS diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES new file mode 100644 index 00000000000..516a466c91e --- /dev/null +++ b/OWNERS_ALIASES @@ -0,0 +1,6 @@ +# Sort the member alphabetically. 
+aliases: + sig-critical-approvers-config: + - easonn7 + - kevin-xianliu + - niubell diff --git a/client/resource_group/controller/OWNERS b/client/resource_group/controller/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/client/resource_group/controller/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/client/tlsutil/OWNERS b/client/tlsutil/OWNERS new file mode 100644 index 00000000000..211db06feee --- /dev/null +++ b/client/tlsutil/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|tlsconfig\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/conf/OWNERS b/conf/OWNERS new file mode 100644 index 00000000000..1a435c49089 --- /dev/null +++ b/conf/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.toml)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/encryption/OWNERS b/pkg/encryption/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/encryption/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/resourcemanager/server/OWNERS b/pkg/mcs/resourcemanager/server/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/resourcemanager/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/scheduling/server/config/OWNERS b/pkg/mcs/scheduling/server/config/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/scheduling/server/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/tso/server/OWNERS b/pkg/mcs/tso/server/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/tso/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/schedule/config/OWNERS b/pkg/schedule/config/OWNERS new file mode 100644 index 00000000000..ce5d15ddc19 --- /dev/null +++ b/pkg/schedule/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|(config|store_config)\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/schedule/schedulers/OWNERS b/pkg/schedule/schedulers/OWNERS new file mode 100644 index 00000000000..ae96e4f1f42 --- /dev/null +++ b/pkg/schedule/schedulers/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|hot_region_config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/server/config/OWNERS b/server/config/OWNERS new file mode 100644 index 00000000000..179de4843e6 --- /dev/null +++ b/server/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at 
https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|(config|service_middleware_config)\\.go)$": + approvers: + - sig-critical-approvers-config From d44d7212b3d3fce03adf0f8420bcd5c2cab7f7b3 Mon Sep 17 00:00:00 2001 From: okJiang <819421878@qq.com> Date: Tue, 4 Jun 2024 19:14:43 +0800 Subject: [PATCH 09/17] ctl: fix https client panic (#8239) * fix https client Signed-off-by: okJiang <819421878@qq.com> * fix comment & add one ut Signed-off-by: okJiang <819421878@qq.com> * EnableTraverseRunHooks Signed-off-by: okJiang <819421878@qq.com> * fix comment Signed-off-by: okJiang <819421878@qq.com> * empty Signed-off-by: okJiang <819421878@qq.com> --------- Signed-off-by: okJiang <819421878@qq.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- tools/pd-ctl/pdctl/command/global.go | 72 ++++++++----------- tools/pd-ctl/pdctl/command/global_test.go | 58 ++++++++++++++++ tools/pd-ctl/pdctl/ctl.go | 1 + tools/pd-ctl/tests/health/health_test.go | 84 +++++++++++++++++++++++ 4 files changed, 171 insertions(+), 44 deletions(-) create mode 100644 tools/pd-ctl/pdctl/command/global_test.go diff --git a/tools/pd-ctl/pdctl/command/global.go b/tools/pd-ctl/pdctl/command/global.go index f7c04c3ca5c..b29e2b63278 100644 --- a/tools/pd-ctl/pdctl/command/global.go +++ b/tools/pd-ctl/pdctl/command/global.go @@ -55,23 +55,15 @@ var PDCli pd.Client func requirePDClient(cmd *cobra.Command, _ []string) error { var ( - caPath string - err error + tlsConfig *tls.Config + err error ) - caPath, err = cmd.Flags().GetString("cacert") - if err == nil && len(caPath) != 0 { - var certPath, keyPath string - certPath, err = cmd.Flags().GetString("cert") - if err != nil { - return err - } - keyPath, err = cmd.Flags().GetString("key") - if err != nil { - return err - } - return initNewPDClientWithTLS(cmd, caPath, certPath, keyPath) + tlsConfig, err = parseTLSConfig(cmd) + if err != nil { + return err } - return initNewPDClient(cmd) + + return initNewPDClient(cmd, pd.WithTLSConfig(tlsConfig)) } // shouldInitPDClient checks whether we should create a new PD client according to the cluster information. @@ -111,44 +103,36 @@ func initNewPDClient(cmd *cobra.Command, opts ...pd.ClientOption) error { return nil } -func initNewPDClientWithTLS(cmd *cobra.Command, caPath, certPath, keyPath string) error { - tlsConfig, err := initTLSConfig(caPath, certPath, keyPath) - if err != nil { - return err - } - initNewPDClient(cmd, pd.WithTLSConfig(tlsConfig)) - return nil -} - // TODO: replace dialClient with the PD HTTP client completely. 
var dialClient = &http.Client{ Transport: apiutil.NewCallerIDRoundTripper(http.DefaultTransport, PDControlCallerID), } -// RequireHTTPSClient creates a HTTPS client if the related flags are set -func RequireHTTPSClient(cmd *cobra.Command, _ []string) error { +func parseTLSConfig(cmd *cobra.Command) (*tls.Config, error) { caPath, err := cmd.Flags().GetString("cacert") - if err == nil && len(caPath) != 0 { - certPath, err := cmd.Flags().GetString("cert") - if err != nil { - return err - } - keyPath, err := cmd.Flags().GetString("key") - if err != nil { - return err - } - err = initHTTPSClient(caPath, certPath, keyPath) - if err != nil { - cmd.Println(err) - return err - } + if err != nil || len(caPath) == 0 { + return nil, err + } + certPath, err := cmd.Flags().GetString("cert") + if err != nil { + return nil, err + } + keyPath, err := cmd.Flags().GetString("key") + if err != nil { + return nil, err } - return nil -} - -func initHTTPSClient(caPath, certPath, keyPath string) error { tlsConfig, err := initTLSConfig(caPath, certPath, keyPath) if err != nil { + return nil, err + } + + return tlsConfig, nil +} + +// RequireHTTPSClient creates a HTTPS client if the related flags are set +func RequireHTTPSClient(cmd *cobra.Command, _ []string) error { + tlsConfig, err := parseTLSConfig(cmd) + if err != nil || tlsConfig == nil { return err } dialClient = &http.Client{ diff --git a/tools/pd-ctl/pdctl/command/global_test.go b/tools/pd-ctl/pdctl/command/global_test.go new file mode 100644 index 00000000000..86eb4366d04 --- /dev/null +++ b/tools/pd-ctl/pdctl/command/global_test.go @@ -0,0 +1,58 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+package command + +import ( + "os" + "os/exec" + "testing" + + "github.com/spf13/cobra" + "github.com/stretchr/testify/require" +) + +func TestParseTLSConfig(t *testing.T) { + re := require.New(t) + + rootCmd := &cobra.Command{ + Use: "pd-ctl", + Short: "Placement Driver control", + SilenceErrors: true, + } + certPath := "../../tests/cert" + rootCmd.Flags().String("cacert", certPath+"/ca.pem", "path of file that contains list of trusted SSL CAs") + rootCmd.Flags().String("cert", certPath+"/client.pem", "path of file that contains X509 certificate in PEM format") + rootCmd.Flags().String("key", certPath+"/client-key.pem", "path of file that contains X509 key in PEM format") + + // generate certs + if err := os.Mkdir(certPath, 0755); err != nil { + t.Fatal(err) + } + certScript := "../../tests/cert_opt.sh" + if err := exec.Command(certScript, "generate", certPath).Run(); err != nil { + t.Fatal(err) + } + defer func() { + if err := exec.Command(certScript, "cleanup", certPath).Run(); err != nil { + t.Fatal(err) + } + if err := os.RemoveAll(certPath); err != nil { + t.Fatal(err) + } + }() + + tlsConfig, err := parseTLSConfig(rootCmd) + re.NoError(err) + re.NotNil(tlsConfig) +} diff --git a/tools/pd-ctl/pdctl/ctl.go b/tools/pd-ctl/pdctl/ctl.go index f8eaff5e76e..fbacd65dc53 100644 --- a/tools/pd-ctl/pdctl/ctl.go +++ b/tools/pd-ctl/pdctl/ctl.go @@ -30,6 +30,7 @@ import ( func init() { cobra.EnablePrefixMatching = true + cobra.EnableTraverseRunHooks = true } // GetRootCmd is exposed for integration tests. But it can be embedded into another suite, too. diff --git a/tools/pd-ctl/tests/health/health_test.go b/tools/pd-ctl/tests/health/health_test.go index 9150a56c91b..f1d3c7cfbf1 100644 --- a/tools/pd-ctl/tests/health/health_test.go +++ b/tools/pd-ctl/tests/health/health_test.go @@ -17,14 +17,21 @@ package health_test import ( "context" "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" "testing" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/utils/grpcutil" "github.com/tikv/pd/server/api" "github.com/tikv/pd/server/cluster" + "github.com/tikv/pd/server/config" pdTests "github.com/tikv/pd/tests" ctl "github.com/tikv/pd/tools/pd-ctl/pdctl" "github.com/tikv/pd/tools/pd-ctl/tests" + "go.etcd.io/etcd/pkg/transport" ) func TestHealth(t *testing.T) { @@ -68,3 +75,80 @@ func TestHealth(t *testing.T) { re.NoError(json.Unmarshal(output, &h)) re.Equal(healths, h) } + +func TestHealthTLS(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + certPath := "../cert" + certScript := "../cert_opt.sh" + // generate certs + if err := os.Mkdir(certPath, 0755); err != nil { + t.Fatal(err) + } + if err := exec.Command(certScript, "generate", certPath).Run(); err != nil { + t.Fatal(err) + } + defer func() { + if err := exec.Command(certScript, "cleanup", certPath).Run(); err != nil { + t.Fatal(err) + } + if err := os.RemoveAll(certPath); err != nil { + t.Fatal(err) + } + }() + + tlsInfo := transport.TLSInfo{ + KeyFile: filepath.Join(certPath, "pd-server-key.pem"), + CertFile: filepath.Join(certPath, "pd-server.pem"), + TrustedCAFile: filepath.Join(certPath, "ca.pem"), + } + tc, err := pdTests.NewTestCluster(ctx, 1, func(conf *config.Config, _ string) { + conf.Security.TLSConfig = grpcutil.TLSConfig{ + KeyPath: tlsInfo.KeyFile, + CertPath: tlsInfo.CertFile, + CAPath: tlsInfo.TrustedCAFile, + } + conf.AdvertiseClientUrls = strings.ReplaceAll(conf.AdvertiseClientUrls, "http", "https") + conf.ClientUrls = 
strings.ReplaceAll(conf.ClientUrls, "http", "https") + conf.AdvertisePeerUrls = strings.ReplaceAll(conf.AdvertisePeerUrls, "http", "https") + conf.PeerUrls = strings.ReplaceAll(conf.PeerUrls, "http", "https") + conf.InitialCluster = strings.ReplaceAll(conf.InitialCluster, "http", "https") + }) + re.NoError(err) + defer tc.Destroy() + err = tc.RunInitialServers() + re.NoError(err) + tc.WaitLeader() + cmd := ctl.GetRootCmd() + + client := tc.GetEtcdClient() + members, err := cluster.GetMembers(client) + re.NoError(err) + healthMembers := cluster.CheckHealth(tc.GetHTTPClient(), members) + healths := []api.Health{} + for _, member := range members { + h := api.Health{ + Name: member.Name, + MemberID: member.MemberId, + ClientUrls: member.ClientUrls, + Health: false, + } + if _, ok := healthMembers[member.GetMemberId()]; ok { + h.Health = true + } + healths = append(healths, h) + } + + pdAddr := tc.GetConfig().GetClientURL() + pdAddr = strings.ReplaceAll(pdAddr, "http", "https") + args := []string{"-u", pdAddr, "health", + "--cacert=../cert/ca.pem", + "--cert=../cert/client.pem", + "--key=../cert/client-key.pem"} + output, err := tests.ExecuteCommand(cmd, args...) + re.NoError(err) + h := make([]api.Health, len(healths)) + re.NoError(json.Unmarshal(output, &h)) + re.Equal(healths, h) +} From 82d3a4a241e12d76218e1aba7a5845c1793305b1 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Wed, 5 Jun 2024 11:11:56 +0800 Subject: [PATCH 10/17] grafana: use log2 for y-axis of uptime (#8240) close tikv/pd#8241 Close #8241 grafana: use log2 for y-axis of uptime Signed-off-by: Jack Yu Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- metrics/grafana/pd.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 69afb93f531..7965a341f6c 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -2096,7 +2096,7 @@ { "format": "dtdurations", "label": null, - "logBase": 1, + "logBase": 2, "max": null, "min": "0", "show": true From 301fabbedb64088f794b24809259efffe388d77d Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 5 Jun 2024 11:20:55 +0800 Subject: [PATCH 11/17] tools/simulator: add store api and replace simulator http with SDK (#8245) ref tikv/pd#8135 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- client/http/interface.go | 9 ++++ client/http/request_info.go | 1 + tests/integrations/client/http_client_test.go | 34 +++++++++++---- tools/pd-simulator/main.go | 6 +++ tools/pd-simulator/simulator/cases/cases.go | 4 +- .../simulator/cases/diagnose_rule.go | 23 +++++----- tools/pd-simulator/simulator/client.go | 43 ++++--------------- tools/pd-simulator/simulator/config/config.go | 4 +- tools/pd-simulator/simulator/event.go | 6 +++ tools/pd-simulator/simulator/node.go | 1 + 10 files changed, 73 insertions(+), 58 deletions(-) diff --git a/client/http/interface.go b/client/http/interface.go index 11c24beaefd..3684e19b1f5 100644 --- a/client/http/interface.go +++ b/client/http/interface.go @@ -49,6 +49,7 @@ type Client interface { GetRegionStatusByKeyRange(context.Context, *KeyRange, bool) (*RegionStats, error) GetStores(context.Context) (*StoresInfo, error) GetStore(context.Context, uint64) (*StoreInfo, error) + DeleteStore(context.Context, uint64) error SetStoreLabels(context.Context, int64, map[string]string) error GetHealthStatus(context.Context) ([]Health, error) /* Config-related interfaces */ @@ -440,6 +441,14 @@ func (c *client) 
GetStore(ctx context.Context, storeID uint64) (*StoreInfo, erro return &store, nil } +// DeleteStore deletes the store by ID. +func (c *client) DeleteStore(ctx context.Context, storeID uint64) error { + return c.request(ctx, newRequestInfo(). + WithName(deleteStoreName). + WithURI(StoreByID(storeID)). + WithMethod(http.MethodDelete)) +} + // GetClusterVersion gets the cluster version. func (c *client) GetClusterVersion(ctx context.Context) (string, error) { var version string diff --git a/client/http/request_info.go b/client/http/request_info.go index 3fb91c6ca97..40bd0368250 100644 --- a/client/http/request_info.go +++ b/client/http/request_info.go @@ -39,6 +39,7 @@ const ( getRegionStatusByKeyRangeName = "GetRegionStatusByKeyRange" getStoresName = "GetStores" getStoreName = "GetStore" + deleteStoreName = "DeleteStore" setStoreLabelsName = "SetStoreLabels" getHealthStatusName = "GetHealthStatus" getConfigName = "GetConfig" diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index 9d7e0985940..f4a48dcd63e 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -26,6 +26,7 @@ import ( "time" "github.com/pingcap/errors" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" @@ -80,6 +81,15 @@ func (suite *httpClientTestSuite) SetupSuite() { leaderServer := cluster.GetLeaderServer() err = leaderServer.BootstrapCluster() + // Add 2 more stores to the cluster. + for i := 2; i <= 4; i++ { + tests.MustPutStore(re, cluster, &metapb.Store{ + Id: uint64(i), + State: metapb.StoreState_Up, + NodeState: metapb.NodeState_Serving, + LastHeartbeat: time.Now().UnixNano(), + }) + } re.NoError(err) for _, region := range []*core.RegionInfo{ core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")), @@ -165,29 +175,29 @@ func (suite *httpClientTestSuite) TestMeta() { re.Empty(regionStats.StoreLeaderCount) hotReadRegions, err := client.GetHotReadRegions(ctx) re.NoError(err) - re.Len(hotReadRegions.AsPeer, 1) - re.Len(hotReadRegions.AsLeader, 1) + re.Len(hotReadRegions.AsPeer, 4) + re.Len(hotReadRegions.AsLeader, 4) hotWriteRegions, err := client.GetHotWriteRegions(ctx) re.NoError(err) - re.Len(hotWriteRegions.AsPeer, 1) - re.Len(hotWriteRegions.AsLeader, 1) + re.Len(hotWriteRegions.AsPeer, 4) + re.Len(hotWriteRegions.AsLeader, 4) historyHorRegions, err := client.GetHistoryHotRegions(ctx, &pd.HistoryHotRegionsRequest{ StartTime: 0, EndTime: time.Now().AddDate(0, 0, 1).UnixNano() / int64(time.Millisecond), }) re.NoError(err) re.Empty(historyHorRegions.HistoryHotRegion) - store, err := client.GetStores(ctx) + stores, err := client.GetStores(ctx) re.NoError(err) - re.Equal(1, store.Count) - re.Len(store.Stores, 1) - storeID := uint64(store.Stores[0].Store.ID) // TODO: why type is different? + re.Equal(4, stores.Count) + re.Len(stores.Stores, 4) + storeID := uint64(stores.Stores[0].Store.ID) // TODO: why type is different? 
store2, err := client.GetStore(ctx, storeID) re.NoError(err) re.EqualValues(storeID, store2.Store.ID) version, err := client.GetClusterVersion(ctx) re.NoError(err) - re.Equal("0.0.0", version) + re.Equal("1.0.0", version) rgs, _ := client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100) re.Equal(int64(0), rgs.Count) rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100) @@ -196,6 +206,12 @@ func (suite *httpClientTestSuite) TestMeta() { re.Equal(int64(1), rgs.Count) rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte(""), []byte("")), 100) re.Equal(int64(2), rgs.Count) + // store 2 origin status:offline + err = client.DeleteStore(ctx, 2) + re.NoError(err) + store2, err = client.GetStore(ctx, 2) + re.NoError(err) + re.Equal(int64(metapb.StoreState_Offline), store2.Store.State) } func (suite *httpClientTestSuite) TestGetMinResolvedTSByStoresIDs() { diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 45b3ecd75c9..e3dc43ca122 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -25,6 +25,7 @@ import ( "github.com/BurntSushi/toml" "github.com/pingcap/log" flag "github.com/spf13/pflag" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/utils/logutil" @@ -92,6 +93,7 @@ func main() { func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { + simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{*pdAddr}) simStart(*pdAddr, *statusAddress, simCase, simConfig) } else { local, clean := NewSingleServer(context.Background(), simConfig) @@ -105,6 +107,7 @@ func run(simCase string, simConfig *sc.SimConfig) { } time.Sleep(100 * time.Millisecond) } + simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{local.GetAddr()}) simStart(local.GetAddr(), "", simCase, simConfig, clean) } } @@ -183,6 +186,9 @@ EXIT: analysis.GetTransferCounter().PrintResult() } + if simulator.PDHTTPClient != nil { + simulator.PDHTTPClient.Close() + } if simResult != "OK" { os.Exit(1) } diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index 00b5404669f..c4e2f999978 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -16,8 +16,8 @@ package cases import ( "github.com/pingcap/kvproto/pkg/metapb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" @@ -57,7 +57,7 @@ type Case struct { TableNumber int Checker CheckerFunc // To check the schedule is finished. 
- Rules []*placement.Rule + Rules []*pdHttp.Rule Labels typeutil.StringSlice } diff --git a/tools/pd-simulator/simulator/cases/diagnose_rule.go b/tools/pd-simulator/simulator/cases/diagnose_rule.go index 5d34e051071..2cd11b9624a 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_rule.go +++ b/tools/pd-simulator/simulator/cases/diagnose_rule.go @@ -19,6 +19,7 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" @@ -30,15 +31,15 @@ import ( func newRule1(_ *sc.SimConfig) *Case { var simCase Case - simCase.Rules = make([]*placement.Rule, 0) - simCase.Rules = append(simCase.Rules, &placement.Rule{ + simCase.Rules = make([]*pdHttp.Rule, 0) + simCase.Rules = append(simCase.Rules, &pdHttp.Rule{ GroupID: "test1", ID: "test1", StartKeyHex: "", EndKeyHex: "", - Role: placement.Learner, + Role: pdHttp.Learner, Count: 1, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", @@ -46,14 +47,14 @@ func newRule1(_ *sc.SimConfig) *Case { }, }, LocationLabels: []string{"host"}, - }, &placement.Rule{ + }, &pdHttp.Rule{ GroupID: placement.DefaultGroupID, ID: placement.DefaultRuleID, StartKeyHex: "", EndKeyHex: "", - Role: placement.Voter, + Role: pdHttp.Voter, Count: 5, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", @@ -130,16 +131,16 @@ func newRule1(_ *sc.SimConfig) *Case { func newRule2(_ *sc.SimConfig) *Case { var simCase Case - simCase.Rules = make([]*placement.Rule, 0) + simCase.Rules = make([]*pdHttp.Rule, 0) simCase.Rules = append(simCase.Rules, - &placement.Rule{ + &pdHttp.Rule{ GroupID: "test1", ID: "test1", StartKeyHex: "", EndKeyHex: "", - Role: placement.Leader, + Role: pdHttp.Leader, Count: 1, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 50ed57995df..113eadab5e0 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -15,11 +15,7 @@ package simulator import ( - "bytes" "context" - "encoding/json" - "fmt" - "net/http" "strings" "sync" "time" @@ -27,8 +23,8 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" @@ -54,12 +50,12 @@ type Client interface { const ( pdTimeout = time.Second maxInitClusterRetries = 100 - httpPrefix = "pd/api/v1" ) var ( // errFailInitClusterID is returned when failed to load clusterID from all supplied PD addresses. 
errFailInitClusterID = errors.New("[pd] failed to get cluster id") + PDHTTPClient pdHttp.Client ) type client struct { @@ -67,7 +63,6 @@ type client struct { tag string clusterID uint64 clientConn *grpc.ClientConn - httpClient *http.Client reportRegionHeartbeatCh chan *core.RegionInfo receiveRegionHeartbeatCh chan *pdpb.RegionHeartbeatResponse @@ -88,7 +83,6 @@ func NewClient(pdAddr string, tag string) (Client, <-chan *pdpb.RegionHeartbeatR ctx: ctx, cancel: cancel, tag: tag, - httpClient: &http.Client{}, } cc, err := c.createConn() if err != nil { @@ -319,46 +313,27 @@ func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { func (c *client) PutPDConfig(config *sc.PDConfig) error { if len(config.PlacementRules) > 0 { - path := fmt.Sprintf("%s/%s/config/rules/batch", c.url, httpPrefix) - ruleOps := make([]*placement.RuleOp, 0) + ruleOps := make([]*pdHttp.RuleOp, 0) for _, rule := range config.PlacementRules { - ruleOps = append(ruleOps, &placement.RuleOp{ + ruleOps = append(ruleOps, &pdHttp.RuleOp{ Rule: rule, - Action: placement.RuleOpAdd, + Action: pdHttp.RuleOpAdd, }) } - content, _ := json.Marshal(ruleOps) - req, err := http.NewRequest(http.MethodPost, path, bytes.NewBuffer(content)) - req.Header.Add("Content-Type", "application/json") + err := PDHTTPClient.SetPlacementRuleInBatch(c.ctx, ruleOps) if err != nil { return err } - res, err := c.httpClient.Do(req) - if err != nil { - return err - } - defer res.Body.Close() - simutil.Logger.Info("add placement rule success", zap.String("rules", string(content))) + simutil.Logger.Info("add placement rule success", zap.Any("rules", config.PlacementRules)) } if len(config.LocationLabels) > 0 { - path := fmt.Sprintf("%s/%s/config", c.url, httpPrefix) data := make(map[string]any) data["location-labels"] = config.LocationLabels - content, err := json.Marshal(data) - if err != nil { - return err - } - req, err := http.NewRequest(http.MethodPost, path, bytes.NewBuffer(content)) - req.Header.Add("Content-Type", "application/json") - if err != nil { - return err - } - res, err := c.httpClient.Do(req) + err := PDHTTPClient.SetConfig(c.ctx, data) if err != nil { return err } - defer res.Body.Close() - simutil.Logger.Info("add location labels success", zap.String("labels", string(content))) + simutil.Logger.Info("add location labels success", zap.Any("labels", config.LocationLabels)) } return nil } diff --git a/tools/pd-simulator/simulator/config/config.go b/tools/pd-simulator/simulator/config/config.go index 01bf8199ab4..6598cf35c0f 100644 --- a/tools/pd-simulator/simulator/config/config.go +++ b/tools/pd-simulator/simulator/config/config.go @@ -21,8 +21,8 @@ import ( "github.com/BurntSushi/toml" "github.com/docker/go-units" + pdHttp "github.com/tikv/pd/client/http" sc "github.com/tikv/pd/pkg/schedule/config" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/configutil" "github.com/tikv/pd/pkg/utils/tempurl" "github.com/tikv/pd/pkg/utils/typeutil" @@ -133,6 +133,6 @@ func (sc *SimConfig) Speed() uint64 { // PDConfig saves some config which may be changed in PD. 
type PDConfig struct { - PlacementRules []*placement.Rule + PlacementRules []*pdHttp.Rule LocationLabels typeutil.StringSlice } diff --git a/tools/pd-simulator/simulator/event.go b/tools/pd-simulator/simulator/event.go index 8be8f89d759..20c75b58384 100644 --- a/tools/pd-simulator/simulator/event.go +++ b/tools/pd-simulator/simulator/event.go @@ -216,6 +216,12 @@ func (*DownNode) Run(raft *RaftEngine, _ int64) bool { return false } delete(raft.conn.Nodes, node.Id) + // delete store + err := PDHTTPClient.DeleteStore(context.Background(), node.Id) + if err != nil { + simutil.Logger.Error("put store failed", zap.Uint64("node-id", node.Id), zap.Error(err)) + return false + } node.Stop() regions := raft.GetRegions() diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index 883b5d4474b..c51cdfd8a38 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -72,6 +72,7 @@ func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error) StoreId: s.ID, Capacity: uint64(config.RaftStore.Capacity), StartTime: uint32(time.Now().Unix()), + Available: uint64(config.RaftStore.Capacity), }, } tag := fmt.Sprintf("store %d", s.ID) From 0bf9e90559f3d1efc7ded573505ddd6886f75264 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Wed, 5 Jun 2024 16:50:56 +0800 Subject: [PATCH 12/17] storelimit: fix datarace from `getOrCreateStoreLimit` (#8254) close tikv/pd#8253 Signed-off-by: lhy1024 --- pkg/core/storelimit/store_limit.go | 20 ++++++++-- .../operator/operator_controller_test.go | 37 +++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/pkg/core/storelimit/store_limit.go b/pkg/core/storelimit/store_limit.go index 8d70b2918a1..e35ec773d80 100644 --- a/pkg/core/storelimit/store_limit.go +++ b/pkg/core/storelimit/store_limit.go @@ -17,6 +17,7 @@ package storelimit import ( "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/ratelimit" + "github.com/tikv/pd/pkg/utils/syncutil" ) const ( @@ -106,7 +107,7 @@ func (l *StoreRateLimit) Rate(typ Type) float64 { if l.limits[typ] == nil { return 0.0 } - return l.limits[typ].ratePerSec + return l.limits[typ].GetRatePerSec() } // Take takes count tokens from the bucket without blocking. @@ -128,12 +129,15 @@ func (l *StoreRateLimit) Reset(rate float64, typ Type) { // limit the operators of a store type limit struct { - limiter *ratelimit.RateLimiter - ratePerSec float64 + limiter *ratelimit.RateLimiter + ratePerSecMutex syncutil.RWMutex + ratePerSec float64 } // Reset resets the rate limit. func (l *limit) Reset(ratePerSec float64) { + l.ratePerSecMutex.Lock() + defer l.ratePerSecMutex.Unlock() if l.ratePerSec == ratePerSec { return } @@ -155,6 +159,8 @@ func (l *limit) Reset(ratePerSec float64) { // Available returns the number of available tokens // It returns true if the rate per second is zero. func (l *limit) Available(n int64) bool { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() if l.ratePerSec == 0 { return true } @@ -164,8 +170,16 @@ func (l *limit) Available(n int64) bool { // Take takes count tokens from the bucket without blocking. 
func (l *limit) Take(count int64) bool { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() if l.ratePerSec == 0 { return true } return l.limiter.AllowN(int(count)) } + +func (l *limit) GetRatePerSec() float64 { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() + return l.ratePerSec +} diff --git a/pkg/schedule/operator/operator_controller_test.go b/pkg/schedule/operator/operator_controller_test.go index d3c50667fe0..2b16516c4c7 100644 --- a/pkg/schedule/operator/operator_controller_test.go +++ b/pkg/schedule/operator/operator_controller_test.go @@ -955,3 +955,40 @@ func (suite *operatorControllerTestSuite) TestInvalidStoreId() { // Although store 3 does not exist in PD, PD can also send op to TiKV. re.Equal(pdpb.OperatorStatus_RUNNING, oc.GetOperatorStatus(1).Status) } + +func TestConcurrentAddOperatorAndSetStoreLimit(t *testing.T) { + re := require.New(t) + opt := mockconfig.NewTestOptions() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + tc := mockcluster.NewCluster(ctx, opt) + stream := hbstream.NewTestHeartbeatStreams(ctx, tc.ID, tc, false /* no need to run */) + oc := NewController(ctx, tc.GetBasicCluster(), tc.GetSharedConfig(), stream) + + regionNum := 1000 + limit := 1600.0 + storeID := uint64(2) + for i := 1; i < 4; i++ { + tc.AddRegionStore(uint64(i), regionNum) + tc.SetStoreLimit(uint64(i), storelimit.AddPeer, limit) + } + for i := 1; i <= regionNum; i++ { + tc.AddLeaderRegion(uint64(i), 1, 3, 4) + } + + // Add operator and set store limit concurrently + var wg sync.WaitGroup + for i := 1; i < 10; i++ { + wg.Add(1) + go func(i uint64) { + defer wg.Done() + for j := 1; j < 10; j++ { + regionID := uint64(j) + i*100 + op := NewTestOperator(regionID, tc.GetRegion(regionID).GetRegionEpoch(), OpRegion, AddPeer{ToStore: storeID, PeerID: regionID}) + re.True(oc.AddOperator(op)) + tc.SetStoreLimit(storeID, storelimit.AddPeer, limit-float64(j)) // every goroutine set a different limit + } + }(uint64(i)) + } + wg.Wait() +} From 494c0e956b622a9b3cec8b98180b942cbbe5a0f3 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 6 Jun 2024 10:41:55 +0800 Subject: [PATCH 13/17] tools/simulator: avoid redundant schedule (#8257) close tikv/pd#5290, ref tikv/pd#8135 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- tools/pd-simulator/main.go | 2 ++ tools/pd-simulator/simulator/client.go | 7 +++++++ tools/pd-simulator/simulator/node.go | 3 +++ 3 files changed, 12 insertions(+) diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index e3dc43ca122..05763cc93b8 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -154,6 +154,8 @@ func simStart(pdAddr, statusAddress string, simCase string, simConfig *sc.SimCon tick := time.NewTicker(tickInterval) defer tick.Stop() sc := make(chan os.Signal, 1) + // halt scheduling + simulator.ChooseToHaltPDSchedule(true) signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 113eadab5e0..0bbbebe4602 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -16,6 +16,7 @@ package simulator import ( "context" + "strconv" "strings" "sync" "time" @@ -366,3 +367,9 @@ func (c *client) requestHeader() *pdpb.RequestHeader { ClusterId: c.clusterID, } } + +func ChooseToHaltPDSchedule(halt bool) { + PDHTTPClient.SetConfig(context.Background(), map[string]any{ + "schedule.halt-scheduling": 
strconv.FormatBool(halt), + }) +} diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index c51cdfd8a38..fe8dc74a944 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -172,6 +172,8 @@ func (n *Node) stepTask() { } } +var schedulerCheck sync.Once + func (n *Node) stepHeartBeat() { config := n.raftEngine.storeConfig @@ -182,6 +184,7 @@ func (n *Node) stepHeartBeat() { period = uint64(config.RaftStore.RegionHeartBeatInterval.Duration / config.SimTickInterval.Duration) if n.tick%period == 0 { n.regionHeartBeat() + schedulerCheck.Do(func() { ChooseToHaltPDSchedule(false) }) } } From f69d600f4b6a0c20a8c75b941ee8c055c48f74e7 Mon Sep 17 00:00:00 2001 From: Hu# Date: Fri, 7 Jun 2024 12:53:57 +0800 Subject: [PATCH 14/17] tests/realcluster: using real pd with race (#8270) ref tikv/pd#7298 Signed-off-by: husharp --- tests/integrations/realcluster/deploy.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integrations/realcluster/deploy.sh b/tests/integrations/realcluster/deploy.sh index d6cd0b27f72..8cce60e8ee6 100755 --- a/tests/integrations/realcluster/deploy.sh +++ b/tests/integrations/realcluster/deploy.sh @@ -15,10 +15,12 @@ curl --proto '=https' --tlsv1.2 -sSf https://tiup-mirrors.pingcap.com/install.sh $TIUP_BIN_DIR update playground cd ../../.. -if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && [ ! -e "bin/pd-server" ] && [ ! -e "bin/tiflash" ]; then +if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && [ ! -e "bin/tiflash" ]; then color-green "downloading binaries..." color-green "this may take a few minutes, you can also download them manually and put them in the bin directory." + make pd-server WITH_RACE=1 $TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor --tag pd_test \ + --pd.binpath ./bin/pd-server \ > $CUR_PATH/playground.log 2>&1 & else color-green "using existing binaries..." From e767c012fb46d7bd6425a89beb1ccb45d7d94473 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Fri, 7 Jun 2024 17:56:58 +0800 Subject: [PATCH 15/17] schedule: fix datarace in `operator.check` (#8264) close tikv/pd#8263 Signed-off-by: lhy1024 --- pkg/schedule/operator/operator.go | 5 ++-- pkg/schedule/operator/operator_controller.go | 2 +- pkg/schedule/operator/operator_test.go | 25 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/pkg/schedule/operator/operator.go b/pkg/schedule/operator/operator.go index de197c4fba4..4d57d4fc6c7 100644 --- a/pkg/schedule/operator/operator.go +++ b/pkg/schedule/operator/operator.go @@ -376,10 +376,11 @@ func (o *Operator) Check(region *core.RegionInfo) OpStep { defer func() { _ = o.CheckTimeout() }() for step := atomic.LoadInt32(&o.currentStep); int(step) < len(o.steps); step++ { if o.steps[int(step)].IsFinish(region) { - if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, time.Now().UnixNano()) { + current := time.Now() + if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, current.UnixNano()) { startTime, _ := o.getCurrentTimeAndStep() operatorStepDuration.WithLabelValues(reflect.TypeOf(o.steps[int(step)]).Name()). 
- Observe(time.Unix(0, o.stepsTime[step]).Sub(startTime).Seconds()) + Observe(current.Sub(startTime).Seconds()) } atomic.StoreInt32(&o.currentStep, step+1) } else { diff --git a/pkg/schedule/operator/operator_controller.go b/pkg/schedule/operator/operator_controller.go index d63e843f52a..fe93bd98756 100644 --- a/pkg/schedule/operator/operator_controller.go +++ b/pkg/schedule/operator/operator_controller.go @@ -461,7 +461,7 @@ func (oc *Controller) checkAddOperator(isPromoting bool, ops ...*Operator) (bool return false, NotInCreateStatus } if !isPromoting && oc.wopStatus.getCount(op.Desc()) >= oc.config.GetSchedulerMaxWaitingOperator() { - log.Debug("exceed max return false", zap.Uint64("waiting", oc.wopStatus.ops[op.Desc()]), zap.String("desc", op.Desc()), zap.Uint64("max", oc.config.GetSchedulerMaxWaitingOperator())) + log.Debug("exceed max return false", zap.Uint64("waiting", oc.wopStatus.getCount(op.Desc())), zap.String("desc", op.Desc()), zap.Uint64("max", oc.config.GetSchedulerMaxWaitingOperator())) operatorCounter.WithLabelValues(op.Desc(), "exceed-max-waiting").Inc() return false, ExceedWaitLimit } diff --git a/pkg/schedule/operator/operator_test.go b/pkg/schedule/operator/operator_test.go index 693f5c17475..1f44d813f1e 100644 --- a/pkg/schedule/operator/operator_test.go +++ b/pkg/schedule/operator/operator_test.go @@ -17,6 +17,7 @@ package operator import ( "context" "encoding/json" + "sync" "sync/atomic" "testing" "time" @@ -570,3 +571,27 @@ func (suite *operatorTestSuite) TestToJSONObject() { obj = op.ToJSONObject() suite.Equal(TIMEOUT, obj.Status) } + +func TestOperatorCheckConcurrently(t *testing.T) { + re := require.New(t) + region := newTestRegion(1, 1, [2]uint64{1, 1}, [2]uint64{2, 2}) + // addPeer1, transferLeader1, removePeer3 + steps := []OpStep{ + AddPeer{ToStore: 1, PeerID: 1}, + TransferLeader{FromStore: 3, ToStore: 1}, + RemovePeer{FromStore: 3}, + } + op := NewTestOperator(1, &metapb.RegionEpoch{}, OpAdmin|OpLeader|OpRegion, steps...) + re.Equal(constant.Urgent, op.GetPriorityLevel()) + checkSteps(re, op, steps) + op.Start() + var wg sync.WaitGroup + for i := 0; i < 10; i++ { + wg.Add(1) + go func() { + defer wg.Done() + re.Nil(op.Check(region)) + }() + } + wg.Wait() +} From c015f140f49a47e60b3884a9179e79e92482461c Mon Sep 17 00:00:00 2001 From: Hu# Date: Tue, 11 Jun 2024 13:20:59 +0800 Subject: [PATCH 16/17] rc: fix group change will meet data race (#8268) close tikv/pd#8267 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/resourcemanager/server/manager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go index ef402b8cbf9..418d188823f 100644 --- a/pkg/mcs/resourcemanager/server/manager.go +++ b/pkg/mcs/resourcemanager/server/manager.go @@ -129,7 +129,9 @@ func (m *Manager) Init(ctx context.Context) error { return err } // Load resource group meta info from storage. 
+ m.Lock() m.groups = make(map[string]*ResourceGroup) + m.Unlock() handler := func(k, v string) { group := &rmpb.ResourceGroup{} if err := proto.Unmarshal([]byte(v), group); err != nil { From 934816460a04cf674e79271b7099ddcc1ed35326 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Wed, 12 Jun 2024 20:42:00 +0800 Subject: [PATCH 17/17] mcs: add more comments about scheduler redirect (#8279) ref tikv/pd#5839 Signed-off-by: lhy1024 --- server/api/server.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/api/server.go b/server/api/server.go index ad614593b2f..7b7066c4f77 100644 --- a/server/api/server.go +++ b/server/api/server.go @@ -51,7 +51,7 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP // "/checker/{name}", http.MethodPost // "/checker/{name}", http.MethodGet // "/schedulers", http.MethodGet - // "/schedulers/{name}", http.MethodPost + // "/schedulers/{name}", http.MethodPost, which is to be used to pause or resume the scheduler rather than create a new scheduler // "/schedulers/diagnostic/{name}", http.MethodGet // "/scheduler-config", http.MethodGet // "/hotspot/regions/read", http.MethodGet @@ -62,6 +62,8 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP // Following requests are **not** redirected: // "/schedulers", http.MethodPost // "/schedulers/{name}", http.MethodDelete + // Because the writing of all the config of the scheduling service is in the API server, + // we should not post and delete the scheduler directly in the scheduling service. router.PathPrefix(apiPrefix).Handler(negroni.New( serverapi.NewRuntimeServiceValidator(svr, group), serverapi.NewRedirector(svr, @@ -163,7 +165,7 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP mcs.SchedulingServiceName, []string{http.MethodGet}), serverapi.MicroserviceRedirectRule( - prefix+"/schedulers/", // Note: this means "/schedulers/{name}" + prefix+"/schedulers/", // Note: this means "/schedulers/{name}", which is to be used to pause or resume the scheduler scheapi.APIPathPrefix+"/schedulers", mcs.SchedulingServiceName, []string{http.MethodPost}),
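The comments above pin down the split: creating or deleting a scheduler (POST "/schedulers", DELETE "/schedulers/{name}") stays in the API server, while pausing or resuming one (POST "/schedulers/{name}") is redirected to the scheduling service, transparently to the client. As a usage illustration, the sketch below assumes pd-ctl's convention for this endpoint, a JSON body of {"delay": seconds} where a delay of 0 resumes; the path prefix and payload are assumptions drawn from pd-ctl, not from this patch.

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

// pauseScheduler posts to the API server's "/schedulers/{name}" route,
// which the redirect rules above forward to the scheduling service.
func pauseScheduler(pdAddr, name string, delaySeconds int) error {
	// Assumed endpoint shape, following pd-ctl: POST {"delay": N} pauses
	// the scheduler for N seconds, and {"delay": 0} resumes it.
	url := fmt.Sprintf("%s/pd/api/v1/schedulers/%s", pdAddr, name)
	payload := bytes.NewBufferString(fmt.Sprintf(`{"delay": %d}`, delaySeconds))
	resp, err := http.Post(url, "application/json", payload)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("pause %s: unexpected status %s", name, resp.Status)
	}
	return nil
}

func main() {
	// Pause the balance-leader scheduler for 5 minutes via the API server.
	if err := pauseScheduler("http://127.0.0.1:2379", "balance-leader-scheduler", 300); err != nil {
		fmt.Println(err)
	}
}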