From ce0bb5dc398bd56a1f557e632e2f0473a4cb53a9 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Mon, 30 Oct 2023 15:09:35 +0800 Subject: [PATCH 01/20] mcs: support admin/cache http interface in scheduling server (#7279) ref tikv/pd#5839 Signed-off-by: lhy1024 --- errors.toml | 10 ++++ pkg/errs/errno.go | 6 ++ pkg/mcs/scheduling/server/apis/v1/api.go | 51 +++++++++++++++- pkg/mcs/scheduling/server/cluster.go | 10 ++++ server/api/admin.go | 54 +++++++++++++++-- tests/integrations/mcs/scheduling/api_test.go | 60 +++++++++++++++++++ 6 files changed, 185 insertions(+), 6 deletions(-) diff --git a/errors.toml b/errors.toml index 1b96de8a209..1d10d40d294 100644 --- a/errors.toml +++ b/errors.toml @@ -496,6 +496,16 @@ error = ''' init file log error, %s ''' +["PD:mcs:ErrNotFoundSchedulingAddr"] +error = ''' +cannot find scheduling address +''' + +["PD:mcs:ErrSchedulingServer"] +error = ''' +scheduling server meets %v +''' + ["PD:member:ErrCheckCampaign"] error = ''' check campaign failed diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index 181dfc9b393..e5bac8519be 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -403,3 +403,9 @@ var ( ErrDeleteReservedGroup = errors.Normalize("cannot delete reserved group", errors.RFCCodeText("PD:resourcemanager:ErrDeleteReservedGroup")) ErrInvalidGroup = errors.Normalize("invalid group settings, please check the group name, priority and the number of resources", errors.RFCCodeText("PD:resourcemanager:ErrInvalidGroup")) ) + +// Micro service errors +var ( + ErrNotFoundSchedulingAddr = errors.Normalize("cannot find scheduling address", errors.RFCCodeText("PD:mcs:ErrNotFoundSchedulingAddr")) + ErrSchedulingServer = errors.Normalize("scheduling server meets %v", errors.RFCCodeText("PD:mcs:ErrSchedulingServer")) +) diff --git a/pkg/mcs/scheduling/server/apis/v1/api.go b/pkg/mcs/scheduling/server/apis/v1/api.go index 39be00ef9a0..d0acdf39a09 100644 --- a/pkg/mcs/scheduling/server/apis/v1/api.go +++ b/pkg/mcs/scheduling/server/apis/v1/api.go @@ -15,7 +15,6 @@ package apis import ( - "fmt" "net/http" "strconv" "sync" @@ -26,6 +25,7 @@ import ( "github.com/gin-gonic/gin" "github.com/joho/godotenv" "github.com/pingcap/log" + "github.com/tikv/pd/pkg/errs" scheserver "github.com/tikv/pd/pkg/mcs/scheduling/server" mcsutils "github.com/tikv/pd/pkg/mcs/utils" sche "github.com/tikv/pd/pkg/schedule/core" @@ -121,6 +121,8 @@ func NewService(srv *scheserver.Service) *Service { func (s *Service) RegisterAdminRouter() { router := s.root.Group("admin") router.PUT("/log", changeLogLevel) + router.DELETE("cache/regions", deleteAllRegionCache) + router.DELETE("cache/regions/:id", deleteRegionCacheByID) } // RegisterSchedulersRouter registers the router of the schedulers handler. @@ -160,6 +162,11 @@ func (s *Service) RegisterOperatorsRouter() { router.GET("/records", getOperatorRecords) } +// @Tags admin +// @Summary Change the log level. +// @Produce json +// @Success 200 {string} string "The log level is updated." +// @Router /admin/log [put] func changeLogLevel(c *gin.Context) { svr := c.MustGet(multiservicesapi.ServiceContextKey).(*scheserver.Server) var level string @@ -176,6 +183,46 @@ func changeLogLevel(c *gin.Context) { c.String(http.StatusOK, "The log level is updated.") } +// @Tags admin +// @Summary Drop all regions from cache. +// @Produce json +// @Success 200 {string} string "All regions are removed from server cache." 
+// @Router /admin/cache/regions [delete] +func deleteAllRegionCache(c *gin.Context) { + svr := c.MustGet(multiservicesapi.ServiceContextKey).(*scheserver.Server) + cluster := svr.GetCluster() + if cluster == nil { + c.String(http.StatusInternalServerError, errs.ErrNotBootstrapped.GenWithStackByArgs().Error()) + return + } + cluster.DropCacheAllRegion() + c.String(http.StatusOK, "All regions are removed from server cache.") +} + +// @Tags admin +// @Summary Drop a specific region from cache. +// @Param id path integer true "Region Id" +// @Produce json +// @Success 200 {string} string "The region is removed from server cache." +// @Failure 400 {string} string "The input is invalid." +// @Router /admin/cache/regions/{id} [delete] +func deleteRegionCacheByID(c *gin.Context) { + svr := c.MustGet(multiservicesapi.ServiceContextKey).(*scheserver.Server) + cluster := svr.GetCluster() + if cluster == nil { + c.String(http.StatusInternalServerError, errs.ErrNotBootstrapped.GenWithStackByArgs().Error()) + return + } + regionIDStr := c.Param("id") + regionID, err := strconv.ParseUint(regionIDStr, 10, 64) + if err != nil { + c.String(http.StatusBadRequest, err.Error()) + return + } + cluster.DropCacheRegion(regionID) + c.String(http.StatusOK, "The region is removed from server cache.") +} + // @Tags operators // @Summary Get an operator by ID. // @Param region_id path int true "A Region's Id" @@ -475,7 +522,7 @@ func getHotRegions(typ utils.RWType, c *gin.Context) { for _, storeID := range storeIDs { id, err := strconv.ParseUint(storeID, 10, 64) if err != nil { - c.String(http.StatusBadRequest, fmt.Sprintf("invalid store id: %s", storeID)) + c.String(http.StatusBadRequest, errs.ErrInvalidStoreID.FastGenByArgs(storeID).Error()) return } _, err = handler.GetStore(id) diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 0c4e781e747..028c2a12b37 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -593,3 +593,13 @@ func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo) error { func (c *Cluster) IsPrepared() bool { return c.coordinator.GetPrepareChecker().IsPrepared() } + +// DropCacheAllRegion removes all cached regions. +func (c *Cluster) DropCacheAllRegion() { + c.ResetRegionCache() +} + +// DropCacheRegion removes a region from the cache. +func (c *Cluster) DropCacheRegion(id uint64) { + c.RemoveRegionIfExist(id) +} diff --git a/server/api/admin.go b/server/api/admin.go index 7a1dfb0f1e8..246c9239f59 100644 --- a/server/api/admin.go +++ b/server/api/admin.go @@ -16,6 +16,7 @@ package api import ( "encoding/json" + "fmt" "io" "net/http" "strconv" @@ -24,6 +25,7 @@ import ( "github.com/pingcap/log" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/errs" + "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/utils/apiutil" "github.com/tikv/pd/server" "github.com/unrolled/render" @@ -59,7 +61,11 @@ func (h *adminHandler) DeleteRegionCache(w http.ResponseWriter, r *http.Request) return } rc.DropCacheRegion(regionID) - h.rd.JSON(w, http.StatusOK, "The region is removed from server cache.") + if h.svr.IsAPIServiceMode() { + err = h.DeleteRegionCacheInSchedulingServer(regionID) + } + msg := "The region is removed from server cache." + h.rd.JSON(w, http.StatusOK, h.buildMsg(msg, err)) } // @Tags admin @@ -95,8 +101,11 @@ func (h *adminHandler) DeleteRegionStorage(w http.ResponseWriter, r *http.Reques } // Remove region from cache. 
rc.DropCacheRegion(regionID) - - h.rd.JSON(w, http.StatusOK, "The region is removed from server cache and region meta storage.") + if h.svr.IsAPIServiceMode() { + err = h.DeleteRegionCacheInSchedulingServer(regionID) + } + msg := "The region is removed from server cache and region meta storage." + h.rd.JSON(w, http.StatusOK, h.buildMsg(msg, err)) } // @Tags admin @@ -105,9 +114,14 @@ func (h *adminHandler) DeleteRegionStorage(w http.ResponseWriter, r *http.Reques // @Success 200 {string} string "All regions are removed from server cache." // @Router /admin/cache/regions [delete] func (h *adminHandler) DeleteAllRegionCache(w http.ResponseWriter, r *http.Request) { + var err error rc := getCluster(r) rc.DropCacheAllRegion() - h.rd.JSON(w, http.StatusOK, "All regions are removed from server cache.") + if h.svr.IsAPIServiceMode() { + err = h.DeleteRegionCacheInSchedulingServer() + } + msg := "All regions are removed from server cache." + h.rd.JSON(w, http.StatusOK, h.buildMsg(msg, err)) } // Intentionally no swagger mark as it is supposed to be only used in @@ -200,3 +214,35 @@ func (h *adminHandler) RecoverAllocID(w http.ResponseWriter, r *http.Request) { _ = h.rd.Text(w, http.StatusOK, "") } + +func (h *adminHandler) DeleteRegionCacheInSchedulingServer(id ...uint64) error { + addr, ok := h.svr.GetServicePrimaryAddr(h.svr.Context(), utils.SchedulingServiceName) + if !ok { + return errs.ErrNotFoundSchedulingAddr.FastGenByArgs() + } + var idStr string + if len(id) > 0 { + idStr = strconv.FormatUint(id[0], 10) + } + url := fmt.Sprintf("%s/scheduling/api/v1/admin/cache/regions/%s", addr, idStr) + req, err := http.NewRequest(http.MethodDelete, url, nil) + if err != nil { + return err + } + resp, err := h.svr.GetHTTPClient().Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return errs.ErrSchedulingServer.FastGenByArgs(resp.StatusCode) + } + return nil +} + +func (h *adminHandler) buildMsg(msg string, err error) string { + if h.svr.IsAPIServiceMode() && err != nil { + return fmt.Sprintf("This operation was executed in API server but needs to be re-executed on scheduling server due to the following error: %s", err.Error()) + } + return msg +} diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go index 5284913813c..d6028204325 100644 --- a/tests/integrations/mcs/scheduling/api_test.go +++ b/tests/integrations/mcs/scheduling/api_test.go @@ -9,7 +9,9 @@ import ( "time" "github.com/pingcap/failpoint" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" + "github.com/tikv/pd/pkg/core" _ "github.com/tikv/pd/pkg/mcs/scheduling/server/apis/v1" "github.com/tikv/pd/pkg/schedule/handler" "github.com/tikv/pd/pkg/statistics" @@ -218,3 +220,61 @@ func (suite *apiTestSuite) TestAPIForward() { testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) re.NoError(err) } + +func TestAdminRegionCache(t *testing.T) { + re := require.New(t) + checkAdminRegionCache := func(cluster *tests.TestCluster) { + r1 := core.NewTestRegionInfo(10, 1, []byte(""), []byte("b"), core.SetRegionConfVer(100), core.SetRegionVersion(100)) + tests.MustPutRegionInfo(re, cluster, r1) + r2 := core.NewTestRegionInfo(20, 1, []byte("b"), []byte("c"), core.SetRegionConfVer(100), core.SetRegionVersion(100)) + tests.MustPutRegionInfo(re, cluster, r2) + r3 := core.NewTestRegionInfo(30, 1, []byte("c"), []byte(""), core.SetRegionConfVer(100), core.SetRegionVersion(100)) + tests.MustPutRegionInfo(re, cluster, 
r3) + + schedulingServer := cluster.GetSchedulingPrimaryServer() + re.Equal(3, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) + + addr := schedulingServer.GetAddr() + urlPrefix := fmt.Sprintf("%s/scheduling/api/v1/admin/cache/regions", addr) + err := testutil.CheckDelete(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "30"), testutil.StatusOK(re)) + re.NoError(err) + re.Equal(2, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) + + err = testutil.CheckDelete(testDialClient, urlPrefix, testutil.StatusOK(re)) + re.NoError(err) + re.Equal(0, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) + } + env := tests.NewSchedulingTestEnvironment(t) + env.RunTestInAPIMode(checkAdminRegionCache) +} + +func TestAdminRegionCacheForward(t *testing.T) { + re := require.New(t) + checkAdminRegionCache := func(cluster *tests.TestCluster) { + r1 := core.NewTestRegionInfo(10, 1, []byte(""), []byte("b"), core.SetRegionConfVer(100), core.SetRegionVersion(100)) + tests.MustPutRegionInfo(re, cluster, r1) + r2 := core.NewTestRegionInfo(20, 1, []byte("b"), []byte("c"), core.SetRegionConfVer(100), core.SetRegionVersion(100)) + tests.MustPutRegionInfo(re, cluster, r2) + r3 := core.NewTestRegionInfo(30, 1, []byte("c"), []byte(""), core.SetRegionConfVer(100), core.SetRegionVersion(100)) + tests.MustPutRegionInfo(re, cluster, r3) + + apiServer := cluster.GetLeaderServer().GetServer() + schedulingServer := cluster.GetSchedulingPrimaryServer() + re.Equal(3, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) + re.Equal(3, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + + addr := cluster.GetLeaderServer().GetAddr() + urlPrefix := fmt.Sprintf("%s/pd/api/v1/admin/cache/region", addr) + err := testutil.CheckDelete(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "30"), testutil.StatusOK(re)) + re.NoError(err) + re.Equal(2, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) + re.Equal(2, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + + err = testutil.CheckDelete(testDialClient, urlPrefix+"s", testutil.StatusOK(re)) + re.NoError(err) + re.Equal(0, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) + re.Equal(0, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + } + env := tests.NewSchedulingTestEnvironment(t) + env.RunTestInAPIMode(checkAdminRegionCache) +} From ded917b47a4ca57ed614a8cf5e5a1b77ce549368 Mon Sep 17 00:00:00 2001 From: Yongbo Jiang Date: Tue, 31 Oct 2023 17:54:37 +0800 Subject: [PATCH 02/20] config: fix `Clone` in `RatelimitConfig` (#7289) close tikv/pd#7288 Signed-off-by: Cabinfever_B --- server/config/config_test.go | 26 ++++++++++++++++++++++ server/config/service_middleware_config.go | 10 +++++++++ 2 files changed, 36 insertions(+) diff --git a/server/config/config_test.go b/server/config/config_test.go index 75e69c26d5c..07cdc966409 100644 --- a/server/config/config_test.go +++ b/server/config/config_test.go @@ -26,6 +26,7 @@ import ( "github.com/BurntSushi/toml" "github.com/spf13/pflag" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/ratelimit" sc "github.com/tikv/pd/pkg/schedule/config" "github.com/tikv/pd/pkg/storage" "github.com/tikv/pd/pkg/utils/configutil" @@ -479,3 +480,28 @@ func newTestScheduleOption() (*PersistOptions, error) { opt := NewPersistOptions(cfg) return opt, nil } + +func TestRateLimitClone(t *testing.T) { + re := require.New(t) + cfg := &RateLimitConfig{ + EnableRateLimit: defaultEnableRateLimitMiddleware, + 
LimiterConfig: make(map[string]ratelimit.DimensionConfig), + } + clone := cfg.Clone() + clone.LimiterConfig["test"] = ratelimit.DimensionConfig{ + ConcurrencyLimit: 200, + } + dc := cfg.LimiterConfig["test"] + re.Equal(dc.ConcurrencyLimit, uint64(0)) + + gCfg := &GRPCRateLimitConfig{ + EnableRateLimit: defaultEnableGRPCRateLimitMiddleware, + LimiterConfig: make(map[string]ratelimit.DimensionConfig), + } + gClone := gCfg.Clone() + gClone.LimiterConfig["test"] = ratelimit.DimensionConfig{ + ConcurrencyLimit: 300, + } + gdc := gCfg.LimiterConfig["test"] + re.Equal(gdc.ConcurrencyLimit, uint64(0)) +} diff --git a/server/config/service_middleware_config.go b/server/config/service_middleware_config.go index ef0b04b2abd..b13e3398ac5 100644 --- a/server/config/service_middleware_config.go +++ b/server/config/service_middleware_config.go @@ -78,7 +78,12 @@ type RateLimitConfig struct { // Clone returns a cloned rate limit config. func (c *RateLimitConfig) Clone() *RateLimitConfig { + m := make(map[string]ratelimit.DimensionConfig, len(c.LimiterConfig)) + for k, v := range c.LimiterConfig { + m[k] = v + } cfg := *c + cfg.LimiterConfig = m return &cfg } @@ -92,6 +97,11 @@ type GRPCRateLimitConfig struct { // Clone returns a cloned rate limit config. func (c *GRPCRateLimitConfig) Clone() *GRPCRateLimitConfig { + m := make(map[string]ratelimit.DimensionConfig, len(c.LimiterConfig)) + for k, v := range c.LimiterConfig { + m[k] = v + } cfg := *c + cfg.LimiterConfig = m return &cfg } From 4e45e951c3d351fef247310ef569efb2b912850a Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Wed, 1 Nov 2023 13:56:07 +0800 Subject: [PATCH 03/20] mcs: support scheduler config forward and enable some tests (#7256) ref tikv/pd#5839 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/scheduling/server/apis/v1/api.go | 57 +++++ pkg/schedule/handler/handler.go | 45 +++- pkg/schedule/schedulers/scheduler.go | 18 ++ .../schedulers/scheduler_controller.go | 5 + .../schedulers/shuffle_region_config.go | 1 + pkg/utils/testutil/api_check.go | 16 +- server/api/server.go | 6 + server/config/persist_options.go | 5 +- server/server.go | 15 +- tests/integrations/mcs/scheduling/api_test.go | 21 ++ tests/pdctl/scheduler/scheduler_test.go | 231 ++++++++++-------- tests/server/api/scheduler_test.go | 101 ++++---- tools/pd-ctl/pdctl/command/scheduler.go | 14 +- 13 files changed, 350 insertions(+), 185 deletions(-) diff --git a/pkg/mcs/scheduling/server/apis/v1/api.go b/pkg/mcs/scheduling/server/apis/v1/api.go index d0acdf39a09..356dc5a7f42 100644 --- a/pkg/mcs/scheduling/server/apis/v1/api.go +++ b/pkg/mcs/scheduling/server/apis/v1/api.go @@ -31,6 +31,7 @@ import ( sche "github.com/tikv/pd/pkg/schedule/core" "github.com/tikv/pd/pkg/schedule/handler" "github.com/tikv/pd/pkg/schedule/operator" + "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics/utils" "github.com/tikv/pd/pkg/storage" "github.com/tikv/pd/pkg/utils/apiutil" @@ -130,6 +131,8 @@ func (s *Service) RegisterSchedulersRouter() { router := s.root.Group("schedulers") router.GET("", getSchedulers) router.GET("/diagnostic/:name", getDiagnosticResult) + router.GET("/config", getSchedulerConfig) + router.GET("/config/:name/list", getSchedulerConfigByName) // TODO: in the future, we should split pauseOrResumeScheduler to two different APIs. // And we need to do one-to-two forwarding in the API middleware. 
router.POST("/:name", pauseOrResumeScheduler) @@ -432,6 +435,60 @@ func getSchedulers(c *gin.Context) { c.IndentedJSON(http.StatusOK, output) } +// @Tags schedulers +// @Summary List all scheduler configs. +// @Produce json +// @Success 200 {object} map[string]interface{} +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /schedulers/config/ [get] +func getSchedulerConfig(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + sc, err := handler.GetSchedulersController() + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + sches, configs, err := sc.GetAllSchedulerConfigs() + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + c.IndentedJSON(http.StatusOK, schedulers.ToPayload(sches, configs)) +} + +// @Tags schedulers +// @Summary List scheduler config by name. +// @Produce json +// @Success 200 {object} map[string]interface{} +// @Failure 404 {string} string scheduler not found +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /schedulers/config/{name}/list [get] +func getSchedulerConfigByName(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + sc, err := handler.GetSchedulersController() + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + handlers := sc.GetSchedulerHandlers() + name := c.Param("name") + if _, ok := handlers[name]; !ok { + c.String(http.StatusNotFound, errs.ErrSchedulerNotFound.GenWithStackByArgs().Error()) + return + } + isDisabled, err := sc.IsSchedulerDisabled(name) + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + if isDisabled { + c.String(http.StatusNotFound, errs.ErrSchedulerNotFound.GenWithStackByArgs().Error()) + return + } + c.Request.URL.Path = "/list" + handlers[name].ServeHTTP(c.Writer, c.Request) +} + // @Tags schedulers // @Summary List schedulers diagnostic result. // @Produce json diff --git a/pkg/schedule/handler/handler.go b/pkg/schedule/handler/handler.go index fca43f3eeeb..45b0eaf502f 100644 --- a/pkg/schedule/handler/handler.go +++ b/pkg/schedule/handler/handler.go @@ -765,13 +765,22 @@ func (h *Handler) GetCheckerStatus(name string) (map[string]bool, error) { }, nil } -// GetSchedulerNames returns all names of schedulers. -func (h *Handler) GetSchedulerNames() ([]string, error) { +// GetSchedulersController returns controller of schedulers. +func (h *Handler) GetSchedulersController() (*schedulers.Controller, error) { co := h.GetCoordinator() if co == nil { return nil, errs.ErrNotBootstrapped.GenWithStackByArgs() } - return co.GetSchedulersController().GetSchedulerNames(), nil + return co.GetSchedulersController(), nil +} + +// GetSchedulerNames returns all names of schedulers. +func (h *Handler) GetSchedulerNames() ([]string, error) { + sc, err := h.GetSchedulersController() + if err != nil { + return nil, err + } + return sc.GetSchedulerNames(), nil } type schedulerPausedPeriod struct { @@ -782,11 +791,10 @@ type schedulerPausedPeriod struct { // GetSchedulerByStatus returns all names of schedulers by status. 
func (h *Handler) GetSchedulerByStatus(status string, needTS bool) (interface{}, error) { - co := h.GetCoordinator() - if co == nil { - return nil, errs.ErrNotBootstrapped.GenWithStackByArgs() + sc, err := h.GetSchedulersController() + if err != nil { + return nil, err } - sc := co.GetSchedulersController() schedulers := sc.GetSchedulerNames() switch status { case "paused": @@ -837,7 +845,20 @@ func (h *Handler) GetSchedulerByStatus(status string, needTS bool) (interface{}, } return disabledSchedulers, nil default: - return schedulers, nil + // The default scheduler could not be deleted in scheduling server, + // so schedulers could only be disabled. + // We should not return the disabled schedulers here. + var enabledSchedulers []string + for _, scheduler := range schedulers { + disabled, err := sc.IsSchedulerDisabled(scheduler) + if err != nil { + return nil, err + } + if !disabled { + enabledSchedulers = append(enabledSchedulers, scheduler) + } + } + return enabledSchedulers, nil } } @@ -861,11 +882,11 @@ func (h *Handler) GetDiagnosticResult(name string) (*schedulers.DiagnosticResult // t == 0 : resume scheduler. // t > 0 : scheduler delays t seconds. func (h *Handler) PauseOrResumeScheduler(name string, t int64) (err error) { - co := h.GetCoordinator() - if co == nil { - return errs.ErrNotBootstrapped.GenWithStackByArgs() + sc, err := h.GetSchedulersController() + if err != nil { + return err } - if err = co.GetSchedulersController().PauseOrResumeScheduler(name, t); err != nil { + if err = sc.PauseOrResumeScheduler(name, t); err != nil { if t == 0 { log.Error("can not resume scheduler", zap.String("scheduler-name", name), errs.ZapError(err)) } else { diff --git a/pkg/schedule/schedulers/scheduler.go b/pkg/schedule/schedulers/scheduler.go index ba02c280d40..9262f7d0a65 100644 --- a/pkg/schedule/schedulers/scheduler.go +++ b/pkg/schedule/schedulers/scheduler.go @@ -66,6 +66,24 @@ func DecodeConfig(data []byte, v interface{}) error { return nil } +// ToPayload returns the payload of config. +func ToPayload(sches, configs []string) map[string]interface{} { + payload := make(map[string]interface{}) + for i, sche := range sches { + var config interface{} + err := DecodeConfig([]byte(configs[i]), &config) + if err != nil { + log.Error("failed to decode scheduler config", + zap.String("config", configs[i]), + zap.String("scheduler", sche), + errs.ZapError(err)) + continue + } + payload[sche] = config + } + return payload +} + // ConfigDecoder used to decode the config. type ConfigDecoder func(v interface{}) error diff --git a/pkg/schedule/schedulers/scheduler_controller.go b/pkg/schedule/schedulers/scheduler_controller.go index 0f2264392aa..79c8cbfbc92 100644 --- a/pkg/schedule/schedulers/scheduler_controller.go +++ b/pkg/schedule/schedulers/scheduler_controller.go @@ -418,6 +418,11 @@ func (c *Controller) CheckTransferWitnessLeader(region *core.RegionInfo) { } } +// GetAllSchedulerConfigs returns all scheduler configs. +func (c *Controller) GetAllSchedulerConfigs() ([]string, []string, error) { + return c.storage.LoadAllSchedulerConfigs() +} + // ScheduleController is used to manage a scheduler. 
type ScheduleController struct { Scheduler diff --git a/pkg/schedule/schedulers/shuffle_region_config.go b/pkg/schedule/schedulers/shuffle_region_config.go index f503a6f67c7..7d04879c992 100644 --- a/pkg/schedule/schedulers/shuffle_region_config.go +++ b/pkg/schedule/schedulers/shuffle_region_config.go @@ -69,6 +69,7 @@ func (conf *shuffleRegionSchedulerConfig) IsRoleAllow(role string) bool { func (conf *shuffleRegionSchedulerConfig) ServeHTTP(w http.ResponseWriter, r *http.Request) { router := mux.NewRouter() + router.HandleFunc("/list", conf.handleGetRoles).Methods(http.MethodGet) router.HandleFunc("/roles", conf.handleGetRoles).Methods(http.MethodGet) router.HandleFunc("/roles", conf.handleSetRoles).Methods(http.MethodPost) router.ServeHTTP(w, r) diff --git a/pkg/utils/testutil/api_check.go b/pkg/utils/testutil/api_check.go index 84af97f828d..4ce5e859f3f 100644 --- a/pkg/utils/testutil/api_check.go +++ b/pkg/utils/testutil/api_check.go @@ -37,29 +37,29 @@ func StatusOK(re *require.Assertions) func([]byte, int, http.Header) { // StatusNotOK is used to check whether http response code is not equal http.StatusOK. func StatusNotOK(re *require.Assertions) func([]byte, int, http.Header) { - return func(_ []byte, i int, _ http.Header) { - re.NotEqual(http.StatusOK, i) + return func(resp []byte, i int, _ http.Header) { + re.NotEqual(http.StatusOK, i, "resp: "+string(resp)) } } // ExtractJSON is used to check whether given data can be extracted successfully. func ExtractJSON(re *require.Assertions, data interface{}) func([]byte, int, http.Header) { - return func(res []byte, _ int, _ http.Header) { - re.NoError(json.Unmarshal(res, data)) + return func(resp []byte, _ int, _ http.Header) { + re.NoError(json.Unmarshal(resp, data), "resp: "+string(resp)) } } // StringContain is used to check whether response context contains given string. func StringContain(re *require.Assertions, sub string) func([]byte, int, http.Header) { - return func(res []byte, _ int, _ http.Header) { - re.Contains(string(res), sub) + return func(resp []byte, _ int, _ http.Header) { + re.Contains(string(resp), sub, "resp: "+string(resp)) } } // StringEqual is used to check whether response context equal given string. 
func StringEqual(re *require.Assertions, str string) func([]byte, int, http.Header) { - return func(res []byte, _ int, _ http.Header) { - re.Contains(string(res), str) + return func(resp []byte, _ int, _ http.Header) { + re.Contains(string(resp), str, "resp: "+string(resp)) } } diff --git a/server/api/server.go b/server/api/server.go index ee301ea54c8..ae877b8407c 100644 --- a/server/api/server.go +++ b/server/api/server.go @@ -52,6 +52,7 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP // "/schedulers", http.MethodGet // "/schedulers/{name}", http.MethodPost // "/schedulers/diagnostic/{name}", http.MethodGet + // "/scheduler-config", http.MethodGet // "/hotspot/regions/read", http.MethodGet // "/hotspot/regions/write", http.MethodGet // "/hotspot/regions/history", http.MethodGet @@ -90,6 +91,11 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP scheapi.APIPathPrefix+"/schedulers", mcs.SchedulingServiceName, []string{http.MethodGet}), + serverapi.MicroserviceRedirectRule( + prefix+"/scheduler-config", + scheapi.APIPathPrefix+"/schedulers/config", + mcs.SchedulingServiceName, + []string{http.MethodGet}), serverapi.MicroserviceRedirectRule( prefix+"/schedulers/", // Note: this means "/schedulers/{name}" scheapi.APIPathPrefix+"/schedulers", diff --git a/server/config/persist_options.go b/server/config/persist_options.go index c0a0ebf5c47..49a44449a22 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -789,11 +789,10 @@ func (o *PersistOptions) Persist(storage endpoint.ConfigStorage) error { }, StoreConfig: *o.GetStoreConfig(), } - err := storage.SaveConfig(cfg) failpoint.Inject("persistFail", func() { - err = errors.New("fail to persist") + failpoint.Return(errors.New("fail to persist")) }) - return err + return storage.SaveConfig(cfg) } // Reload reloads the configuration from the storage. 
diff --git a/server/server.go b/server/server.go index 160609e37a7..9cd7f18578e 100644 --- a/server/server.go +++ b/server/server.go @@ -948,20 +948,7 @@ func (s *Server) GetConfig() *config.Config { if err != nil { return cfg } - payload := make(map[string]interface{}) - for i, sche := range sches { - var config interface{} - err := schedulers.DecodeConfig([]byte(configs[i]), &config) - if err != nil { - log.Error("failed to decode scheduler config", - zap.String("config", configs[i]), - zap.String("scheduler", sche), - errs.ZapError(err)) - continue - } - payload[sche] = config - } - cfg.Schedule.SchedulersPayload = payload + cfg.Schedule.SchedulersPayload = schedulers.ToPayload(sches, configs) return cfg } diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go index d6028204325..3793c09d883 100644 --- a/tests/integrations/mcs/scheduling/api_test.go +++ b/tests/integrations/mcs/scheduling/api_test.go @@ -172,6 +172,9 @@ func (suite *apiTestSuite) TestAPIForward() { // "/schedulers", http.MethodGet // "/schedulers/{name}", http.MethodPost // "/schedulers/diagnostic/{name}", http.MethodGet + // "/scheduler-config/", http.MethodGet + // "/scheduler-config/{name}/list", http.MethodGet + // "/scheduler-config/{name}/roles", http.MethodGet // Should not redirect: // "/schedulers", http.MethodPost // "/schedulers/{name}", http.MethodDelete @@ -191,6 +194,24 @@ func (suite *apiTestSuite) TestAPIForward() { testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) suite.NoError(err) + err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "scheduler-config"), &resp, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + suite.NoError(err) + re.Contains(resp, "balance-leader-scheduler") + re.Contains(resp, "balance-witness-scheduler") + re.Contains(resp, "balance-hot-region-scheduler") + + schedulers := []string{ + "balance-leader-scheduler", + "balance-witness-scheduler", + "balance-hot-region-scheduler", + } + for _, schedulerName := range schedulers { + err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s/%s/%s", urlPrefix, "scheduler-config", schedulerName, "list"), &resp, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + suite.NoError(err) + } + err = testutil.CheckPostJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "schedulers"), pauseArgs, testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) re.NoError(err) diff --git a/tests/pdctl/scheduler/scheduler_test.go b/tests/pdctl/scheduler/scheduler_test.go index fe58e304791..3554b828269 100644 --- a/tests/pdctl/scheduler/scheduler_test.go +++ b/tests/pdctl/scheduler/scheduler_test.go @@ -17,6 +17,8 @@ package scheduler_test import ( "context" "encoding/json" + "reflect" + "strings" "testing" "time" @@ -43,8 +45,7 @@ func TestSchedulerTestSuite(t *testing.T) { func (suite *schedulerTestSuite) TestScheduler() { env := tests.NewSchedulingTestEnvironment(suite.T()) - // Fixme: use RunTestInTwoModes when sync deleted scheduler is supported. 
- env.RunTestInPDMode(suite.checkScheduler) + env.RunTestInTwoModes(suite.checkScheduler) env.RunTestInTwoModes(suite.checkSchedulerDiagnostic) } @@ -86,17 +87,27 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { if args != nil { mustExec(re, cmd, args, nil) } - var schedulers []string - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, &schedulers) - for _, scheduler := range schedulers { - re.True(expected[scheduler]) - } + testutil.Eventually(re, func() bool { + var schedulers []string + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, &schedulers) + if len(schedulers) != len(expected) { + return false + } + for _, scheduler := range schedulers { + if _, ok := expected[scheduler]; !ok { + return false + } + } + return true + }) } checkSchedulerConfigCommand := func(expectedConfig map[string]interface{}, schedulerName string) { - configInfo := make(map[string]interface{}) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", schedulerName}, &configInfo) - re.Equal(expectedConfig, configInfo) + testutil.Eventually(re, func() bool { + configInfo := make(map[string]interface{}) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", schedulerName}, &configInfo) + return reflect.DeepEqual(expectedConfig, configInfo) + }) } leaderServer := cluster.GetLeaderServer() @@ -106,7 +117,6 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { // note: because pdqsort is a unstable sort algorithm, set ApproximateSize for this region. tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetApproximateSize(10)) - time.Sleep(3 * time.Second) // scheduler show command expected := map[string]bool{ @@ -120,7 +130,6 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { // scheduler delete command args := []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"} - time.Sleep(10 * time.Second) expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, @@ -160,8 +169,11 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { checkSchedulerCommand(args, expected) // check update success - expectedConfig["store-id-ranges"] = map[string]interface{}{"2": []interface{}{map[string]interface{}{"end-key": "", "start-key": ""}}, "3": []interface{}{map[string]interface{}{"end-key": "", "start-key": ""}}} - checkSchedulerConfigCommand(expectedConfig, schedulers[idx]) + // FIXME: remove this check after scheduler config is updated + if cluster.GetSchedulingPrimaryServer() == nil && schedulers[idx] == "grant-leader-scheduler" { + expectedConfig["store-id-ranges"] = map[string]interface{}{"2": []interface{}{map[string]interface{}{"end-key": "", "start-key": ""}}, "3": []interface{}{map[string]interface{}{"end-key": "", "start-key": ""}}} + checkSchedulerConfigCommand(expectedConfig, schedulers[idx]) + } // scheduler delete command args = []string{"-u", pdAddr, "scheduler", "remove", schedulers[idx]} @@ -261,26 +273,33 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler", "set", "2", "1,2,3"}, nil) expected3["store-leader-id"] = float64(2) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler"}, &conf3) - re.Equal(expected3, conf3) + // FIXME: remove this check after scheduler config is updated + if cluster.GetSchedulingPrimaryServer() == 
nil { // "grant-hot-region-scheduler" + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler"}, &conf3) + re.Equal(expected3, conf3) + } - // test balance region config + // test remove and add scheduler echo := mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-region-scheduler"}, nil) re.Contains(echo, "Success!") echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) re.Contains(echo, "Success!") echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) re.NotContains(echo, "Success!") + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-region-scheduler"}, nil) + re.Contains(echo, "Success!") echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "evict-leader-scheduler", "1"}, nil) re.Contains(echo, "Success!") echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "evict-leader-scheduler-1"}, nil) re.Contains(echo, "Success!") echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "evict-leader-scheduler-1"}, nil) re.Contains(echo, "404") + testutil.Eventually(re, func() bool { // wait for removed scheduler to be synced to scheduling server. + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-leader-scheduler"}, nil) + return strings.Contains(echo, "[404] scheduler not found") + }) // test hot region config - echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-leader-scheduler"}, nil) - re.Contains(echo, "[404] scheduler not found") expected1 := map[string]interface{}{ "min-hot-byte-rate": float64(100), "min-hot-key-rate": float64(10), @@ -311,74 +330,77 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "src-tolerance-ratio", "1.02"}, nil) expected1["src-tolerance-ratio"] = 1.02 var conf1 map[string]interface{} - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "byte,key"}, nil) - expected1["read-priorities"] = []interface{}{"byte", "key"} - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,byte"}, nil) - expected1["read-priorities"] = []interface{}{"key", "byte"} - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "foo,bar"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", 
""}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,key"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "byte,byte"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,key,byte"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - - // write-priorities is divided into write-leader-priorities and write-peer-priorities - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "write-priorities", "key,byte"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v0"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - expected1["rank-formula-version"] = "v2" - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v2"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - expected1["rank-formula-version"] = "v1" - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v1"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - - expected1["forbid-rw-type"] = "read" - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "forbid-rw-type", "read"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(expected1, conf1) - - // test compatibility - re.Equal("2.0.0", leaderServer.GetClusterVersion().String()) - for _, store := range stores { - version := versioninfo.HotScheduleWithQuery - store.Version = versioninfo.MinSupportedVersion(version).String() - tests.MustPutStore(re, cluster, store) + // FIXME: remove this check after scheduler config is updated + if cluster.GetSchedulingPrimaryServer() == nil { // "balance-hot-region-scheduler" + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "byte,key"}, nil) + expected1["read-priorities"] = []interface{}{"byte", "key"} + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", 
"balance-hot-region-scheduler", "set", "read-priorities", "key"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,byte"}, nil) + expected1["read-priorities"] = []interface{}{"key", "byte"} + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "foo,bar"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", ""}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,key"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "byte,byte"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,key,byte"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + + // write-priorities is divided into write-leader-priorities and write-peer-priorities + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "write-priorities", "key,byte"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v0"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + expected1["rank-formula-version"] = "v2" + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v2"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + expected1["rank-formula-version"] = "v1" + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v1"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + + expected1["forbid-rw-type"] = "read" + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "forbid-rw-type", "read"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(expected1, conf1) + + // test compatibility + re.Equal("2.0.0", 
leaderServer.GetClusterVersion().String()) + for _, store := range stores { + version := versioninfo.HotScheduleWithQuery + store.Version = versioninfo.MinSupportedVersion(version).String() + tests.MustPutStore(re, cluster, store) + } + re.Equal("5.2.0", leaderServer.GetClusterVersion().String()) + // After upgrading, we should not use query. + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(conf1["read-priorities"], []interface{}{"key", "byte"}) + // cannot set qps as write-peer-priorities + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "write-peer-priorities", "query,byte"}, nil) + re.Contains(echo, "query is not allowed to be set in priorities for write-peer-priorities") + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + re.Equal(conf1["write-peer-priorities"], []interface{}{"byte", "key"}) } - re.Equal("5.2.0", leaderServer.GetClusterVersion().String()) - // After upgrading, we should not use query. - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(conf1["read-priorities"], []interface{}{"key", "byte"}) - // cannot set qps as write-peer-priorities - echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "write-peer-priorities", "query,byte"}, nil) - re.Contains(echo, "query is not allowed to be set in priorities for write-peer-priorities") - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) - re.Equal(conf1["write-peer-priorities"], []interface{}{"byte", "key"}) // test remove and add echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-hot-region-scheduler"}, nil) @@ -412,24 +434,33 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { for _, schedulerName := range evictSlownessSchedulers { echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", schedulerName}, nil) re.Contains(echo, "Success!") - echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, nil) - re.Contains(echo, schedulerName) + testutil.Eventually(re, func() bool { + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, nil) + return strings.Contains(echo, schedulerName) + }) echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", schedulerName, "set", "recovery-duration", "100"}, nil) re.Contains(echo, "Success!") conf = make(map[string]interface{}) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", schedulerName, "show"}, &conf) - re.Equal(100., conf["recovery-duration"]) + // FIXME: remove this check after scheduler config is updated + if cluster.GetSchedulingPrimaryServer() == nil && schedulerName == "evict-slow-store-scheduler" { + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", schedulerName, "show"}, &conf) + re.Equal(100., conf["recovery-duration"]) + } echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", schedulerName}, nil) re.Contains(echo, "Success!") - echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, nil) - re.NotContains(echo, schedulerName) + testutil.Eventually(re, func() bool { + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, nil) + return !strings.Contains(echo, schedulerName) + }) } // test show scheduler with paused and disabled status. 
checkSchedulerWithStatusCommand := func(status string, expected []string) { - var schedulers []string - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show", "--status", status}, &schedulers) - re.Equal(expected, schedulers) + testutil.Eventually(re, func() bool { + var schedulers []string + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show", "--status", status}, &schedulers) + return reflect.DeepEqual(expected, schedulers) + }) } mustUsage([]string{"-u", pdAddr, "scheduler", "pause", "balance-leader-scheduler"}) @@ -504,18 +535,14 @@ func (suite *schedulerTestSuite) checkSchedulerDiagnostic(cluster *tests.TestClu // note: because pdqsort is a unstable sort algorithm, set ApproximateSize for this region. tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetApproximateSize(10)) - time.Sleep(3 * time.Second) echo := mustExec(re, cmd, []string{"-u", pdAddr, "config", "set", "enable-diagnostic", "true"}, nil) re.Contains(echo, "Success!") checkSchedulerDescribeCommand("balance-region-scheduler", "pending", "1 store(s) RegionNotMatchRule; ") // scheduler delete command - // Fixme: use RunTestInTwoModes when sync deleted scheduler is supported. - if sche := cluster.GetSchedulingPrimaryServer(); sche == nil { - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) - checkSchedulerDescribeCommand("balance-region-scheduler", "disabled", "") - } + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) + checkSchedulerDescribeCommand("balance-region-scheduler", "disabled", "") mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "pause", "balance-leader-scheduler", "60"}, nil) mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "resume", "balance-leader-scheduler"}, nil) @@ -528,7 +555,7 @@ func mustExec(re *require.Assertions, cmd *cobra.Command, args []string, v inter if v == nil { return string(output) } - re.NoError(json.Unmarshal(output, v)) + re.NoError(json.Unmarshal(output, v), string(output)) return "" } diff --git a/tests/server/api/scheduler_test.go b/tests/server/api/scheduler_test.go index 95c4d936a8c..9db94e8562d 100644 --- a/tests/server/api/scheduler_test.go +++ b/tests/server/api/scheduler_test.go @@ -23,8 +23,10 @@ import ( "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/metapb" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" sc "github.com/tikv/pd/pkg/schedule/config" + "github.com/tikv/pd/pkg/slice" tu "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server" "github.com/tikv/pd/tests" @@ -41,13 +43,12 @@ func TestScheduleTestSuite(t *testing.T) { } func (suite *scheduleTestSuite) TestScheduler() { - // Fixme: use RunTestInTwoModes when sync deleted scheduler is supported. 
env := tests.NewSchedulingTestEnvironment(suite.T()) - env.RunTestInPDMode(suite.checkOriginAPI) + env.RunTestInTwoModes(suite.checkOriginAPI) env = tests.NewSchedulingTestEnvironment(suite.T()) - env.RunTestInPDMode(suite.checkAPI) + env.RunTestInTwoModes(suite.checkAPI) env = tests.NewSchedulingTestEnvironment(suite.T()) - env.RunTestInPDMode(suite.checkDisable) + env.RunTestInTwoModes(suite.checkDisable) } func (suite *scheduleTestSuite) checkOriginAPI(cluster *tests.TestCluster) { @@ -71,7 +72,7 @@ func (suite *scheduleTestSuite) checkOriginAPI(cluster *tests.TestCluster) { re := suite.Require() suite.NoError(tu.CheckPostJSON(testDialClient, urlPrefix, body, tu.StatusOK(re))) - suite.Len(suite.getSchedulers(urlPrefix), 1) + suite.assertSchedulerExists(re, urlPrefix, "evict-leader-scheduler") resp := make(map[string]interface{}) listURL := fmt.Sprintf("%s%s%s/%s/list", leaderAddr, apiPrefix, server.SchedulerConfigHandlerPath, "evict-leader-scheduler") suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) @@ -83,20 +84,20 @@ func (suite *scheduleTestSuite) checkOriginAPI(cluster *tests.TestCluster) { suite.NoError(err) suite.NoError(failpoint.Enable("github.com/tikv/pd/pkg/schedule/schedulers/persistFail", "return(true)")) suite.NoError(tu.CheckPostJSON(testDialClient, urlPrefix, body, tu.StatusNotOK(re))) - suite.Len(suite.getSchedulers(urlPrefix), 1) + suite.assertSchedulerExists(re, urlPrefix, "evict-leader-scheduler") resp = make(map[string]interface{}) suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) suite.Len(resp["store-id-ranges"], 1) suite.NoError(failpoint.Disable("github.com/tikv/pd/pkg/schedule/schedulers/persistFail")) suite.NoError(tu.CheckPostJSON(testDialClient, urlPrefix, body, tu.StatusOK(re))) - suite.Len(suite.getSchedulers(urlPrefix), 1) + suite.assertSchedulerExists(re, urlPrefix, "evict-leader-scheduler") resp = make(map[string]interface{}) suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) suite.Len(resp["store-id-ranges"], 2) deleteURL := fmt.Sprintf("%s/%s", urlPrefix, "evict-leader-scheduler-1") err = tu.CheckDelete(testDialClient, deleteURL, tu.StatusOK(re)) suite.NoError(err) - suite.Len(suite.getSchedulers(urlPrefix), 1) + suite.assertSchedulerExists(re, urlPrefix, "evict-leader-scheduler") resp1 := make(map[string]interface{}) suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp1)) suite.Len(resp1["store-id-ranges"], 1) @@ -104,11 +105,11 @@ func (suite *scheduleTestSuite) checkOriginAPI(cluster *tests.TestCluster) { suite.NoError(failpoint.Enable("github.com/tikv/pd/server/config/persistFail", "return(true)")) err = tu.CheckDelete(testDialClient, deleteURL, tu.Status(re, http.StatusInternalServerError)) suite.NoError(err) - suite.Len(suite.getSchedulers(urlPrefix), 1) + suite.assertSchedulerExists(re, urlPrefix, "evict-leader-scheduler") suite.NoError(failpoint.Disable("github.com/tikv/pd/server/config/persistFail")) err = tu.CheckDelete(testDialClient, deleteURL, tu.StatusOK(re)) suite.NoError(err) - suite.Empty(suite.getSchedulers(urlPrefix)) + suite.assertNoScheduler(re, urlPrefix, "evict-leader-scheduler") suite.NoError(tu.CheckGetJSON(testDialClient, listURL, nil, tu.Status(re, http.StatusNotFound))) err = tu.CheckDelete(testDialClient, deleteURL, tu.Status(re, http.StatusNotFound)) suite.NoError(err) @@ -230,23 +231,27 @@ func (suite *scheduleTestSuite) checkAPI(cluster *tests.TestCluster) { suite.NoError(tu.CheckPostJSON(testDialClient, updateURL, body, tu.StatusOK(re))) resp = 
make(map[string]interface{}) suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) - for key := range expectMap { - suite.Equal(expectMap[key], resp[key], "key %s", key) + // FIXME: remove this check after scheduler config is updated + if cluster.GetSchedulingPrimaryServer() == nil { // "balance-hot-region-scheduler" + for key := range expectMap { + suite.Equal(expectMap[key], resp[key], "key %s", key) + } + + // update again + err = tu.CheckPostJSON(testDialClient, updateURL, body, + tu.StatusOK(re), + tu.StringEqual(re, "Config is the same with origin, so do nothing.")) + suite.NoError(err) + // config item not found + dataMap = map[string]interface{}{} + dataMap["error"] = 3 + body, err = json.Marshal(dataMap) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, updateURL, body, + tu.Status(re, http.StatusBadRequest), + tu.StringEqual(re, "Config item is not found.")) + suite.NoError(err) } - // update again - err = tu.CheckPostJSON(testDialClient, updateURL, body, - tu.StatusOK(re), - tu.StringEqual(re, "Config is the same with origin, so do nothing.")) - suite.NoError(err) - // config item not found - dataMap = map[string]interface{}{} - dataMap["error"] = 3 - body, err = json.Marshal(dataMap) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, updateURL, body, - tu.Status(re, http.StatusBadRequest), - tu.StringEqual(re, "Config item is not found.")) - suite.NoError(err) }, }, { @@ -468,6 +473,7 @@ func (suite *scheduleTestSuite) checkAPI(cluster *tests.TestCluster) { testCase.extraTestFunc(testCase.createdName) } suite.deleteScheduler(urlPrefix, testCase.createdName) + suite.assertNoScheduler(re, urlPrefix, testCase.createdName) } // test pause and resume all schedulers. @@ -482,6 +488,7 @@ func (suite *scheduleTestSuite) checkAPI(cluster *tests.TestCluster) { body, err := json.Marshal(input) suite.NoError(err) suite.addScheduler(urlPrefix, body) + suite.assertSchedulerExists(re, urlPrefix, testCase.createdName) // wait for scheduler to be synced. 
if testCase.extraTestFunc != nil { testCase.extraTestFunc(testCase.createdName) } @@ -545,6 +552,7 @@ func (suite *scheduleTestSuite) checkAPI(cluster *tests.TestCluster) { createdName = testCase.name } suite.deleteScheduler(urlPrefix, createdName) + suite.assertNoScheduler(re, urlPrefix, createdName) } } @@ -581,16 +589,8 @@ func (suite *scheduleTestSuite) checkDisable(cluster *tests.TestCluster) { err = tu.CheckPostJSON(testDialClient, u, body, tu.StatusOK(re)) suite.NoError(err) - var schedulers []string - err = tu.ReadGetJSON(re, testDialClient, urlPrefix, &schedulers) - suite.NoError(err) - suite.Len(schedulers, 1) - suite.Equal(name, schedulers[0]) - - err = tu.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s?status=disabled", urlPrefix), &schedulers) - suite.NoError(err) - suite.Len(schedulers, 1) - suite.Equal(name, schedulers[0]) + suite.assertNoScheduler(re, urlPrefix, name) + suite.assertSchedulerExists(re, fmt.Sprintf("%s?status=disabled", urlPrefix), name) // reset schedule config scheduleConfig.Schedulers = originSchedulers @@ -600,6 +600,7 @@ func (suite *scheduleTestSuite) checkDisable(cluster *tests.TestCluster) { suite.NoError(err) suite.deleteScheduler(urlPrefix, name) + suite.assertNoScheduler(re, urlPrefix, name) } func (suite *scheduleTestSuite) addScheduler(urlPrefix string, body []byte) { @@ -614,12 +615,17 @@ func (suite *scheduleTestSuite) deleteScheduler(urlPrefix string, createdName st } func (suite *scheduleTestSuite) testPauseOrResume(urlPrefix string, name, createdName string, body []byte) { + re := suite.Require() if createdName == "" { createdName = name } - re := suite.Require() - err := tu.CheckPostJSON(testDialClient, urlPrefix, body, tu.StatusOK(re)) - suite.NoError(err) + var schedulers []string + tu.ReadGetJSON(suite.Require(), testDialClient, urlPrefix, &schedulers) + if !slice.Contains(schedulers, createdName) { + err := tu.CheckPostJSON(testDialClient, urlPrefix, body, tu.StatusOK(re)) + re.NoError(err) + } + suite.assertSchedulerExists(re, urlPrefix, createdName) // wait for scheduler to be synced. // test pause. 
input := make(map[string]interface{}) @@ -655,9 +661,20 @@ func (suite *scheduleTestSuite) testPauseOrResume(urlPrefix string, name, create suite.False(isPaused) } -func (suite *scheduleTestSuite) getSchedulers(urlPrefix string) (resp []string) { - tu.ReadGetJSON(suite.Require(), testDialClient, urlPrefix, &resp) - return +func (suite *scheduleTestSuite) assertSchedulerExists(re *require.Assertions, urlPrefix string, scheduler string) { + var schedulers []string + tu.Eventually(re, func() bool { + tu.ReadGetJSON(suite.Require(), testDialClient, urlPrefix, &schedulers) + return slice.Contains(schedulers, scheduler) + }) +} + +func (suite *scheduleTestSuite) assertNoScheduler(re *require.Assertions, urlPrefix string, scheduler string) { + var schedulers []string + tu.Eventually(re, func() bool { + tu.ReadGetJSON(suite.Require(), testDialClient, urlPrefix, &schedulers) + return !slice.Contains(schedulers, scheduler) + }) } func (suite *scheduleTestSuite) isSchedulerPaused(urlPrefix, name string) bool { diff --git a/tools/pd-ctl/pdctl/command/scheduler.go b/tools/pd-ctl/pdctl/command/scheduler.go index 4349735f06d..526ff2646dc 100644 --- a/tools/pd-ctl/pdctl/command/scheduler.go +++ b/tools/pd-ctl/pdctl/command/scheduler.go @@ -745,11 +745,17 @@ func showShuffleRegionSchedulerRolesCommandFunc(cmd *cobra.Command, args []strin if p == "show-roles" { p = cmd.Parent().Name() } - path := path.Join(schedulerConfigPrefix, p, "roles") - r, err := doRequest(cmd, path, http.MethodGet, http.Header{}) + url := path.Join(schedulerConfigPrefix, p, "list") + r, err := doRequest(cmd, url, http.MethodGet, http.Header{}) if err != nil { - cmd.Println(err) - return + // try to use old api + var err2 error + url := path.Join(schedulerConfigPrefix, p, "roles") + r, err2 = doRequest(cmd, url, http.MethodGet, http.Header{}) + if err2 != nil { + cmd.Println(err, err2) + return + } } cmd.Println(r) } From a1a1eea8dafd7918d583378790a4bb6c39a21f97 Mon Sep 17 00:00:00 2001 From: Yongbo Jiang Date: Wed, 1 Nov 2023 16:42:37 +0800 Subject: [PATCH 04/20] resourcemanager: change param name (#7293) ref tikv/pd#4399 Signed-off-by: Cabinfever_B --- .../resourcemanager/server/resource_group.go | 4 +- .../resourcemanager/server/token_buckets.go | 42 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pkg/mcs/resourcemanager/server/resource_group.go b/pkg/mcs/resourcemanager/server/resource_group.go index 863cfd19026..fc3a58cab51 100644 --- a/pkg/mcs/resourcemanager/server/resource_group.go +++ b/pkg/mcs/resourcemanager/server/resource_group.go @@ -138,7 +138,7 @@ func FromProtoResourceGroup(group *rmpb.ResourceGroup) *ResourceGroup { // RequestRU requests the RU of the resource group. 
func (rg *ResourceGroup) RequestRU( now time.Time, - neededTokens float64, + requiredToken float64, targetPeriodMs, clientUniqueID uint64, ) *rmpb.GrantedRUTokenBucket { rg.Lock() @@ -147,7 +147,7 @@ func (rg *ResourceGroup) RequestRU( if rg.RUSettings == nil || rg.RUSettings.RU.Settings == nil { return nil } - tb, trickleTimeMs := rg.RUSettings.RU.request(now, neededTokens, targetPeriodMs, clientUniqueID) + tb, trickleTimeMs := rg.RUSettings.RU.request(now, requiredToken, targetPeriodMs, clientUniqueID) return &rmpb.GrantedRUTokenBucket{GrantedTokens: tb, TrickleTimeMs: trickleTimeMs} } diff --git a/pkg/mcs/resourcemanager/server/token_buckets.go b/pkg/mcs/resourcemanager/server/token_buckets.go index 5efab52fe68..a0acba3b54d 100644 --- a/pkg/mcs/resourcemanager/server/token_buckets.go +++ b/pkg/mcs/resourcemanager/server/token_buckets.go @@ -268,7 +268,7 @@ func (gtb *GroupTokenBucket) init(now time.Time, clientID uint64) { } // updateTokens updates the tokens and settings. -func (gtb *GroupTokenBucket) updateTokens(now time.Time, burstLimit int64, clientUniqueID uint64, consumptionToken float64) { +func (gtb *GroupTokenBucket) updateTokens(now time.Time, burstLimit int64, clientUniqueID uint64, requiredToken float64) { var elapseTokens float64 if !gtb.Initialized { gtb.init(now, clientUniqueID) @@ -288,21 +288,21 @@ func (gtb *GroupTokenBucket) updateTokens(now time.Time, burstLimit int64, clien gtb.Tokens = burst } // Balance each slots. - gtb.balanceSlotTokens(clientUniqueID, gtb.Settings, consumptionToken, elapseTokens) + gtb.balanceSlotTokens(clientUniqueID, gtb.Settings, requiredToken, elapseTokens) } // request requests tokens from the corresponding slot. func (gtb *GroupTokenBucket) request(now time.Time, - neededTokens float64, + requiredToken float64, targetPeriodMs, clientUniqueID uint64, ) (*rmpb.TokenBucket, int64) { burstLimit := gtb.Settings.GetBurstLimit() - gtb.updateTokens(now, burstLimit, clientUniqueID, neededTokens) + gtb.updateTokens(now, burstLimit, clientUniqueID, requiredToken) slot, ok := gtb.tokenSlots[clientUniqueID] if !ok { return &rmpb.TokenBucket{Settings: &rmpb.TokenLimitSettings{BurstLimit: burstLimit}}, 0 } - res, trickleDuration := slot.assignSlotTokens(neededTokens, targetPeriodMs) + res, trickleDuration := slot.assignSlotTokens(requiredToken, targetPeriodMs) // Update bucket to record all tokens. gtb.Tokens -= slot.lastTokenCapacity - slot.tokenCapacity slot.lastTokenCapacity = slot.tokenCapacity @@ -310,24 +310,24 @@ func (gtb *GroupTokenBucket) request(now time.Time, return res, trickleDuration } -func (ts *TokenSlot) assignSlotTokens(neededTokens float64, targetPeriodMs uint64) (*rmpb.TokenBucket, int64) { +func (ts *TokenSlot) assignSlotTokens(requiredToken float64, targetPeriodMs uint64) (*rmpb.TokenBucket, int64) { var res rmpb.TokenBucket burstLimit := ts.settings.GetBurstLimit() res.Settings = &rmpb.TokenLimitSettings{BurstLimit: burstLimit} // If BurstLimit < 0, just return. if burstLimit < 0 { - res.Tokens = neededTokens + res.Tokens = requiredToken return &res, 0 } // FillRate is used for the token server unavailable in abnormal situation. - if neededTokens <= 0 { + if requiredToken <= 0 { return &res, 0 } // If the current tokens can directly meet the requirement, returns the need token. 
- if ts.tokenCapacity >= neededTokens { - ts.tokenCapacity -= neededTokens + if ts.tokenCapacity >= requiredToken { + ts.tokenCapacity -= requiredToken // granted the total request tokens - res.Tokens = neededTokens + res.Tokens = requiredToken return &res, 0 } @@ -336,7 +336,7 @@ func (ts *TokenSlot) assignSlotTokens(neededTokens float64, targetPeriodMs uint6 hasRemaining := false if ts.tokenCapacity > 0 { grantedTokens = ts.tokenCapacity - neededTokens -= grantedTokens + requiredToken -= grantedTokens ts.tokenCapacity = 0 hasRemaining = true } @@ -373,36 +373,36 @@ func (ts *TokenSlot) assignSlotTokens(neededTokens float64, targetPeriodMs uint6 for i := 1; i < loanCoefficient; i++ { p[i] = float64(loanCoefficient-i)*float64(fillRate)*targetPeriodTimeSec + p[i-1] } - for i := 0; i < loanCoefficient && neededTokens > 0 && trickleTime < targetPeriodTimeSec; i++ { + for i := 0; i < loanCoefficient && requiredToken > 0 && trickleTime < targetPeriodTimeSec; i++ { loan := -ts.tokenCapacity if loan >= p[i] { continue } roundReserveTokens := p[i] - loan fillRate := float64(loanCoefficient-i) * float64(fillRate) - if roundReserveTokens > neededTokens { - ts.tokenCapacity -= neededTokens - grantedTokens += neededTokens + if roundReserveTokens > requiredToken { + ts.tokenCapacity -= requiredToken + grantedTokens += requiredToken trickleTime += grantedTokens / fillRate - neededTokens = 0 + requiredToken = 0 } else { roundReserveTime := roundReserveTokens / fillRate if roundReserveTime+trickleTime >= targetPeriodTimeSec { roundTokens := (targetPeriodTimeSec - trickleTime) * fillRate - neededTokens -= roundTokens + requiredToken -= roundTokens ts.tokenCapacity -= roundTokens grantedTokens += roundTokens trickleTime = targetPeriodTimeSec } else { grantedTokens += roundReserveTokens - neededTokens -= roundReserveTokens + requiredToken -= roundReserveTokens ts.tokenCapacity -= roundReserveTokens trickleTime += roundReserveTime } } } - if neededTokens > 0 && grantedTokens < defaultReserveRatio*float64(fillRate)*targetPeriodTimeSec { - reservedTokens := math.Min(neededTokens+grantedTokens, defaultReserveRatio*float64(fillRate)*targetPeriodTimeSec) + if requiredToken > 0 && grantedTokens < defaultReserveRatio*float64(fillRate)*targetPeriodTimeSec { + reservedTokens := math.Min(requiredToken+grantedTokens, defaultReserveRatio*float64(fillRate)*targetPeriodTimeSec) ts.tokenCapacity -= reservedTokens - grantedTokens grantedTokens = reservedTokens } From 01fb56b4f1eb0313e41c25ef08c1b543bd153d0e Mon Sep 17 00:00:00 2001 From: Yexiang Zhang Date: Thu, 2 Nov 2023 17:38:10 +0800 Subject: [PATCH 05/20] dashboard: update hotfix version (#7303) close tikv/pd#7302 Signed-off-by: mornyx --- go.mod | 2 +- go.sum | 4 ++-- tests/integrations/client/go.mod | 2 +- tests/integrations/client/go.sum | 4 ++-- tests/integrations/mcs/go.mod | 2 +- tests/integrations/mcs/go.sum | 4 ++-- tests/integrations/tso/go.mod | 2 +- tests/integrations/tso/go.sum | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index 86f56089347..e8da2542be2 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ require ( github.com/pingcap/kvproto v0.0.0-20231018065736-c0689aded40c github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 - github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 + github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 github.com/prometheus/client_golang v1.11.1 github.com/prometheus/common v0.26.0 
github.com/sasha-s/go-deadlock v0.2.0 diff --git a/go.sum b/go.sum index 9392644f181..28e210ef1cd 100644 --- a/go.sum +++ b/go.sum @@ -446,8 +446,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 h1:oyrCfNlAWmLlUfEr+7YTSBo29SP/J1N8hnxBt5yUABo= -github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= +github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 h1:xIeaDUq2ItkYMIgpWXAYKC/N3hs8aurfFvvz79lhHYE= +github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= diff --git a/tests/integrations/client/go.mod b/tests/integrations/client/go.mod index e38efbeb438..b9b868cf8e3 100644 --- a/tests/integrations/client/go.mod +++ b/tests/integrations/client/go.mod @@ -119,7 +119,7 @@ require ( github.com/pingcap/errcode v0.3.0 // indirect github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 // indirect + github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/tests/integrations/client/go.sum b/tests/integrations/client/go.sum index c745c4fa518..81fa6fd7b39 100644 --- a/tests/integrations/client/go.sum +++ b/tests/integrations/client/go.sum @@ -410,8 +410,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 h1:oyrCfNlAWmLlUfEr+7YTSBo29SP/J1N8hnxBt5yUABo= -github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= +github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 h1:xIeaDUq2ItkYMIgpWXAYKC/N3hs8aurfFvvz79lhHYE= +github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff 
v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= diff --git a/tests/integrations/mcs/go.mod b/tests/integrations/mcs/go.mod index 000bfdc8312..c2dfdbe96ef 100644 --- a/tests/integrations/mcs/go.mod +++ b/tests/integrations/mcs/go.mod @@ -119,7 +119,7 @@ require ( github.com/pingcap/errcode v0.3.0 // indirect github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 // indirect + github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/tests/integrations/mcs/go.sum b/tests/integrations/mcs/go.sum index 0da75329284..d1b0962ab55 100644 --- a/tests/integrations/mcs/go.sum +++ b/tests/integrations/mcs/go.sum @@ -414,8 +414,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 h1:oyrCfNlAWmLlUfEr+7YTSBo29SP/J1N8hnxBt5yUABo= -github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= +github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 h1:xIeaDUq2ItkYMIgpWXAYKC/N3hs8aurfFvvz79lhHYE= +github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= diff --git a/tests/integrations/tso/go.mod b/tests/integrations/tso/go.mod index f8a5cfac75f..e5131f15d91 100644 --- a/tests/integrations/tso/go.mod +++ b/tests/integrations/tso/go.mod @@ -117,7 +117,7 @@ require ( github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 // indirect + github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/tests/integrations/tso/go.sum b/tests/integrations/tso/go.sum index 63327985f0d..576c3e75765 100644 --- a/tests/integrations/tso/go.sum +++ b/tests/integrations/tso/go.sum @@ -408,8 +408,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 
h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 h1:oyrCfNlAWmLlUfEr+7YTSBo29SP/J1N8hnxBt5yUABo= -github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= +github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 h1:xIeaDUq2ItkYMIgpWXAYKC/N3hs8aurfFvvz79lhHYE= +github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= From 689fcbe2ff081e96ecad2762dd0b89c07364ffc5 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Fri, 3 Nov 2023 10:09:39 +0800 Subject: [PATCH 06/20] checker: replace down check with disconnect check when fixing orphan peer (#7294) close tikv/pd#7249 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/core/store.go | 3 + pkg/schedule/checker/rule_checker.go | 60 +++++--- pkg/schedule/checker/rule_checker_test.go | 176 +++++++++++++++++++++- 3 files changed, 214 insertions(+), 25 deletions(-) diff --git a/pkg/core/store.go b/pkg/core/store.go index 1d3362cac0e..b3c62f45750 100644 --- a/pkg/core/store.go +++ b/pkg/core/store.go @@ -551,6 +551,9 @@ var ( // tikv's store heartbeat for a short time, maybe caused by process restart or // temporary network failure. func (s *StoreInfo) IsDisconnected() bool { + if s == nil { + return true + } return s.DownTime() > storeDisconnectDuration } diff --git a/pkg/schedule/checker/rule_checker.go b/pkg/schedule/checker/rule_checker.go index 7012359ca36..84cafaa871e 100644 --- a/pkg/schedule/checker/rule_checker.go +++ b/pkg/schedule/checker/rule_checker.go @@ -447,7 +447,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg if len(fit.OrphanPeers) == 0 { return nil, nil } - var pinDownPeer *metapb.Peer + isUnhealthyPeer := func(id uint64) bool { for _, downPeer := range region.GetDownPeers() { if downPeer.Peer.GetId() == id { @@ -461,31 +461,41 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg } return false } + + isDisconnectedPeer := func(p *metapb.Peer) bool { + // avoid to meet down store when fix orphan peers, + // Isdisconnected is more strictly than IsUnhealthy. + return c.cluster.GetStore(p.GetStoreId()).IsDisconnected() + } + + checkDownPeer := func(peers []*metapb.Peer) (*metapb.Peer, bool) { + for _, p := range peers { + if isUnhealthyPeer(p.GetId()) { + // make sure is down peer. + if region.GetDownPeer(p.GetId()) != nil { + return p, true + } + return nil, true + } + if isDisconnectedPeer(p) { + return p, true + } + } + return nil, false + } + // remove orphan peers only when all rules are satisfied (count+role) and all peers selected // by RuleFits is not pending or down. + var pinDownPeer *metapb.Peer hasUnhealthyFit := false -loopFits: for _, rf := range fit.RuleFits { if !rf.IsSatisfied() { hasUnhealthyFit = true break } - for _, p := range rf.Peers { - if isUnhealthyPeer(p.GetId()) { - // make sure is down peer. 
- if region.GetDownPeer(p.GetId()) != nil { - pinDownPeer = p - } - hasUnhealthyFit = true - break loopFits - } - // avoid to meet down store when fix orpahn peers, - // Isdisconnected is more strictly than IsUnhealthy. - if c.cluster.GetStore(p.GetStoreId()).IsDisconnected() { - hasUnhealthyFit = true - pinDownPeer = p - break loopFits - } + pinDownPeer, hasUnhealthyFit = checkDownPeer(rf.Peers) + if hasUnhealthyFit { + break } } @@ -502,15 +512,15 @@ loopFits: continue } // make sure the orphan peer is healthy. - if isUnhealthyPeer(orphanPeer.GetId()) { + if isUnhealthyPeer(orphanPeer.GetId()) || isDisconnectedPeer(orphanPeer) { continue } // no consider witness in this path. if pinDownPeer.GetIsWitness() || orphanPeer.GetIsWitness() { continue } - // down peer's store should be down. - if !c.isStoreDownTimeHitMaxDownTime(pinDownPeer.GetStoreId()) { + // down peer's store should be disconnected + if !isDisconnectedPeer(pinDownPeer) { continue } // check if down peer can replace with orphan peer. @@ -525,7 +535,7 @@ loopFits: case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner: return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Voter && - c.cluster.GetStore(pinDownPeer.GetStoreId()).IsDisconnected() && !dstStore.IsDisconnected(): + isDisconnectedPeer(pinDownPeer) && !dstStore.IsDisconnected(): return operator.CreateRemovePeerOperator("remove-replaced-orphan-peer", c.cluster, 0, region, pinDownPeer.GetStoreId()) default: // destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now. @@ -542,7 +552,11 @@ loopFits: for _, orphanPeer := range fit.OrphanPeers { if isUnhealthyPeer(orphanPeer.GetId()) { ruleCheckerRemoveOrphanPeerCounter.Inc() - return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) + return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) + } + if isDisconnectedPeer(orphanPeer) { + ruleCheckerRemoveOrphanPeerCounter.Inc() + return operator.CreateRemovePeerOperator("remove-disconnected-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } if hasHealthPeer { // there already exists a healthy orphan peer, so we can remove other orphan Peers. 
diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index 8ee3b1eccfa..0c4a2a9ecc9 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -235,7 +235,7 @@ func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers() { suite.cluster.PutRegion(region) op = suite.rc.Check(suite.cluster.GetRegion(1)) suite.NotNil(op) - suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal("remove-unhealthy-orphan-peer", op.Desc()) suite.Equal(uint64(4), op.Step(0).(operator.RemovePeer).FromStore) } @@ -702,7 +702,7 @@ func (suite *ruleCheckerTestSuite) TestPriorityFixOrphanPeer() { suite.cluster.PutRegion(testRegion) op = suite.rc.Check(suite.cluster.GetRegion(1)) suite.NotNil(op) - suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal("remove-unhealthy-orphan-peer", op.Desc()) suite.IsType(remove, op.Step(0)) // Ref #3521 suite.cluster.SetStoreOffline(2) @@ -723,6 +723,178 @@ func (suite *ruleCheckerTestSuite) TestPriorityFixOrphanPeer() { suite.Equal("remove-orphan-peer", op.Desc()) } +// Ref https://github.com/tikv/pd/issues/7249 https://github.com/tikv/tikv/issues/15799 +func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChanged() { + // init cluster with 5 replicas + suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + storeIDs := []uint64{1, 2, 3, 4, 5} + suite.cluster.AddLeaderRegionWithRange(1, "", "", storeIDs[0], storeIDs[1:]...) 
+ rule := &placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 5, + StartKey: []byte{}, + EndKey: []byte{}, + } + suite.ruleManager.SetRule(rule) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + + // set store 1, 2 to disconnected + suite.cluster.SetStoreDisconnect(1) + suite.cluster.SetStoreDisconnect(2) + + // change rule to 3 replicas + rule = &placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 3, + StartKey: []byte{}, + EndKey: []byte{}, + Override: true, + } + suite.ruleManager.SetRule(rule) + + // remove store 1 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 2) + newLeaderID := op.Step(0).(operator.TransferLeader).ToStore + removedPeerID := op.Step(1).(operator.RemovePeer).FromStore + r1 := suite.cluster.GetRegion(1) + r1 = r1.Clone( + core.WithLeader(r1.GetPeer(newLeaderID)), + core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 4) + + // remove store 2 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 1) + removedPeerID = op.Step(0).(operator.RemovePeer).FromStore + r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 3) + for _, p := range r1.GetPeers() { + suite.NotEqual(p.GetStoreId(), 1) + suite.NotEqual(p.GetStoreId(), 2) + } +} + +// Ref https://github.com/tikv/pd/issues/7249 https://github.com/tikv/tikv/issues/15799 +func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChanged2() { + // init cluster with 5 voters and 1 learner + suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + suite.cluster.AddLabelsStore(6, 1, map[string]string{"host": "host6"}) + storeIDs := []uint64{1, 2, 3, 4, 5} + suite.cluster.AddLeaderRegionWithRange(1, "", "", storeIDs[0], storeIDs[1:]...) 
+ r1 := suite.cluster.GetRegion(1) + r1 = r1.Clone(core.WithAddPeer(&metapb.Peer{Id: 6, StoreId: 6, Role: metapb.PeerRole_Learner})) + suite.cluster.PutRegion(r1) + err := suite.ruleManager.SetRules([]*placement.Rule{ + { + GroupID: "pd", + ID: "default", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 5, + IsWitness: false, + }, + { + GroupID: "pd", + ID: "r1", + Index: 100, + Override: false, + Role: placement.Learner, + Count: 1, + IsWitness: false, + }, + }) + suite.NoError(err) + + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + + // set store 1, 2 to disconnected + suite.cluster.SetStoreDisconnect(1) + suite.cluster.SetStoreDisconnect(2) + suite.cluster.SetStoreDisconnect(3) + + // change rule to 3 replicas + suite.ruleManager.DeleteRuleGroup("pd") + suite.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 2, + StartKey: []byte{}, + EndKey: []byte{}, + Override: true, + }) + + // remove store 1 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 2) + newLeaderID := op.Step(0).(operator.TransferLeader).ToStore + removedPeerID := op.Step(1).(operator.RemovePeer).FromStore + r1 = suite.cluster.GetRegion(1) + r1 = r1.Clone( + core.WithLeader(r1.GetPeer(newLeaderID)), + core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 5) + + // remove store 2 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 1) + removedPeerID = op.Step(0).(operator.RemovePeer).FromStore + r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 4) + for _, p := range r1.GetPeers() { + fmt.Println(p.GetStoreId(), p.Role.String()) + } + + // remove store 3 from region 1 + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-replaced-orphan-peer", op.Desc()) + suite.Equal(op.Len(), 1) + removedPeerID = op.Step(0).(operator.RemovePeer).FromStore + r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 3) + + for _, p := range r1.GetPeers() { + suite.NotEqual(p.GetStoreId(), 1) + suite.NotEqual(p.GetStoreId(), 2) + suite.NotEqual(p.GetStoreId(), 3) + } +} + func (suite *ruleCheckerTestSuite) TestPriorityFitHealthWithDifferentRole1() { suite.cluster.SetEnableUseJointConsensus(true) suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) From ab8bf7b7a62d43abd9a8d5213fc3b5855472bd9e Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Mon, 6 Nov 2023 10:14:10 +0800 Subject: [PATCH 07/20] mcs: fix duplicated metrics (#7319) close tikv/pd#7290 Signed-off-by: Ryan Leung --- pkg/mcs/resourcemanager/server/apis/v1/api.go | 2 +- pkg/mcs/scheduling/server/apis/v1/api.go | 2 +- pkg/mcs/tso/server/apis/v1/api.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/mcs/resourcemanager/server/apis/v1/api.go b/pkg/mcs/resourcemanager/server/apis/v1/api.go index ffcb9318590..7b5f2903484 100644 --- a/pkg/mcs/resourcemanager/server/apis/v1/api.go +++ b/pkg/mcs/resourcemanager/server/apis/v1/api.go @@ -81,10 +81,10 @@ func NewService(srv *rmserver.Service) *Service { c.Set(multiservicesapi.ServiceContextKey, 
manager.GetBasicServer()) c.Next() }) - apiHandlerEngine.Use(multiservicesapi.ServiceRedirector()) apiHandlerEngine.GET("metrics", utils.PromHandler()) pprof.Register(apiHandlerEngine) endpoint := apiHandlerEngine.Group(APIPathPrefix) + endpoint.Use(multiservicesapi.ServiceRedirector()) s := &Service{ manager: manager, apiHandlerEngine: apiHandlerEngine, diff --git a/pkg/mcs/scheduling/server/apis/v1/api.go b/pkg/mcs/scheduling/server/apis/v1/api.go index 356dc5a7f42..98fb68c090b 100644 --- a/pkg/mcs/scheduling/server/apis/v1/api.go +++ b/pkg/mcs/scheduling/server/apis/v1/api.go @@ -100,10 +100,10 @@ func NewService(srv *scheserver.Service) *Service { c.Set(handlerKey, handler.NewHandler(&server{srv.Server})) c.Next() }) - apiHandlerEngine.Use(multiservicesapi.ServiceRedirector()) apiHandlerEngine.GET("metrics", mcsutils.PromHandler()) pprof.Register(apiHandlerEngine) root := apiHandlerEngine.Group(APIPathPrefix) + root.Use(multiservicesapi.ServiceRedirector()) s := &Service{ srv: srv, apiHandlerEngine: apiHandlerEngine, diff --git a/pkg/mcs/tso/server/apis/v1/api.go b/pkg/mcs/tso/server/apis/v1/api.go index f1853bf5483..1b8f68778af 100644 --- a/pkg/mcs/tso/server/apis/v1/api.go +++ b/pkg/mcs/tso/server/apis/v1/api.go @@ -89,10 +89,10 @@ func NewService(srv *tsoserver.Service) *Service { c.Set(multiservicesapi.ServiceContextKey, srv) c.Next() }) - apiHandlerEngine.Use(multiservicesapi.ServiceRedirector()) apiHandlerEngine.GET("metrics", utils.PromHandler()) pprof.Register(apiHandlerEngine) root := apiHandlerEngine.Group(APIPathPrefix) + root.Use(multiservicesapi.ServiceRedirector()) s := &Service{ srv: srv, apiHandlerEngine: apiHandlerEngine, From c332ddce95b0a9193022724c94047bdb87953633 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Mon, 6 Nov 2023 20:20:11 +0800 Subject: [PATCH 08/20] checker: avoid unnecessary remove disconnected peer with multi orphan peers (#7315) close tikv/pd#7249 Signed-off-by: lhy1024 --- pkg/core/store.go | 3 - pkg/schedule/checker/rule_checker.go | 29 +- pkg/schedule/checker/rule_checker_test.go | 445 ++++++++++++++-------- 3 files changed, 308 insertions(+), 169 deletions(-) diff --git a/pkg/core/store.go b/pkg/core/store.go index b3c62f45750..1d3362cac0e 100644 --- a/pkg/core/store.go +++ b/pkg/core/store.go @@ -551,9 +551,6 @@ var ( // tikv's store heartbeat for a short time, maybe caused by process restart or // temporary network failure. func (s *StoreInfo) IsDisconnected() bool { - if s == nil { - return true - } return s.DownTime() > storeDisconnectDuration } diff --git a/pkg/schedule/checker/rule_checker.go b/pkg/schedule/checker/rule_checker.go index 84cafaa871e..c4e7c242dea 100644 --- a/pkg/schedule/checker/rule_checker.go +++ b/pkg/schedule/checker/rule_checker.go @@ -78,6 +78,7 @@ var ( ruleCheckerSkipRemoveOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "skip-remove-orphan-peer") ruleCheckerRemoveOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "remove-orphan-peer") ruleCheckerReplaceOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-orphan-peer") + ruleCheckerReplaceOrphanPeerNoFitCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-orphan-peer-no-fit") ) // RuleChecker fix/improve region by placement rules. @@ -465,7 +466,11 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg isDisconnectedPeer := func(p *metapb.Peer) bool { // avoid to meet down store when fix orphan peers, // Isdisconnected is more strictly than IsUnhealthy. 
- return c.cluster.GetStore(p.GetStoreId()).IsDisconnected() + store := c.cluster.GetStore(p.GetStoreId()) + if store == nil { + return true + } + return store.IsDisconnected() } checkDownPeer := func(peers []*metapb.Peer) (*metapb.Peer, bool) { @@ -519,7 +524,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg if pinDownPeer.GetIsWitness() || orphanPeer.GetIsWitness() { continue } - // down peer's store should be disconnected + // pinDownPeer's store should be disconnected, because we use more strict judge before. if !isDisconnectedPeer(pinDownPeer) { continue } @@ -534,13 +539,14 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg return operator.CreatePromoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner: return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) - case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Voter && - isDisconnectedPeer(pinDownPeer) && !dstStore.IsDisconnected(): + case orphanPeerRole == destRole && isDisconnectedPeer(pinDownPeer) && !dstStore.IsDisconnected(): return operator.CreateRemovePeerOperator("remove-replaced-orphan-peer", c.cluster, 0, region, pinDownPeer.GetStoreId()) default: // destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now. // destRole never be leader, so we not consider it. } + } else { + ruleCheckerReplaceOrphanPeerNoFitCounter.Inc() } } } @@ -549,18 +555,25 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg // Ref https://github.com/tikv/pd/issues/4045 if len(fit.OrphanPeers) >= 2 { hasHealthPeer := false + var disconnectedPeer *metapb.Peer + for _, orphanPeer := range fit.OrphanPeers { + if isDisconnectedPeer(orphanPeer) { + disconnectedPeer = orphanPeer + break + } + } for _, orphanPeer := range fit.OrphanPeers { if isUnhealthyPeer(orphanPeer.GetId()) { ruleCheckerRemoveOrphanPeerCounter.Inc() return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } - if isDisconnectedPeer(orphanPeer) { - ruleCheckerRemoveOrphanPeerCounter.Inc() - return operator.CreateRemovePeerOperator("remove-disconnected-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) - } if hasHealthPeer { // there already exists a healthy orphan peer, so we can remove other orphan Peers. ruleCheckerRemoveOrphanPeerCounter.Inc() + // if there exists a disconnected orphan peer, we will pick it to remove firstly. 
+ if disconnectedPeer != nil { + return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, disconnectedPeer.StoreId) + } return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } hasHealthPeer = true diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index 0c4a2a9ecc9..eb357f302b7 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -17,6 +17,7 @@ package checker import ( "context" "fmt" + "strconv" "strings" "testing" @@ -225,7 +226,6 @@ func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers() { suite.NotNil(op) suite.Equal("remove-orphan-peer", op.Desc()) suite.Equal(uint64(5), op.Step(0).(operator.RemovePeer).FromStore) - // Case2: // store 4, 5, 6 are orphan peers, and peer on store 3 is down peer. and peer on store 4, 5 are pending. region = suite.cluster.GetRegion(1) @@ -237,6 +237,91 @@ func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers() { suite.NotNil(op) suite.Equal("remove-unhealthy-orphan-peer", op.Desc()) suite.Equal(uint64(4), op.Step(0).(operator.RemovePeer).FromStore) + // Case3: + // store 4, 5, 6 are orphan peers, and peer on one of stores is disconnect peer + // we should remove disconnect peer first. + for i := uint64(4); i <= 6; i++ { + region = suite.cluster.GetRegion(1) + suite.cluster.SetStoreDisconnect(i) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal(i, op.Step(0).(operator.RemovePeer).FromStore) + suite.cluster.SetStoreUp(i) + } + // Case4: + // store 4, 5, 6 are orphan peers, and peer on two of stores is disconnect peer + // we should remove disconnect peer first. + for i := uint64(4); i <= 6; i++ { + region = suite.cluster.GetRegion(1) + suite.cluster.SetStoreDisconnect(4) + suite.cluster.SetStoreDisconnect(5) + suite.cluster.SetStoreDisconnect(6) + suite.cluster.SetStoreUp(i) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + removedPeerStoreID := op.Step(0).(operator.RemovePeer).FromStore + suite.NotEqual(i, removedPeerStoreID) + region = suite.cluster.GetRegion(1) + newRegion := region.Clone(core.WithRemoveStorePeer(removedPeerStoreID)) + suite.cluster.PutRegion(newRegion) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + removedPeerStoreID = op.Step(0).(operator.RemovePeer).FromStore + suite.NotEqual(i, removedPeerStoreID) + suite.cluster.PutRegion(region) + } +} + +func (suite *ruleCheckerTestSuite) TestFixToManyOrphanPeers2() { + suite.cluster.AddLeaderStore(1, 1) + suite.cluster.AddLeaderStore(2, 1) + suite.cluster.AddLeaderStore(3, 1) + suite.cluster.AddLeaderStore(4, 1) + suite.cluster.AddLeaderStore(5, 1) + suite.cluster.AddRegionWithLearner(1, 1, []uint64{2, 3}, []uint64{4, 5}) + + // Case1: + // store 4, 5 are orphan peers, and peer on one of stores is disconnect peer + // we should remove disconnect peer first. 
+ for i := uint64(4); i <= 5; i++ { + region := suite.cluster.GetRegion(1) + suite.cluster.SetStoreDisconnect(i) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal(i, op.Step(0).(operator.RemovePeer).FromStore) + suite.cluster.SetStoreUp(i) + } + + // Case2: + // store 4, 5 are orphan peers, and they are disconnect peers + // we should remove the peer on disconnect stores at least. + region := suite.cluster.GetRegion(1) + suite.cluster.SetStoreDisconnect(4) + suite.cluster.SetStoreDisconnect(5) + region = region.Clone( + core.WithDownPeers([]*pdpb.PeerStats{{Peer: region.GetStorePeer(3), DownSeconds: 60000}}), + core.WithPendingPeers([]*metapb.Peer{region.GetStorePeer(3)})) + suite.cluster.PutRegion(region) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) + suite.Equal(uint64(4), op.Step(0).(operator.RemovePeer).FromStore) } func (suite *ruleCheckerTestSuite) TestFixOrphanPeers2() { @@ -725,173 +810,217 @@ func (suite *ruleCheckerTestSuite) TestPriorityFixOrphanPeer() { // Ref https://github.com/tikv/pd/issues/7249 https://github.com/tikv/tikv/issues/15799 func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChanged() { - // init cluster with 5 replicas - suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) - suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) - suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) - suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) - suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) - storeIDs := []uint64{1, 2, 3, 4, 5} - suite.cluster.AddLeaderRegionWithRange(1, "", "", storeIDs[0], storeIDs[1:]...) 
- rule := &placement.Rule{ - GroupID: "pd", - ID: "default", - Role: placement.Voter, - Count: 5, - StartKey: []byte{}, - EndKey: []byte{}, - } - suite.ruleManager.SetRule(rule) - op := suite.rc.Check(suite.cluster.GetRegion(1)) - suite.Nil(op) - - // set store 1, 2 to disconnected - suite.cluster.SetStoreDisconnect(1) - suite.cluster.SetStoreDisconnect(2) - - // change rule to 3 replicas - rule = &placement.Rule{ - GroupID: "pd", - ID: "default", - Role: placement.Voter, - Count: 3, - StartKey: []byte{}, - EndKey: []byte{}, - Override: true, + // disconnect any two stores and change rule to 3 replicas + stores := []uint64{1, 2, 3, 4, 5} + testCases := [][]uint64{} + for i := 0; i < len(stores); i++ { + for j := i + 1; j < len(stores); j++ { + testCases = append(testCases, []uint64{stores[i], stores[j]}) + } } - suite.ruleManager.SetRule(rule) + for _, leader := range stores { + var followers []uint64 + for i := 0; i < len(stores); i++ { + if stores[i] != leader { + followers = append(followers, stores[i]) + } + } - // remove store 1 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 2) - newLeaderID := op.Step(0).(operator.TransferLeader).ToStore - removedPeerID := op.Step(1).(operator.RemovePeer).FromStore - r1 := suite.cluster.GetRegion(1) - r1 = r1.Clone( - core.WithLeader(r1.GetPeer(newLeaderID)), - core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 4) + for _, testCase := range testCases { + // init cluster with 5 replicas + suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + suite.cluster.AddLeaderRegionWithRange(1, "", "", leader, followers...) 
+ rule := &placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 5, + StartKey: []byte{}, + EndKey: []byte{}, + } + err := suite.ruleManager.SetRule(rule) + suite.NoError(err) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + + // set two stores to disconnected + suite.cluster.SetStoreDisconnect(testCase[0]) + suite.cluster.SetStoreDisconnect(testCase[1]) + + // change rule to 3 replicas + rule = &placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 3, + StartKey: []byte{}, + EndKey: []byte{}, + Override: true, + } + suite.ruleManager.SetRule(rule) + + // remove peer from region 1 + for j := 1; j <= 2; j++ { + r1 := suite.cluster.GetRegion(1) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Contains(op.Desc(), "orphan") + var removedPeerStoreID uint64 + newLeaderStoreID := r1.GetLeader().GetStoreId() + for i := 0; i < op.Len(); i++ { + if s, ok := op.Step(i).(operator.RemovePeer); ok { + removedPeerStoreID = s.FromStore + } + if s, ok := op.Step(i).(operator.TransferLeader); ok { + newLeaderStoreID = s.ToStore + } + } + suite.NotZero(removedPeerStoreID) + r1 = r1.Clone( + core.WithLeader(r1.GetStorePeer(newLeaderStoreID)), + core.WithRemoveStorePeer(removedPeerStoreID)) + suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 5-j) + } - // remove store 2 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 1) - removedPeerID = op.Step(0).(operator.RemovePeer).FromStore - r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 3) - for _, p := range r1.GetPeers() { - suite.NotEqual(p.GetStoreId(), 1) - suite.NotEqual(p.GetStoreId(), 2) + r1 := suite.cluster.GetRegion(1) + for _, p := range r1.GetPeers() { + suite.NotEqual(p.GetStoreId(), testCase[0]) + suite.NotEqual(p.GetStoreId(), testCase[1]) + } + suite.TearDownTest() + suite.SetupTest() + } } } // Ref https://github.com/tikv/pd/issues/7249 https://github.com/tikv/tikv/issues/15799 -func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChanged2() { - // init cluster with 5 voters and 1 learner - suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) - suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) - suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) - suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) - suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) - suite.cluster.AddLabelsStore(6, 1, map[string]string{"host": "host6"}) - storeIDs := []uint64{1, 2, 3, 4, 5} - suite.cluster.AddLeaderRegionWithRange(1, "", "", storeIDs[0], storeIDs[1:]...) 
- r1 := suite.cluster.GetRegion(1) - r1 = r1.Clone(core.WithAddPeer(&metapb.Peer{Id: 6, StoreId: 6, Role: metapb.PeerRole_Learner})) - suite.cluster.PutRegion(r1) - err := suite.ruleManager.SetRules([]*placement.Rule{ - { - GroupID: "pd", - ID: "default", - Index: 100, - Override: true, - Role: placement.Voter, - Count: 5, - IsWitness: false, - }, - { - GroupID: "pd", - ID: "r1", - Index: 100, - Override: false, - Role: placement.Learner, - Count: 1, - IsWitness: false, - }, - }) - suite.NoError(err) - - op := suite.rc.Check(suite.cluster.GetRegion(1)) - suite.Nil(op) - - // set store 1, 2 to disconnected - suite.cluster.SetStoreDisconnect(1) - suite.cluster.SetStoreDisconnect(2) - suite.cluster.SetStoreDisconnect(3) - - // change rule to 3 replicas - suite.ruleManager.DeleteRuleGroup("pd") - suite.ruleManager.SetRule(&placement.Rule{ - GroupID: "pd", - ID: "default", - Role: placement.Voter, - Count: 2, - StartKey: []byte{}, - EndKey: []byte{}, - Override: true, - }) - - // remove store 1 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 2) - newLeaderID := op.Step(0).(operator.TransferLeader).ToStore - removedPeerID := op.Step(1).(operator.RemovePeer).FromStore - r1 = suite.cluster.GetRegion(1) - r1 = r1.Clone( - core.WithLeader(r1.GetPeer(newLeaderID)), - core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 5) - - // remove store 2 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 1) - removedPeerID = op.Step(0).(operator.RemovePeer).FromStore - r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 4) - for _, p := range r1.GetPeers() { - fmt.Println(p.GetStoreId(), p.Role.String()) +func (suite *ruleCheckerTestSuite) TestFixOrphanPeerWithDisconnectedStoreAndRuleChangedWithLearner() { + // disconnect any three stores and change rule to 3 replicas + // and there is a learner in the disconnected store. 
+ stores := []uint64{1, 2, 3, 4, 5, 6} + testCases := [][]uint64{} + for i := 0; i < len(stores); i++ { + for j := i + 1; j < len(stores); j++ { + for k := j + 1; k < len(stores); k++ { + testCases = append(testCases, []uint64{stores[i], stores[j], stores[k]}) + } + } } + for _, leader := range stores { + var followers []uint64 + for i := 0; i < len(stores); i++ { + if stores[i] != leader { + followers = append(followers, stores[i]) + } + } - // remove store 3 from region 1 - op = suite.rc.Check(suite.cluster.GetRegion(1)) - suite.NotNil(op) - suite.Equal("remove-replaced-orphan-peer", op.Desc()) - suite.Equal(op.Len(), 1) - removedPeerID = op.Step(0).(operator.RemovePeer).FromStore - r1 = r1.Clone(core.WithRemoveStorePeer(removedPeerID)) - suite.cluster.PutRegion(r1) - r1 = suite.cluster.GetRegion(1) - suite.Len(r1.GetPeers(), 3) + for _, testCase := range testCases { + for _, learnerStore := range testCase { + if learnerStore == leader { + continue + } + voterFollowers := []uint64{} + for _, follower := range followers { + if follower != learnerStore { + voterFollowers = append(voterFollowers, follower) + } + } + // init cluster with 5 voters and 1 learner + suite.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + suite.cluster.AddLabelsStore(6, 1, map[string]string{"host": "host6"}) + suite.cluster.AddLeaderRegionWithRange(1, "", "", leader, voterFollowers...) + err := suite.ruleManager.SetRules([]*placement.Rule{ + { + GroupID: "pd", + ID: "default", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 5, + IsWitness: false, + }, + { + GroupID: "pd", + ID: "r1", + Index: 100, + Override: false, + Role: placement.Learner, + Count: 1, + IsWitness: false, + LabelConstraints: []placement.LabelConstraint{ + {Key: "host", Op: "in", Values: []string{"host" + strconv.FormatUint(learnerStore, 10)}}, + }, + }, + }) + suite.NoError(err) + r1 := suite.cluster.GetRegion(1) + r1 = r1.Clone(core.WithAddPeer(&metapb.Peer{Id: 12, StoreId: learnerStore, Role: metapb.PeerRole_Learner})) + suite.cluster.PutRegion(r1) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + + // set three stores to disconnected + suite.cluster.SetStoreDisconnect(testCase[0]) + suite.cluster.SetStoreDisconnect(testCase[1]) + suite.cluster.SetStoreDisconnect(testCase[2]) + + // change rule to 3 replicas + suite.ruleManager.DeleteRule("pd", "r1") + suite.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "default", + Role: placement.Voter, + Count: 3, + StartKey: []byte{}, + EndKey: []byte{}, + Override: true, + }) + + // remove peer from region 1 + for j := 1; j <= 3; j++ { + r1 := suite.cluster.GetRegion(1) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Contains(op.Desc(), "orphan") + var removedPeerStroeID uint64 + newLeaderStoreID := r1.GetLeader().GetStoreId() + for i := 0; i < op.Len(); i++ { + if s, ok := op.Step(i).(operator.RemovePeer); ok { + removedPeerStroeID = s.FromStore + } + if s, ok := op.Step(i).(operator.TransferLeader); ok { + newLeaderStoreID = s.ToStore + } + } + suite.NotZero(removedPeerStroeID) + r1 = r1.Clone( + core.WithLeader(r1.GetStorePeer(newLeaderStoreID)), + core.WithRemoveStorePeer(removedPeerStroeID)) + 
suite.cluster.PutRegion(r1) + r1 = suite.cluster.GetRegion(1) + suite.Len(r1.GetPeers(), 6-j) + } - for _, p := range r1.GetPeers() { - suite.NotEqual(p.GetStoreId(), 1) - suite.NotEqual(p.GetStoreId(), 2) - suite.NotEqual(p.GetStoreId(), 3) + r1 = suite.cluster.GetRegion(1) + for _, p := range r1.GetPeers() { + suite.NotEqual(p.GetStoreId(), testCase[0]) + suite.NotEqual(p.GetStoreId(), testCase[1]) + suite.NotEqual(p.GetStoreId(), testCase[2]) + } + suite.TearDownTest() + suite.SetupTest() + } + } } } From 356066a3598c68620b3807bc2bf449ba209e33f4 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Tue, 7 Nov 2023 14:11:10 +0800 Subject: [PATCH 09/20] mcs: solve forward stream error (#7321) close tikv/pd#7320 Signed-off-by: lhy1024 --- server/grpc_service.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/server/grpc_service.go b/server/grpc_service.go index 2e59bdaf742..4aa6dc5b1da 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -2632,7 +2632,13 @@ func (s *GrpcServer) getGlobalTSOFromTSOServer(ctx context.Context) (pdpb.Timest if err != nil { return pdpb.Timestamp{}, err } - forwardStream.Send(request) + err := forwardStream.Send(request) + if err != nil { + s.tsoClientPool.Lock() + delete(s.tsoClientPool.clients, forwardedHost) + s.tsoClientPool.Unlock() + continue + } ts, err = forwardStream.Recv() if err != nil { if strings.Contains(err.Error(), errs.NotLeaderErr) { From 47ba96f95bf96eeaa323dfd091990d4dc1bb1684 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Tue, 7 Nov 2023 14:59:10 +0800 Subject: [PATCH 10/20] mcs: support config http interface in scheduling server (#7278) ref tikv/pd#5839 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/scheduling/server/apis/v1/api.go | 19 + pkg/mcs/scheduling/server/config/config.go | 13 +- pkg/mcs/scheduling/server/server.go | 23 + server/api/config.go | 63 ++- server/api/config_test.go | 440 --------------- tests/integrations/mcs/scheduling/api_test.go | 83 +++ tests/pdctl/config/config_test.go | 162 +++--- tests/pdctl/scheduler/scheduler_test.go | 12 +- tests/server/api/operator_test.go | 33 +- tests/server/api/scheduler_test.go | 25 +- tests/server/config/config_test.go | 531 +++++++++++++++++- 11 files changed, 855 insertions(+), 549 deletions(-) delete mode 100644 server/api/config_test.go diff --git a/pkg/mcs/scheduling/server/apis/v1/api.go b/pkg/mcs/scheduling/server/apis/v1/api.go index 98fb68c090b..47fdb95543f 100644 --- a/pkg/mcs/scheduling/server/apis/v1/api.go +++ b/pkg/mcs/scheduling/server/apis/v1/api.go @@ -111,6 +111,7 @@ func NewService(srv *scheserver.Service) *Service { rd: createIndentRender(), } s.RegisterAdminRouter() + s.RegisterConfigRouter() s.RegisterOperatorsRouter() s.RegisterSchedulersRouter() s.RegisterCheckersRouter() @@ -126,6 +127,12 @@ func (s *Service) RegisterAdminRouter() { router.DELETE("cache/regions/:id", deleteRegionCacheByID) } +// RegisterConfigRouter registers the router of the config handler. +func (s *Service) RegisterConfigRouter() { + router := s.root.Group("config") + router.GET("", getConfig) +} + // RegisterSchedulersRouter registers the router of the schedulers handler. func (s *Service) RegisterSchedulersRouter() { router := s.root.Group("schedulers") @@ -186,6 +193,18 @@ func changeLogLevel(c *gin.Context) { c.String(http.StatusOK, "The log level is updated.") } +// @Tags config +// @Summary Get full config. 
+// @Produce json +// @Success 200 {object} config.Config +// @Router /config [get] +func getConfig(c *gin.Context) { + svr := c.MustGet(multiservicesapi.ServiceContextKey).(*scheserver.Server) + cfg := svr.GetConfig() + cfg.Schedule.MaxMergeRegionKeys = cfg.Schedule.GetMaxMergeRegionKeys() + c.IndentedJSON(http.StatusOK, cfg) +} + // @Tags admin // @Summary Drop all regions from cache. // @Produce json diff --git a/pkg/mcs/scheduling/server/config/config.go b/pkg/mcs/scheduling/server/config/config.go index 4f9caca41e6..772eab835f1 100644 --- a/pkg/mcs/scheduling/server/config/config.go +++ b/pkg/mcs/scheduling/server/config/config.go @@ -61,9 +61,9 @@ type Config struct { Metric metricutil.MetricConfig `toml:"metric" json:"metric"` // Log related config. - Log log.Config `toml:"log" json:"log"` - Logger *zap.Logger - LogProps *log.ZapProperties + Log log.Config `toml:"log" json:"log"` + Logger *zap.Logger `json:"-"` + LogProps *log.ZapProperties `json:"-"` Security configutil.SecurityConfig `toml:"security" json:"security"` @@ -195,6 +195,13 @@ func (c *Config) validate() error { return nil } +// Clone creates a copy of current config. +func (c *Config) Clone() *Config { + cfg := &Config{} + *cfg = *c + return cfg +} + // PersistConfig wraps all configurations that need to persist to storage and // allows to access them safely. type PersistConfig struct { diff --git a/pkg/mcs/scheduling/server/server.go b/pkg/mcs/scheduling/server/server.go index 5e2ed58a009..1790cb2b4be 100644 --- a/pkg/mcs/scheduling/server/server.go +++ b/pkg/mcs/scheduling/server/server.go @@ -504,6 +504,29 @@ func (s *Server) stopWatcher() { s.metaWatcher.Close() } +// GetPersistConfig returns the persist config. +// It's used to test. +func (s *Server) GetPersistConfig() *config.PersistConfig { + return s.persistConfig +} + +// GetConfig gets the config. 
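+// It merges the static server config with the latest persisted schedule,
+// replication and cluster-version settings, and attaches the stored scheduler
+// configs as schedulers-payload.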
+func (s *Server) GetConfig() *config.Config { + cfg := s.cfg.Clone() + cfg.Schedule = *s.persistConfig.GetScheduleConfig().Clone() + cfg.Replication = *s.persistConfig.GetReplicationConfig().Clone() + cfg.ClusterVersion = *s.persistConfig.GetClusterVersion() + if s.storage == nil { + return cfg + } + sches, configs, err := s.storage.LoadAllSchedulerConfigs() + if err != nil { + return cfg + } + cfg.Schedule.SchedulersPayload = schedulers.ToPayload(sches, configs) + return cfg +} + // CreateServer creates the Server func CreateServer(ctx context.Context, cfg *config.Config) *Server { svr := &Server{ diff --git a/server/api/config.go b/server/api/config.go index c63bd953c37..746b1119a73 100644 --- a/server/api/config.go +++ b/server/api/config.go @@ -27,6 +27,8 @@ import ( "github.com/pingcap/errcode" "github.com/pingcap/errors" "github.com/pingcap/log" + "github.com/tikv/pd/pkg/errs" + "github.com/tikv/pd/pkg/mcs/utils" sc "github.com/tikv/pd/pkg/schedule/config" "github.com/tikv/pd/pkg/utils/apiutil" "github.com/tikv/pd/pkg/utils/jsonutil" @@ -60,7 +62,17 @@ func newConfHandler(svr *server.Server, rd *render.Render) *confHandler { // @Router /config [get] func (h *confHandler) GetConfig(w http.ResponseWriter, r *http.Request) { cfg := h.svr.GetConfig() - cfg.Schedule.MaxMergeRegionKeys = cfg.Schedule.GetMaxMergeRegionKeys() + if h.svr.IsAPIServiceMode() { + schedulingServerConfig, err := h.GetSchedulingServerConfig() + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) + return + } + cfg.Schedule = schedulingServerConfig.Schedule + cfg.Replication = schedulingServerConfig.Replication + } else { + cfg.Schedule.MaxMergeRegionKeys = cfg.Schedule.GetMaxMergeRegionKeys() + } h.rd.JSON(w, http.StatusOK, cfg) } @@ -301,6 +313,16 @@ func getConfigMap(cfg map[string]interface{}, key []string, value interface{}) m // @Success 200 {object} sc.ScheduleConfig // @Router /config/schedule [get] func (h *confHandler) GetScheduleConfig(w http.ResponseWriter, r *http.Request) { + if h.svr.IsAPIServiceMode() { + cfg, err := h.GetSchedulingServerConfig() + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) + return + } + cfg.Schedule.SchedulersPayload = nil + h.rd.JSON(w, http.StatusOK, cfg.Schedule) + return + } cfg := h.svr.GetScheduleConfig() cfg.MaxMergeRegionKeys = cfg.GetMaxMergeRegionKeys() h.rd.JSON(w, http.StatusOK, cfg) @@ -364,6 +386,15 @@ func (h *confHandler) SetScheduleConfig(w http.ResponseWriter, r *http.Request) // @Success 200 {object} sc.ReplicationConfig // @Router /config/replicate [get] func (h *confHandler) GetReplicationConfig(w http.ResponseWriter, r *http.Request) { + if h.svr.IsAPIServiceMode() { + cfg, err := h.GetSchedulingServerConfig() + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) + return + } + h.rd.JSON(w, http.StatusOK, cfg.Replication) + return + } h.rd.JSON(w, http.StatusOK, h.svr.GetReplicationConfig()) } @@ -505,3 +536,33 @@ func (h *confHandler) SetReplicationModeConfig(w http.ResponseWriter, r *http.Re func (h *confHandler) GetPDServerConfig(w http.ResponseWriter, r *http.Request) { h.rd.JSON(w, http.StatusOK, h.svr.GetPDServerConfig()) } + +func (h *confHandler) GetSchedulingServerConfig() (*config.Config, error) { + addr, ok := h.svr.GetServicePrimaryAddr(h.svr.Context(), utils.SchedulingServiceName) + if !ok { + return nil, errs.ErrNotFoundSchedulingAddr.FastGenByArgs() + } + url := fmt.Sprintf("%s/scheduling/api/v1/config", addr) + req, err := http.NewRequest(http.MethodGet, url, nil) 
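+ // The scheduling server registers GET /scheduling/api/v1/config, which returns
+ // its full config, including the schedulers-payload it maintains.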
+ if err != nil { + return nil, err + } + resp, err := h.svr.GetHTTPClient().Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, errs.ErrSchedulingServer.FastGenByArgs(resp.StatusCode) + } + b, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + var schedulingServerConfig config.Config + err = json.Unmarshal(b, &schedulingServerConfig) + if err != nil { + return nil, err + } + return &schedulingServerConfig, nil +} diff --git a/server/api/config_test.go b/server/api/config_test.go deleted file mode 100644 index fbfb3f94518..00000000000 --- a/server/api/config_test.go +++ /dev/null @@ -1,440 +0,0 @@ -// Copyright 2016 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package api - -import ( - "encoding/json" - "fmt" - "testing" - "time" - - "github.com/stretchr/testify/suite" - sc "github.com/tikv/pd/pkg/schedule/config" - tu "github.com/tikv/pd/pkg/utils/testutil" - "github.com/tikv/pd/pkg/utils/typeutil" - "github.com/tikv/pd/pkg/versioninfo" - "github.com/tikv/pd/server" - "github.com/tikv/pd/server/config" -) - -type configTestSuite struct { - suite.Suite - svr *server.Server - cleanup tu.CleanupFunc - urlPrefix string -} - -func TestConfigTestSuite(t *testing.T) { - suite.Run(t, new(configTestSuite)) -} - -func (suite *configTestSuite) SetupSuite() { - re := suite.Require() - suite.svr, suite.cleanup = mustNewServer(re, func(cfg *config.Config) { - cfg.Replication.EnablePlacementRules = false - }) - server.MustWaitLeader(re, []*server.Server{suite.svr}) - - addr := suite.svr.GetAddr() - suite.urlPrefix = fmt.Sprintf("%s%s/api/v1", addr, apiPrefix) -} - -func (suite *configTestSuite) TearDownSuite() { - suite.cleanup() -} - -func (suite *configTestSuite) TestConfigAll() { - re := suite.Require() - addr := fmt.Sprintf("%s/config", suite.urlPrefix) - cfg := &config.Config{} - err := tu.ReadGetJSON(re, testDialClient, addr, cfg) - suite.NoError(err) - - // the original way - r := map[string]int{"max-replicas": 5} - postData, err := json.Marshal(r) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - l := map[string]interface{}{ - "location-labels": "zone,rack", - "region-schedule-limit": 10, - } - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - l = map[string]interface{}{ - "metric-storage": "http://127.0.0.1:9090", - } - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - newCfg := &config.Config{} - err = tu.ReadGetJSON(re, testDialClient, addr, newCfg) - suite.NoError(err) - cfg.Replication.MaxReplicas = 5 - cfg.Replication.LocationLabels = []string{"zone", "rack"} - cfg.Schedule.RegionScheduleLimit = 10 - cfg.PDServerCfg.MetricStorage = "http://127.0.0.1:9090" - suite.Equal(newCfg, cfg) - - // the 
new way - l = map[string]interface{}{ - "schedule.tolerant-size-ratio": 2.5, - "schedule.enable-tikv-split-region": "false", - "replication.location-labels": "idc,host", - "pd-server.metric-storage": "http://127.0.0.1:1234", - "log.level": "warn", - "cluster-version": "v4.0.0-beta", - "replication-mode.replication-mode": "dr-auto-sync", - "replication-mode.dr-auto-sync.label-key": "foobar", - } - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - newCfg1 := &config.Config{} - err = tu.ReadGetJSON(re, testDialClient, addr, newCfg1) - suite.NoError(err) - cfg.Schedule.EnableTiKVSplitRegion = false - cfg.Schedule.TolerantSizeRatio = 2.5 - cfg.Replication.LocationLabels = []string{"idc", "host"} - cfg.PDServerCfg.MetricStorage = "http://127.0.0.1:1234" - cfg.Log.Level = "warn" - cfg.ReplicationMode.DRAutoSync.LabelKey = "foobar" - cfg.ReplicationMode.ReplicationMode = "dr-auto-sync" - v, err := versioninfo.ParseVersion("v4.0.0-beta") - suite.NoError(err) - cfg.ClusterVersion = *v - suite.Equal(cfg, newCfg1) - - // revert this to avoid it affects TestConfigTTL - l["schedule.enable-tikv-split-region"] = "true" - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - // illegal prefix - l = map[string]interface{}{ - "replicate.max-replicas": 1, - } - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, - tu.StatusNotOK(re), - tu.StringContain(re, "not found")) - suite.NoError(err) - - // update prefix directly - l = map[string]interface{}{ - "replication-mode": nil, - } - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, - tu.StatusNotOK(re), - tu.StringContain(re, "cannot update config prefix")) - suite.NoError(err) - - // config item not found - l = map[string]interface{}{ - "schedule.region-limit": 10, - } - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusNotOK(re), tu.StringContain(re, "not found")) - suite.NoError(err) -} - -func (suite *configTestSuite) TestConfigSchedule() { - re := suite.Require() - addr := fmt.Sprintf("%s/config/schedule", suite.urlPrefix) - scheduleConfig := &sc.ScheduleConfig{} - suite.NoError(tu.ReadGetJSON(re, testDialClient, addr, scheduleConfig)) - scheduleConfig.MaxStoreDownTime.Duration = time.Second - postData, err := json.Marshal(scheduleConfig) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - scheduleConfig1 := &sc.ScheduleConfig{} - suite.NoError(tu.ReadGetJSON(re, testDialClient, addr, scheduleConfig1)) - suite.Equal(*scheduleConfig1, *scheduleConfig) -} - -func (suite *configTestSuite) TestConfigReplication() { - re := suite.Require() - addr := fmt.Sprintf("%s/config/replicate", suite.urlPrefix) - rc := &sc.ReplicationConfig{} - err := tu.ReadGetJSON(re, testDialClient, addr, rc) - suite.NoError(err) - - rc.MaxReplicas = 5 - rc1 := map[string]int{"max-replicas": 5} - postData, err := json.Marshal(rc1) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - rc.LocationLabels = []string{"zone", "rack"} - rc2 := map[string]string{"location-labels": "zone,rack"} - postData, err = json.Marshal(rc2) - suite.NoError(err) - err = 
tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - rc.IsolationLevel = "zone" - rc3 := map[string]string{"isolation-level": "zone"} - postData, err = json.Marshal(rc3) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - rc4 := &sc.ReplicationConfig{} - err = tu.ReadGetJSON(re, testDialClient, addr, rc4) - suite.NoError(err) - - suite.Equal(*rc4, *rc) -} - -func (suite *configTestSuite) TestConfigLabelProperty() { - re := suite.Require() - addr := suite.svr.GetAddr() + apiPrefix + "/api/v1/config/label-property" - loadProperties := func() config.LabelPropertyConfig { - var cfg config.LabelPropertyConfig - err := tu.ReadGetJSON(re, testDialClient, addr, &cfg) - suite.NoError(err) - return cfg - } - - cfg := loadProperties() - suite.Empty(cfg) - - cmds := []string{ - `{"type": "foo", "action": "set", "label-key": "zone", "label-value": "cn1"}`, - `{"type": "foo", "action": "set", "label-key": "zone", "label-value": "cn2"}`, - `{"type": "bar", "action": "set", "label-key": "host", "label-value": "h1"}`, - } - for _, cmd := range cmds { - err := tu.CheckPostJSON(testDialClient, addr, []byte(cmd), tu.StatusOK(re)) - suite.NoError(err) - } - - cfg = loadProperties() - suite.Len(cfg, 2) - suite.Equal([]config.StoreLabel{ - {Key: "zone", Value: "cn1"}, - {Key: "zone", Value: "cn2"}, - }, cfg["foo"]) - suite.Equal([]config.StoreLabel{{Key: "host", Value: "h1"}}, cfg["bar"]) - - cmds = []string{ - `{"type": "foo", "action": "delete", "label-key": "zone", "label-value": "cn1"}`, - `{"type": "bar", "action": "delete", "label-key": "host", "label-value": "h1"}`, - } - for _, cmd := range cmds { - err := tu.CheckPostJSON(testDialClient, addr, []byte(cmd), tu.StatusOK(re)) - suite.NoError(err) - } - - cfg = loadProperties() - suite.Len(cfg, 1) - suite.Equal([]config.StoreLabel{{Key: "zone", Value: "cn2"}}, cfg["foo"]) -} - -func (suite *configTestSuite) TestConfigDefault() { - addr := fmt.Sprintf("%s/config", suite.urlPrefix) - - r := map[string]int{"max-replicas": 5} - postData, err := json.Marshal(r) - suite.NoError(err) - re := suite.Require() - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - l := map[string]interface{}{ - "location-labels": "zone,rack", - "region-schedule-limit": 10, - } - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - l = map[string]interface{}{ - "metric-storage": "http://127.0.0.1:9090", - } - postData, err = json.Marshal(l) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - - addr = fmt.Sprintf("%s/config/default", suite.urlPrefix) - defaultCfg := &config.Config{} - err = tu.ReadGetJSON(re, testDialClient, addr, defaultCfg) - suite.NoError(err) - - suite.Equal(uint64(3), defaultCfg.Replication.MaxReplicas) - suite.Equal(typeutil.StringSlice([]string{}), defaultCfg.Replication.LocationLabels) - suite.Equal(uint64(2048), defaultCfg.Schedule.RegionScheduleLimit) - suite.Equal("", defaultCfg.PDServerCfg.MetricStorage) -} - -func (suite *configTestSuite) TestConfigPDServer() { - re := suite.Require() - addrPost := fmt.Sprintf("%s/config", suite.urlPrefix) - ms := map[string]interface{}{ - "metric-storage": "", - } - postData, err := json.Marshal(ms) - suite.NoError(err) - suite.NoError(tu.CheckPostJSON(testDialClient, addrPost, postData, tu.StatusOK(re))) - addrGet 
:= fmt.Sprintf("%s/config/pd-server", suite.urlPrefix) - sc := &config.PDServerConfig{} - suite.NoError(tu.ReadGetJSON(re, testDialClient, addrGet, sc)) - suite.Equal(bool(true), sc.UseRegionStorage) - suite.Equal("table", sc.KeyType) - suite.Equal(typeutil.StringSlice([]string{}), sc.RuntimeServices) - suite.Equal("", sc.MetricStorage) - suite.Equal("auto", sc.DashboardAddress) - suite.Equal(int(3), sc.FlowRoundByDigit) - suite.Equal(typeutil.NewDuration(time.Second), sc.MinResolvedTSPersistenceInterval) - suite.Equal(24*time.Hour, sc.MaxResetTSGap.Duration) -} - -var ttlConfig = map[string]interface{}{ - "schedule.max-snapshot-count": 999, - "schedule.enable-location-replacement": false, - "schedule.max-merge-region-size": 999, - "schedule.max-merge-region-keys": 999, - "schedule.scheduler-max-waiting-operator": 999, - "schedule.leader-schedule-limit": 999, - "schedule.region-schedule-limit": 999, - "schedule.hot-region-schedule-limit": 999, - "schedule.replica-schedule-limit": 999, - "schedule.merge-schedule-limit": 999, - "schedule.enable-tikv-split-region": false, -} - -var invalidTTLConfig = map[string]interface{}{ - "schedule.invalid-ttl-config": 0, -} - -func assertTTLConfig( - options *config.PersistOptions, - equality func(interface{}, interface{}, ...interface{}) bool, -) { - equality(uint64(999), options.GetMaxSnapshotCount()) - equality(false, options.IsLocationReplacementEnabled()) - equality(uint64(999), options.GetMaxMergeRegionSize()) - equality(uint64(999), options.GetMaxMergeRegionKeys()) - equality(uint64(999), options.GetSchedulerMaxWaitingOperator()) - equality(uint64(999), options.GetLeaderScheduleLimit()) - equality(uint64(999), options.GetRegionScheduleLimit()) - equality(uint64(999), options.GetHotRegionScheduleLimit()) - equality(uint64(999), options.GetReplicaScheduleLimit()) - equality(uint64(999), options.GetMergeScheduleLimit()) - equality(false, options.IsTikvRegionSplitEnabled()) -} - -func createTTLUrl(url string, ttl int) string { - return fmt.Sprintf("%s/config?ttlSecond=%d", url, ttl) -} - -func (suite *configTestSuite) TestConfigTTL() { - postData, err := json.Marshal(ttlConfig) - suite.NoError(err) - - // test no config and cleaning up - re := suite.Require() - err = tu.CheckPostJSON(testDialClient, createTTLUrl(suite.urlPrefix, 0), postData, tu.StatusOK(re)) - suite.NoError(err) - assertTTLConfig(suite.svr.GetPersistOptions(), suite.NotEqual) - - // test time goes by - err = tu.CheckPostJSON(testDialClient, createTTLUrl(suite.urlPrefix, 1), postData, tu.StatusOK(re)) - suite.NoError(err) - assertTTLConfig(suite.svr.GetPersistOptions(), suite.Equal) - time.Sleep(2 * time.Second) - assertTTLConfig(suite.svr.GetPersistOptions(), suite.NotEqual) - - // test cleaning up - err = tu.CheckPostJSON(testDialClient, createTTLUrl(suite.urlPrefix, 1), postData, tu.StatusOK(re)) - suite.NoError(err) - assertTTLConfig(suite.svr.GetPersistOptions(), suite.Equal) - err = tu.CheckPostJSON(testDialClient, createTTLUrl(suite.urlPrefix, 0), postData, tu.StatusOK(re)) - suite.NoError(err) - assertTTLConfig(suite.svr.GetPersistOptions(), suite.NotEqual) - - postData, err = json.Marshal(invalidTTLConfig) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, createTTLUrl(suite.urlPrefix, 1), postData, - tu.StatusNotOK(re), tu.StringEqual(re, "\"unsupported ttl config schedule.invalid-ttl-config\"\n")) - suite.NoError(err) - - // only set max-merge-region-size - mergeConfig := map[string]interface{}{ - "schedule.max-merge-region-size": 999, - } - postData, err = 
json.Marshal(mergeConfig) - suite.NoError(err) - - err = tu.CheckPostJSON(testDialClient, createTTLUrl(suite.urlPrefix, 1), postData, tu.StatusOK(re)) - suite.NoError(err) - suite.Equal(uint64(999), suite.svr.GetPersistOptions().GetMaxMergeRegionSize()) - // max-merge-region-keys should keep consistence with max-merge-region-size. - suite.Equal(uint64(999*10000), suite.svr.GetPersistOptions().GetMaxMergeRegionKeys()) - - // on invalid value, we use default config - mergeConfig = map[string]interface{}{ - "schedule.enable-tikv-split-region": "invalid", - } - postData, err = json.Marshal(mergeConfig) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, createTTLUrl(suite.urlPrefix, 1), postData, tu.StatusOK(re)) - suite.NoError(err) - suite.True(suite.svr.GetPersistOptions().IsTikvRegionSplitEnabled()) -} - -func (suite *configTestSuite) TestTTLConflict() { - addr := createTTLUrl(suite.urlPrefix, 1) - postData, err := json.Marshal(ttlConfig) - suite.NoError(err) - re := suite.Require() - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) - assertTTLConfig(suite.svr.GetPersistOptions(), suite.Equal) - - cfg := map[string]interface{}{"max-snapshot-count": 30} - postData, err = json.Marshal(cfg) - suite.NoError(err) - addr = fmt.Sprintf("%s/config", suite.urlPrefix) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusNotOK(re), tu.StringEqual(re, "\"need to clean up TTL first for schedule.max-snapshot-count\"\n")) - suite.NoError(err) - addr = fmt.Sprintf("%s/config/schedule", suite.urlPrefix) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusNotOK(re), tu.StringEqual(re, "\"need to clean up TTL first for schedule.max-snapshot-count\"\n")) - suite.NoError(err) - cfg = map[string]interface{}{"schedule.max-snapshot-count": 30} - postData, err = json.Marshal(cfg) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, createTTLUrl(suite.urlPrefix, 0), postData, tu.StatusOK(re)) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) - suite.NoError(err) -} diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go index 3793c09d883..15c66ce5829 100644 --- a/tests/integrations/mcs/scheduling/api_test.go +++ b/tests/integrations/mcs/scheduling/api_test.go @@ -13,6 +13,7 @@ import ( "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/core" _ "github.com/tikv/pd/pkg/mcs/scheduling/server/apis/v1" + "github.com/tikv/pd/pkg/mcs/scheduling/server/config" "github.com/tikv/pd/pkg/schedule/handler" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/storage" @@ -242,6 +243,88 @@ func (suite *apiTestSuite) TestAPIForward() { re.NoError(err) } +func (suite *apiTestSuite) TestConfig() { + checkConfig := func(cluster *tests.TestCluster) { + re := suite.Require() + s := cluster.GetSchedulingPrimaryServer() + testutil.Eventually(re, func() bool { + return s.IsServing() + }, testutil.WithWaitFor(5*time.Second), testutil.WithTickInterval(50*time.Millisecond)) + addr := s.GetAddr() + urlPrefix := fmt.Sprintf("%s/scheduling/api/v1/config", addr) + + var cfg config.Config + testutil.ReadGetJSON(re, testDialClient, urlPrefix, &cfg) + suite.Equal(cfg.GetListenAddr(), s.GetConfig().GetListenAddr()) + suite.Equal(cfg.Schedule.LeaderScheduleLimit, s.GetConfig().Schedule.LeaderScheduleLimit) + suite.Equal(cfg.Schedule.EnableCrossTableMerge, s.GetConfig().Schedule.EnableCrossTableMerge) + suite.Equal(cfg.Replication.MaxReplicas, 
s.GetConfig().Replication.MaxReplicas) + suite.Equal(cfg.Replication.LocationLabels, s.GetConfig().Replication.LocationLabels) + suite.Equal(cfg.DataDir, s.GetConfig().DataDir) + testutil.Eventually(re, func() bool { + // wait for all schedulers to be loaded in scheduling server. + return len(cfg.Schedule.SchedulersPayload) == 5 + }) + suite.Contains(cfg.Schedule.SchedulersPayload, "balance-leader-scheduler") + suite.Contains(cfg.Schedule.SchedulersPayload, "balance-region-scheduler") + suite.Contains(cfg.Schedule.SchedulersPayload, "balance-hot-region-scheduler") + suite.Contains(cfg.Schedule.SchedulersPayload, "balance-witness-scheduler") + suite.Contains(cfg.Schedule.SchedulersPayload, "transfer-witness-leader-scheduler") + } + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInAPIMode(checkConfig) +} + +func TestConfigForward(t *testing.T) { + re := require.New(t) + checkConfigForward := func(cluster *tests.TestCluster) { + sche := cluster.GetSchedulingPrimaryServer() + opts := sche.GetPersistConfig() + var cfg map[string]interface{} + addr := cluster.GetLeaderServer().GetAddr() + urlPrefix := fmt.Sprintf("%s/pd/api/v1/config", addr) + + // Test config forward + // Expect to get same config in scheduling server and api server + testutil.Eventually(re, func() bool { + testutil.ReadGetJSON(re, testDialClient, urlPrefix, &cfg) + re.Equal(cfg["schedule"].(map[string]interface{})["leader-schedule-limit"], + float64(opts.GetLeaderScheduleLimit())) + re.Equal(cfg["replication"].(map[string]interface{})["max-replicas"], + float64(opts.GetReplicationConfig().MaxReplicas)) + schedulers := cfg["schedule"].(map[string]interface{})["schedulers-payload"].(map[string]interface{}) + return len(schedulers) == 5 + }) + + // Test to change config in api server + // Expect to get new config in scheduling server and api server + reqData, err := json.Marshal(map[string]interface{}{ + "max-replicas": 4, + }) + re.NoError(err) + err = testutil.CheckPostJSON(testDialClient, urlPrefix, reqData, testutil.StatusOK(re)) + re.NoError(err) + testutil.Eventually(re, func() bool { + testutil.ReadGetJSON(re, testDialClient, urlPrefix, &cfg) + return cfg["replication"].(map[string]interface{})["max-replicas"] == 4. && + opts.GetReplicationConfig().MaxReplicas == 4. 
+ }) + + // Test to change config only in scheduling server + // Expect to get new config in scheduling server but not old config in api server + opts.GetScheduleConfig().LeaderScheduleLimit = 100 + re.Equal(100, int(opts.GetLeaderScheduleLimit())) + testutil.ReadGetJSON(re, testDialClient, urlPrefix, &cfg) + re.Equal(100., cfg["schedule"].(map[string]interface{})["leader-schedule-limit"]) + opts.GetReplicationConfig().MaxReplicas = 5 + re.Equal(5, int(opts.GetReplicationConfig().MaxReplicas)) + testutil.ReadGetJSON(re, testDialClient, urlPrefix, &cfg) + re.Equal(5., cfg["replication"].(map[string]interface{})["max-replicas"]) + } + env := tests.NewSchedulingTestEnvironment(t) + env.RunTestInAPIMode(checkConfigForward) +} + func TestAdminRegionCache(t *testing.T) { re := require.New(t) checkAdminRegionCache := func(cluster *tests.TestCluster) { diff --git a/tests/pdctl/config/config_test.go b/tests/pdctl/config/config_test.go index 6ed0841bf74..26d70bb955f 100644 --- a/tests/pdctl/config/config_test.go +++ b/tests/pdctl/config/config_test.go @@ -25,8 +25,10 @@ import ( "github.com/coreos/go-semver/semver" "github.com/pingcap/kvproto/pkg/metapb" "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" sc "github.com/tikv/pd/pkg/schedule/config" "github.com/tikv/pd/pkg/schedule/placement" + "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/server/config" "github.com/tikv/pd/tests" @@ -48,24 +50,29 @@ func (t *testCase) judge(re *require.Assertions, scheduleConfigs ...*sc.Schedule } } -func TestConfig(t *testing.T) { - re := require.New(t) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - cluster, err := tests.NewTestCluster(ctx, 1) - re.NoError(err) - err = cluster.RunInitialServers() - re.NoError(err) - cluster.WaitLeader() - pdAddr := cluster.GetConfig().GetClientURL() +type configTestSuite struct { + suite.Suite +} + +func TestConfigTestSuite(t *testing.T) { + suite.Run(t, new(configTestSuite)) +} + +func (suite *configTestSuite) TestConfig() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkConfig) +} + +func (suite *configTestSuite) checkConfig(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() cmd := pdctlCmd.GetRootCmd() store := &metapb.Store{ Id: 1, State: metapb.StoreState_Up, } - leaderServer := cluster.GetLeaderServer() - re.NoError(leaderServer.BootstrapCluster()) svr := leaderServer.GetServer() tests.MustPutStore(re, cluster, store) defer cluster.Destroy() @@ -283,16 +290,15 @@ func TestConfig(t *testing.T) { re.Contains(string(output), "is invalid") } -func TestPlacementRules(t *testing.T) { - re := require.New(t) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - cluster, err := tests.NewTestCluster(ctx, 1) - re.NoError(err) - err = cluster.RunInitialServers() - re.NoError(err) - cluster.WaitLeader() - pdAddr := cluster.GetConfig().GetClientURL() +func (suite *configTestSuite) TestPlacementRules() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkPlacementRules) +} + +func (suite *configTestSuite) checkPlacementRules(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() cmd := pdctlCmd.GetRootCmd() store := &metapb.Store{ @@ -300,8 +306,6 @@ func TestPlacementRules(t *testing.T) { State: metapb.StoreState_Up, 
LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetLeaderServer() - re.NoError(leaderServer.BootstrapCluster()) tests.MustPutStore(re, cluster, store) defer cluster.Destroy() @@ -380,16 +384,15 @@ func TestPlacementRules(t *testing.T) { re.Equal([2]string{"pd", "test1"}, rules[0].Key()) } -func TestPlacementRuleGroups(t *testing.T) { - re := require.New(t) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - cluster, err := tests.NewTestCluster(ctx, 1) - re.NoError(err) - err = cluster.RunInitialServers() - re.NoError(err) - cluster.WaitLeader() - pdAddr := cluster.GetConfig().GetClientURL() +func (suite *configTestSuite) TestPlacementRuleGroups() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkPlacementRuleGroups) +} + +func (suite *configTestSuite) checkPlacementRuleGroups(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() cmd := pdctlCmd.GetRootCmd() store := &metapb.Store{ @@ -397,8 +400,6 @@ func TestPlacementRuleGroups(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetLeaderServer() - re.NoError(leaderServer.BootstrapCluster()) tests.MustPutStore(re, cluster, store) defer cluster.Destroy() @@ -454,16 +455,15 @@ func TestPlacementRuleGroups(t *testing.T) { re.Contains(string(output), "404") } -func TestPlacementRuleBundle(t *testing.T) { - re := require.New(t) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - cluster, err := tests.NewTestCluster(ctx, 1) - re.NoError(err) - err = cluster.RunInitialServers() - re.NoError(err) - cluster.WaitLeader() - pdAddr := cluster.GetConfig().GetClientURL() +func (suite *configTestSuite) TestPlacementRuleBundle() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkPlacementRuleBundle) +} + +func (suite *configTestSuite) checkPlacementRuleBundle(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() cmd := pdctlCmd.GetRootCmd() store := &metapb.Store{ @@ -471,8 +471,6 @@ func TestPlacementRuleBundle(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetLeaderServer() - re.NoError(leaderServer.BootstrapCluster()) tests.MustPutStore(re, cluster, store) defer cluster.Destroy() @@ -648,24 +646,21 @@ func TestReplicationMode(t *testing.T) { check() } -func TestUpdateDefaultReplicaConfig(t *testing.T) { - re := require.New(t) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - cluster, err := tests.NewTestCluster(ctx, 1) - re.NoError(err) - err = cluster.RunInitialServers() - re.NoError(err) - cluster.WaitLeader() - pdAddr := cluster.GetConfig().GetClientURL() +func (suite *configTestSuite) TestUpdateDefaultReplicaConfig() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkUpdateDefaultReplicaConfig) +} + +func (suite *configTestSuite) checkUpdateDefaultReplicaConfig(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() cmd := pdctlCmd.GetRootCmd() store := &metapb.Store{ Id: 1, State: metapb.StoreState_Up, } - leaderServer := cluster.GetLeaderServer() - re.NoError(leaderServer.BootstrapCluster()) tests.MustPutStore(re, cluster, store) defer cluster.Destroy() @@ -675,7 +670,9 @@ func 
TestUpdateDefaultReplicaConfig(t *testing.T) { re.NoError(err) replicationCfg := sc.ReplicationConfig{} re.NoError(json.Unmarshal(output, &replicationCfg)) - re.Equal(expect, replicationCfg.MaxReplicas) + testutil.Eventually(re, func() bool { // wait for the config to be synced to the scheduling server + return replicationCfg.MaxReplicas == expect + }) } checkLocationLabels := func(expect int) { @@ -684,7 +681,9 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.NoError(err) replicationCfg := sc.ReplicationConfig{} re.NoError(json.Unmarshal(output, &replicationCfg)) - re.Len(replicationCfg.LocationLabels, expect) + testutil.Eventually(re, func() bool { // wait for the config to be synced to the scheduling server + return len(replicationCfg.LocationLabels) == expect + }) } checkIsolationLevel := func(expect string) { @@ -693,7 +692,9 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.NoError(err) replicationCfg := sc.ReplicationConfig{} re.NoError(json.Unmarshal(output, &replicationCfg)) - re.Equal(replicationCfg.IsolationLevel, expect) + testutil.Eventually(re, func() bool { // wait for the config to be synced to the scheduling server + return replicationCfg.IsolationLevel == expect + }) } checkRuleCount := func(expect int) { @@ -702,7 +703,9 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.NoError(err) rule := placement.Rule{} re.NoError(json.Unmarshal(output, &rule)) - re.Equal(expect, rule.Count) + testutil.Eventually(re, func() bool { // wait for the config to be synced to the scheduling server + return rule.Count == expect + }) } checkRuleLocationLabels := func(expect int) { @@ -711,7 +714,9 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.NoError(err) rule := placement.Rule{} re.NoError(json.Unmarshal(output, &rule)) - re.Len(rule.LocationLabels, expect) + testutil.Eventually(re, func() bool { // wait for the config to be synced to the scheduling server + return len(rule.LocationLabels) == expect + }) } checkRuleIsolationLevel := func(expect string) { @@ -720,7 +725,9 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.NoError(err) rule := placement.Rule{} re.NoError(json.Unmarshal(output, &rule)) - re.Equal(rule.IsolationLevel, expect) + testutil.Eventually(re, func() bool { // wait for the config to be synced to the scheduling server + return rule.IsolationLevel == expect + }) } // update successfully when placement rules is not enabled. @@ -764,7 +771,7 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { checkRuleIsolationLevel("host") // update unsuccessfully when many rule exists. 
- fname := t.TempDir() + fname := suite.T().TempDir() rules := []placement.Rule{ { GroupID: "pd", @@ -791,16 +798,15 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { checkRuleIsolationLevel("host") } -func TestPDServerConfig(t *testing.T) { - re := require.New(t) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - cluster, err := tests.NewTestCluster(ctx, 1) - re.NoError(err) - err = cluster.RunInitialServers() - re.NoError(err) - cluster.WaitLeader() - pdAddr := cluster.GetConfig().GetClientURL() +func (suite *configTestSuite) TestPDServerConfig() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkPDServerConfig) +} + +func (suite *configTestSuite) checkPDServerConfig(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() cmd := pdctlCmd.GetRootCmd() store := &metapb.Store{ @@ -808,8 +814,6 @@ func TestPDServerConfig(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetLeaderServer() - re.NoError(leaderServer.BootstrapCluster()) tests.MustPutStore(re, cluster, store) defer cluster.Destroy() diff --git a/tests/pdctl/scheduler/scheduler_test.go b/tests/pdctl/scheduler/scheduler_test.go index 3554b828269..cd599405124 100644 --- a/tests/pdctl/scheduler/scheduler_test.go +++ b/tests/pdctl/scheduler/scheduler_test.go @@ -46,7 +46,6 @@ func TestSchedulerTestSuite(t *testing.T) { func (suite *schedulerTestSuite) TestScheduler() { env := tests.NewSchedulingTestEnvironment(suite.T()) env.RunTestInTwoModes(suite.checkScheduler) - env.RunTestInTwoModes(suite.checkSchedulerDiagnostic) } func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { @@ -414,8 +413,10 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler", "show"}, &conf) re.Equal(4., conf["batch"]) mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler", "set", "batch", "3"}, nil) - mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler"}, &conf1) - re.Equal(3., conf1["batch"]) + testutil.Eventually(re, func() bool { + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler"}, &conf1) + return conf1["batch"] == 3. 
+ }) echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-leader-scheduler"}, nil) re.NotContains(echo, "Success!") echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-leader-scheduler"}, nil) @@ -494,6 +495,11 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *tests.TestCluster) { checkSchedulerWithStatusCommand("disabled", nil) } +func (suite *schedulerTestSuite) TestSchedulerDiagnostic() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkSchedulerDiagnostic) +} + func (suite *schedulerTestSuite) checkSchedulerDiagnostic(cluster *tests.TestCluster) { re := suite.Require() pdAddr := cluster.GetConfig().GetClientURL() diff --git a/tests/server/api/operator_test.go b/tests/server/api/operator_test.go index 83ab0f3c7ed..908daf21aac 100644 --- a/tests/server/api/operator_test.go +++ b/tests/server/api/operator_test.go @@ -51,7 +51,7 @@ func TestOperatorTestSuite(t *testing.T) { suite.Run(t, new(operatorTestSuite)) } -func (suite *operatorTestSuite) TestOperator() { +func (suite *operatorTestSuite) TestAddRemovePeer() { opts := []tests.ConfigOption{ func(conf *config.Config, serverName string) { conf.Replication.MaxReplicas = 1 @@ -59,17 +59,6 @@ func (suite *operatorTestSuite) TestOperator() { } env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) env.RunTestInTwoModes(suite.checkAddRemovePeer) - - env = tests.NewSchedulingTestEnvironment(suite.T(), opts...) - env.RunTestInTwoModes(suite.checkMergeRegionOperator) - - opts = []tests.ConfigOption{ - func(conf *config.Config, serverName string) { - conf.Replication.MaxReplicas = 3 - }, - } - env = tests.NewSchedulingTestEnvironment(suite.T(), opts...) - env.RunTestInTwoModes(suite.checkTransferRegionWithPlacementRule) } func (suite *operatorTestSuite) checkAddRemovePeer(cluster *tests.TestCluster) { @@ -178,6 +167,16 @@ func (suite *operatorTestSuite) checkAddRemovePeer(cluster *tests.TestCluster) { suite.NoError(err) } +func (suite *operatorTestSuite) TestMergeRegionOperator() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.Replication.MaxReplicas = 1 + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) + env.RunTestInTwoModes(suite.checkMergeRegionOperator) +} + func (suite *operatorTestSuite) checkMergeRegionOperator(cluster *tests.TestCluster) { re := suite.Require() suite.pauseRuleChecker(cluster) @@ -204,6 +203,16 @@ func (suite *operatorTestSuite) checkMergeRegionOperator(cluster *tests.TestClus suite.NoError(err) } +func (suite *operatorTestSuite) TestTransferRegionWithPlacementRule() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.Replication.MaxReplicas = 3 + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) 
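+ // Run the same check in both deployment modes: a plain PD cluster and an
+ // API-service cluster backed by a separate scheduling server.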
+ env.RunTestInTwoModes(suite.checkTransferRegionWithPlacementRule) +} + func (suite *operatorTestSuite) checkTransferRegionWithPlacementRule(cluster *tests.TestCluster) { re := suite.Require() suite.pauseRuleChecker(cluster) diff --git a/tests/server/api/scheduler_test.go b/tests/server/api/scheduler_test.go index 9db94e8562d..38f691a4eda 100644 --- a/tests/server/api/scheduler_test.go +++ b/tests/server/api/scheduler_test.go @@ -42,13 +42,9 @@ func TestScheduleTestSuite(t *testing.T) { suite.Run(t, new(scheduleTestSuite)) } -func (suite *scheduleTestSuite) TestScheduler() { +func (suite *scheduleTestSuite) TestOriginAPI() { env := tests.NewSchedulingTestEnvironment(suite.T()) env.RunTestInTwoModes(suite.checkOriginAPI) - env = tests.NewSchedulingTestEnvironment(suite.T()) - env.RunTestInTwoModes(suite.checkAPI) - env = tests.NewSchedulingTestEnvironment(suite.T()) - env.RunTestInTwoModes(suite.checkDisable) } func (suite *scheduleTestSuite) checkOriginAPI(cluster *tests.TestCluster) { @@ -115,6 +111,11 @@ func (suite *scheduleTestSuite) checkOriginAPI(cluster *tests.TestCluster) { suite.NoError(err) } +func (suite *scheduleTestSuite) TestAPI() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkAPI) +} + func (suite *scheduleTestSuite) checkAPI(cluster *tests.TestCluster) { re := suite.Require() leaderAddr := cluster.GetLeaderServer().GetAddr() @@ -153,9 +154,12 @@ func (suite *scheduleTestSuite) checkAPI(cluster *tests.TestCluster) { body, err := json.Marshal(dataMap) suite.NoError(err) suite.NoError(tu.CheckPostJSON(testDialClient, updateURL, body, tu.StatusOK(re))) - resp = make(map[string]interface{}) - suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) - suite.Equal(3.0, resp["batch"]) + tu.Eventually(re, func() bool { // wait for scheduling server to be synced. + resp = make(map[string]interface{}) + suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) + return resp["batch"] == 3.0 + }) + // update again err = tu.CheckPostJSON(testDialClient, updateURL, body, tu.StatusOK(re), @@ -556,6 +560,11 @@ func (suite *scheduleTestSuite) checkAPI(cluster *tests.TestCluster) { } } +func (suite *scheduleTestSuite) TestDisable() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkDisable) +} + func (suite *scheduleTestSuite) checkDisable(cluster *tests.TestCluster) { re := suite.Require() leaderAddr := cluster.GetLeaderServer().GetAddr() diff --git a/tests/server/config/config_test.go b/tests/server/config/config_test.go index 1b2178bde33..8d8cf40e692 100644 --- a/tests/server/config/config_test.go +++ b/tests/server/config/config_test.go @@ -18,17 +18,25 @@ import ( "bytes" "context" "encoding/json" + "fmt" "net/http" "testing" + "time" "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/ratelimit" + sc "github.com/tikv/pd/pkg/schedule/config" + tu "github.com/tikv/pd/pkg/utils/testutil" + "github.com/tikv/pd/pkg/utils/typeutil" + "github.com/tikv/pd/pkg/versioninfo" "github.com/tikv/pd/server" + "github.com/tikv/pd/server/config" "github.com/tikv/pd/tests" ) -// dialClient used to dial http request. -var dialClient = &http.Client{ +// testDialClient used to dial http request. 
+var testDialClient = &http.Client{ Transport: &http.Transport{ DisableKeepAlives: true, }, @@ -56,7 +64,7 @@ func TestRateLimitConfigReload(t *testing.T) { data, err := json.Marshal(input) re.NoError(err) req, _ := http.NewRequest(http.MethodPost, leader.GetAddr()+"/pd/api/v1/service-middleware/config", bytes.NewBuffer(data)) - resp, err := dialClient.Do(req) + resp, err := testDialClient.Do(req) re.NoError(err) resp.Body.Close() re.True(leader.GetServer().GetServiceMiddlewarePersistOptions().IsRateLimitEnabled()) @@ -74,3 +82,520 @@ func TestRateLimitConfigReload(t *testing.T) { re.True(leader.GetServer().GetServiceMiddlewarePersistOptions().IsRateLimitEnabled()) re.Len(leader.GetServer().GetServiceMiddlewarePersistOptions().GetRateLimitConfig().LimiterConfig, 1) } + +type configTestSuite struct { + suite.Suite +} + +func TestConfigTestSuite(t *testing.T) { + suite.Run(t, new(configTestSuite)) +} + +func (suite *configTestSuite) TestConfigAll() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkConfigAll) +} + +func (suite *configTestSuite) checkConfigAll(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + urlPrefix := leaderServer.GetAddr() + + addr := fmt.Sprintf("%s/pd/api/v1/config", urlPrefix) + cfg := &config.Config{} + tu.Eventually(re, func() bool { + err := tu.ReadGetJSON(re, testDialClient, addr, cfg) + suite.NoError(err) + return cfg.PDServerCfg.DashboardAddress != "auto" + }) + + // the original way + r := map[string]int{"max-replicas": 5} + postData, err := json.Marshal(r) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + l := map[string]interface{}{ + "location-labels": "zone,rack", + "region-schedule-limit": 10, + } + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + l = map[string]interface{}{ + "metric-storage": "http://127.0.0.1:9090", + } + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + newCfg := &config.Config{} + err = tu.ReadGetJSON(re, testDialClient, addr, newCfg) + suite.NoError(err) + cfg.Replication.MaxReplicas = 5 + cfg.Replication.LocationLabels = []string{"zone", "rack"} + cfg.Schedule.RegionScheduleLimit = 10 + cfg.PDServerCfg.MetricStorage = "http://127.0.0.1:9090" + suite.Equal(newCfg, cfg) + + // the new way + l = map[string]interface{}{ + "schedule.tolerant-size-ratio": 2.5, + "schedule.enable-tikv-split-region": "false", + "replication.location-labels": "idc,host", + "pd-server.metric-storage": "http://127.0.0.1:1234", + "log.level": "warn", + "cluster-version": "v4.0.0-beta", + "replication-mode.replication-mode": "dr-auto-sync", + "replication-mode.dr-auto-sync.label-key": "foobar", + } + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + newCfg1 := &config.Config{} + err = tu.ReadGetJSON(re, testDialClient, addr, newCfg1) + suite.NoError(err) + cfg.Schedule.EnableTiKVSplitRegion = false + cfg.Schedule.TolerantSizeRatio = 2.5 + cfg.Replication.LocationLabels = []string{"idc", "host"} + cfg.PDServerCfg.MetricStorage = "http://127.0.0.1:1234" + cfg.Log.Level = "warn" + cfg.ReplicationMode.DRAutoSync.LabelKey = "foobar" + cfg.ReplicationMode.ReplicationMode = "dr-auto-sync" + v, err := 
versioninfo.ParseVersion("v4.0.0-beta") + suite.NoError(err) + cfg.ClusterVersion = *v + suite.Equal(cfg, newCfg1) + + // revert this to avoid it affects TestConfigTTL + l["schedule.enable-tikv-split-region"] = "true" + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + // illegal prefix + l = map[string]interface{}{ + "replicate.max-replicas": 1, + } + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, + tu.StatusNotOK(re), + tu.StringContain(re, "not found")) + suite.NoError(err) + + // update prefix directly + l = map[string]interface{}{ + "replication-mode": nil, + } + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, + tu.StatusNotOK(re), + tu.StringContain(re, "cannot update config prefix")) + suite.NoError(err) + + // config item not found + l = map[string]interface{}{ + "schedule.region-limit": 10, + } + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusNotOK(re), tu.StringContain(re, "not found")) + suite.NoError(err) +} + +func (suite *configTestSuite) TestConfigSchedule() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkConfigSchedule) +} + +func (suite *configTestSuite) checkConfigSchedule(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + urlPrefix := leaderServer.GetAddr() + + addr := fmt.Sprintf("%s/pd/api/v1/config/schedule", urlPrefix) + + scheduleConfig := &sc.ScheduleConfig{} + suite.NoError(tu.ReadGetJSON(re, testDialClient, addr, scheduleConfig)) + scheduleConfig.MaxStoreDownTime.Duration = time.Second + postData, err := json.Marshal(scheduleConfig) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + scheduleConfig1 := &sc.ScheduleConfig{} + suite.NoError(tu.ReadGetJSON(re, testDialClient, addr, scheduleConfig1)) + suite.Equal(*scheduleConfig1, *scheduleConfig) +} + +func (suite *configTestSuite) TestConfigReplication() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkConfigReplication) +} + +func (suite *configTestSuite) checkConfigReplication(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + urlPrefix := leaderServer.GetAddr() + + addr := fmt.Sprintf("%s/pd/api/v1/config/replicate", urlPrefix) + rc := &sc.ReplicationConfig{} + err := tu.ReadGetJSON(re, testDialClient, addr, rc) + suite.NoError(err) + + rc.MaxReplicas = 5 + rc1 := map[string]int{"max-replicas": 5} + postData, err := json.Marshal(rc1) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + rc.LocationLabels = []string{"zone", "rack"} + rc2 := map[string]string{"location-labels": "zone,rack"} + postData, err = json.Marshal(rc2) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + rc.IsolationLevel = "zone" + rc3 := map[string]string{"isolation-level": "zone"} + postData, err = json.Marshal(rc3) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + rc4 := &sc.ReplicationConfig{} + err = tu.ReadGetJSON(re, testDialClient, addr, rc4) + suite.NoError(err) + + suite.Equal(*rc4, *rc) 
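+ // rc has been updated locally after every POST above, so the final GET must
+ // return exactly the accumulated replication config.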
+} + +func (suite *configTestSuite) TestConfigLabelProperty() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkConfigLabelProperty) +} + +func (suite *configTestSuite) checkConfigLabelProperty(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + urlPrefix := leaderServer.GetAddr() + + addr := urlPrefix + "/pd/api/v1/config/label-property" + loadProperties := func() config.LabelPropertyConfig { + var cfg config.LabelPropertyConfig + err := tu.ReadGetJSON(re, testDialClient, addr, &cfg) + suite.NoError(err) + return cfg + } + + cfg := loadProperties() + suite.Empty(cfg) + + cmds := []string{ + `{"type": "foo", "action": "set", "label-key": "zone", "label-value": "cn1"}`, + `{"type": "foo", "action": "set", "label-key": "zone", "label-value": "cn2"}`, + `{"type": "bar", "action": "set", "label-key": "host", "label-value": "h1"}`, + } + for _, cmd := range cmds { + err := tu.CheckPostJSON(testDialClient, addr, []byte(cmd), tu.StatusOK(re)) + suite.NoError(err) + } + + cfg = loadProperties() + suite.Len(cfg, 2) + suite.Equal([]config.StoreLabel{ + {Key: "zone", Value: "cn1"}, + {Key: "zone", Value: "cn2"}, + }, cfg["foo"]) + suite.Equal([]config.StoreLabel{{Key: "host", Value: "h1"}}, cfg["bar"]) + + cmds = []string{ + `{"type": "foo", "action": "delete", "label-key": "zone", "label-value": "cn1"}`, + `{"type": "bar", "action": "delete", "label-key": "host", "label-value": "h1"}`, + } + for _, cmd := range cmds { + err := tu.CheckPostJSON(testDialClient, addr, []byte(cmd), tu.StatusOK(re)) + suite.NoError(err) + } + + cfg = loadProperties() + suite.Len(cfg, 1) + suite.Equal([]config.StoreLabel{{Key: "zone", Value: "cn2"}}, cfg["foo"]) +} + +func (suite *configTestSuite) TestConfigDefault() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkConfigDefault) +} + +func (suite *configTestSuite) checkConfigDefault(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + urlPrefix := leaderServer.GetAddr() + + addr := urlPrefix + "/pd/api/v1/config" + + r := map[string]int{"max-replicas": 5} + postData, err := json.Marshal(r) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + l := map[string]interface{}{ + "location-labels": "zone,rack", + "region-schedule-limit": 10, + } + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + l = map[string]interface{}{ + "metric-storage": "http://127.0.0.1:9090", + } + postData, err = json.Marshal(l) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + + addr = fmt.Sprintf("%s/pd/api/v1/config/default", urlPrefix) + defaultCfg := &config.Config{} + err = tu.ReadGetJSON(re, testDialClient, addr, defaultCfg) + suite.NoError(err) + + suite.Equal(uint64(3), defaultCfg.Replication.MaxReplicas) + suite.Equal(typeutil.StringSlice([]string{}), defaultCfg.Replication.LocationLabels) + suite.Equal(uint64(2048), defaultCfg.Schedule.RegionScheduleLimit) + suite.Equal("", defaultCfg.PDServerCfg.MetricStorage) +} + +func (suite *configTestSuite) TestConfigPDServer() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + env.RunTestInTwoModes(suite.checkConfigPDServer) +} + +func (suite *configTestSuite) checkConfigPDServer(cluster *tests.TestCluster) { + re := suite.Require() + 
leaderServer := cluster.GetLeaderServer() + urlPrefix := leaderServer.GetAddr() + + addrPost := urlPrefix + "/pd/api/v1/config" + ms := map[string]interface{}{ + "metric-storage": "", + } + postData, err := json.Marshal(ms) + suite.NoError(err) + suite.NoError(tu.CheckPostJSON(testDialClient, addrPost, postData, tu.StatusOK(re))) + addrGet := fmt.Sprintf("%s/pd/api/v1/config/pd-server", urlPrefix) + sc := &config.PDServerConfig{} + suite.NoError(tu.ReadGetJSON(re, testDialClient, addrGet, sc)) + suite.Equal(bool(true), sc.UseRegionStorage) + suite.Equal("table", sc.KeyType) + suite.Equal(typeutil.StringSlice([]string{}), sc.RuntimeServices) + suite.Equal("", sc.MetricStorage) + suite.Equal("auto", sc.DashboardAddress) + suite.Equal(int(3), sc.FlowRoundByDigit) + suite.Equal(typeutil.NewDuration(time.Second), sc.MinResolvedTSPersistenceInterval) + suite.Equal(24*time.Hour, sc.MaxResetTSGap.Duration) +} + +var ttlConfig = map[string]interface{}{ + "schedule.max-snapshot-count": 999, + "schedule.enable-location-replacement": false, + "schedule.max-merge-region-size": 999, + "schedule.max-merge-region-keys": 999, + "schedule.scheduler-max-waiting-operator": 999, + "schedule.leader-schedule-limit": 999, + "schedule.region-schedule-limit": 999, + "schedule.hot-region-schedule-limit": 999, + "schedule.replica-schedule-limit": 999, + "schedule.merge-schedule-limit": 999, + "schedule.enable-tikv-split-region": false, +} + +var invalidTTLConfig = map[string]interface{}{ + "schedule.invalid-ttl-config": 0, +} + +type ttlConfigInterface interface { + GetMaxSnapshotCount() uint64 + IsLocationReplacementEnabled() bool + GetMaxMergeRegionSize() uint64 + GetMaxMergeRegionKeys() uint64 + GetSchedulerMaxWaitingOperator() uint64 + GetLeaderScheduleLimit() uint64 + GetRegionScheduleLimit() uint64 + GetHotRegionScheduleLimit() uint64 + GetReplicaScheduleLimit() uint64 + GetMergeScheduleLimit() uint64 + IsTikvRegionSplitEnabled() bool +} + +func (suite *configTestSuite) assertTTLConfig( + cluster *tests.TestCluster, + expectedEqual bool, +) { + equality := suite.Equal + if !expectedEqual { + equality = suite.NotEqual + } + checkfunc := func(options ttlConfigInterface) { + equality(uint64(999), options.GetMaxSnapshotCount()) + equality(false, options.IsLocationReplacementEnabled()) + equality(uint64(999), options.GetMaxMergeRegionSize()) + equality(uint64(999), options.GetMaxMergeRegionKeys()) + equality(uint64(999), options.GetSchedulerMaxWaitingOperator()) + equality(uint64(999), options.GetLeaderScheduleLimit()) + equality(uint64(999), options.GetRegionScheduleLimit()) + equality(uint64(999), options.GetHotRegionScheduleLimit()) + equality(uint64(999), options.GetReplicaScheduleLimit()) + equality(uint64(999), options.GetMergeScheduleLimit()) + equality(false, options.IsTikvRegionSplitEnabled()) + } + checkfunc(cluster.GetLeaderServer().GetServer().GetPersistOptions()) + if cluster.GetSchedulingPrimaryServer() != nil { + // wait for the scheduling primary server to be synced + options := cluster.GetSchedulingPrimaryServer().GetPersistConfig() + tu.Eventually(suite.Require(), func() bool { + if expectedEqual { + return uint64(999) == options.GetMaxSnapshotCount() + } + return uint64(999) != options.GetMaxSnapshotCount() + }) + checkfunc(options) + } +} + +func (suite *configTestSuite) assertTTLConfigItemEqaul( + cluster *tests.TestCluster, + item string, + expectedValue interface{}, +) { + checkfunc := func(options ttlConfigInterface) bool { + switch item { + case "max-merge-region-size": + return 
expectedValue.(uint64) == options.GetMaxMergeRegionSize() + case "max-merge-region-keys": + return expectedValue.(uint64) == options.GetMaxMergeRegionKeys() + case "enable-tikv-split-region": + return expectedValue.(bool) == options.IsTikvRegionSplitEnabled() + } + return false + } + suite.True(checkfunc(cluster.GetLeaderServer().GetServer().GetPersistOptions())) + if cluster.GetSchedulingPrimaryServer() != nil { + // wait for the scheduling primary server to be synced + tu.Eventually(suite.Require(), func() bool { + return checkfunc(cluster.GetSchedulingPrimaryServer().GetPersistConfig()) + }) + } +} + +func createTTLUrl(url string, ttl int) string { + return fmt.Sprintf("%s/pd/api/v1/config?ttlSecond=%d", url, ttl) +} + +func (suite *configTestSuite) TestConfigTTL() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + // FIXME: enable this test in two modes after ttl config is supported. + env.RunTestInPDMode(suite.checkConfigTTL) +} + +func (suite *configTestSuite) checkConfigTTL(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + urlPrefix := leaderServer.GetAddr() + postData, err := json.Marshal(ttlConfig) + suite.NoError(err) + + // test no config and cleaning up + err = tu.CheckPostJSON(testDialClient, createTTLUrl(urlPrefix, 0), postData, tu.StatusOK(re)) + suite.NoError(err) + suite.assertTTLConfig(cluster, false) + + // test time goes by + err = tu.CheckPostJSON(testDialClient, createTTLUrl(urlPrefix, 1), postData, tu.StatusOK(re)) + suite.NoError(err) + suite.assertTTLConfig(cluster, true) + time.Sleep(2 * time.Second) + suite.assertTTLConfig(cluster, false) + + // test cleaning up + err = tu.CheckPostJSON(testDialClient, createTTLUrl(urlPrefix, 1), postData, tu.StatusOK(re)) + suite.NoError(err) + suite.assertTTLConfig(cluster, true) + err = tu.CheckPostJSON(testDialClient, createTTLUrl(urlPrefix, 0), postData, tu.StatusOK(re)) + suite.NoError(err) + suite.assertTTLConfig(cluster, false) + + postData, err = json.Marshal(invalidTTLConfig) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, createTTLUrl(urlPrefix, 1), postData, + tu.StatusNotOK(re), tu.StringEqual(re, "\"unsupported ttl config schedule.invalid-ttl-config\"\n")) + suite.NoError(err) + + // only set max-merge-region-size + mergeConfig := map[string]interface{}{ + "schedule.max-merge-region-size": 999, + } + postData, err = json.Marshal(mergeConfig) + suite.NoError(err) + + err = tu.CheckPostJSON(testDialClient, createTTLUrl(urlPrefix, 1), postData, tu.StatusOK(re)) + suite.NoError(err) + suite.assertTTLConfigItemEqaul(cluster, "max-merge-region-size", uint64(999)) + // max-merge-region-keys should keep consistence with max-merge-region-size. + suite.assertTTLConfigItemEqaul(cluster, "max-merge-region-keys", uint64(999*10000)) + + // on invalid value, we use default config + mergeConfig = map[string]interface{}{ + "schedule.enable-tikv-split-region": "invalid", + } + postData, err = json.Marshal(mergeConfig) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, createTTLUrl(urlPrefix, 10), postData, tu.StatusOK(re)) + suite.NoError(err) + suite.assertTTLConfigItemEqaul(cluster, "enable-tikv-split-region", true) +} + +func (suite *configTestSuite) TestTTLConflict() { + env := tests.NewSchedulingTestEnvironment(suite.T()) + // FIXME: enable this test in two modes after ttl config is supported. 
+ env.RunTestInPDMode(suite.checkTTLConflict) +} + +func (suite *configTestSuite) checkTTLConflict(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + urlPrefix := leaderServer.GetAddr() + addr := createTTLUrl(urlPrefix, 1) + postData, err := json.Marshal(ttlConfig) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) + suite.assertTTLConfig(cluster, true) + + cfg := map[string]interface{}{"max-snapshot-count": 30} + postData, err = json.Marshal(cfg) + suite.NoError(err) + addr = fmt.Sprintf("%s/pd/api/v1/config", urlPrefix) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusNotOK(re), tu.StringEqual(re, "\"need to clean up TTL first for schedule.max-snapshot-count\"\n")) + suite.NoError(err) + addr = fmt.Sprintf("%s/pd/api/v1/config/schedule", urlPrefix) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusNotOK(re), tu.StringEqual(re, "\"need to clean up TTL first for schedule.max-snapshot-count\"\n")) + suite.NoError(err) + cfg = map[string]interface{}{"schedule.max-snapshot-count": 30} + postData, err = json.Marshal(cfg) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, createTTLUrl(urlPrefix, 0), postData, tu.StatusOK(re)) + suite.NoError(err) + err = tu.CheckPostJSON(testDialClient, addr, postData, tu.StatusOK(re)) + suite.NoError(err) +} From d651c6b91f2cbd0b22ae87a22c06317cdee12462 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Wed, 8 Nov 2023 11:18:42 +0800 Subject: [PATCH 11/20] core: batch get region size (#7252) close tikv/pd#7248 Signed-off-by: nolouch Co-authored-by: nolouch Co-authored-by: ShuNing --- pkg/core/region.go | 36 ++++++++--- pkg/core/region_test.go | 120 +++++++++++++++++++++++++++++++++++ server/cluster/cluster.go | 3 +- tests/server/api/api_test.go | 6 +- 4 files changed, 152 insertions(+), 13 deletions(-) diff --git a/pkg/core/region.go b/pkg/core/region.go index 2ac323a1272..c9daa69c477 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -41,7 +41,10 @@ import ( "go.uber.org/zap" ) -const randomRegionMaxRetry = 10 +const ( + randomRegionMaxRetry = 10 + scanRegionLimit = 1000 +) // errRegionIsStale is error info for region is stale. func errRegionIsStale(region *metapb.Region, origin *metapb.Region) error { @@ -1610,16 +1613,31 @@ func (r *RegionsInfo) ScanRegionWithIterator(startKey []byte, iterator func(regi // GetRegionSizeByRange scans regions intersecting [start key, end key), returns the total region size of this range. 
func (r *RegionsInfo) GetRegionSizeByRange(startKey, endKey []byte) int64 { - r.t.RLock() - defer r.t.RUnlock() var size int64 - r.tree.scanRange(startKey, func(region *RegionInfo) bool { - if len(endKey) > 0 && bytes.Compare(region.GetStartKey(), endKey) >= 0 { - return false + for { + r.t.RLock() + var cnt int + r.tree.scanRange(startKey, func(region *RegionInfo) bool { + if len(endKey) > 0 && bytes.Compare(region.GetStartKey(), endKey) >= 0 { + return false + } + if cnt >= scanRegionLimit { + return false + } + cnt++ + startKey = region.GetEndKey() + size += region.GetApproximateSize() + return true + }) + r.t.RUnlock() + if cnt == 0 { + break } - size += region.GetApproximateSize() - return true - }) + if len(startKey) == 0 { + break + } + } + return size } diff --git a/pkg/core/region_test.go b/pkg/core/region_test.go index 50302de920e..508e7aa59aa 100644 --- a/pkg/core/region_test.go +++ b/pkg/core/region_test.go @@ -18,8 +18,10 @@ import ( "crypto/rand" "fmt" "math" + mrand "math/rand" "strconv" "testing" + "time" "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/metapb" @@ -658,6 +660,124 @@ func BenchmarkRandomRegion(b *testing.B) { } } +func BenchmarkRandomSetRegion(b *testing.B) { + regions := NewRegionsInfo() + var items []*RegionInfo + for i := 0; i < 1000000; i++ { + peer := &metapb.Peer{StoreId: 1, Id: uint64(i + 1)} + region := NewRegionInfo(&metapb.Region{ + Id: uint64(i + 1), + Peers: []*metapb.Peer{peer}, + StartKey: []byte(fmt.Sprintf("%20d", i)), + EndKey: []byte(fmt.Sprintf("%20d", i+1)), + }, peer) + origin, overlaps, rangeChanged := regions.SetRegion(region) + regions.UpdateSubTree(region, origin, overlaps, rangeChanged) + items = append(items, region) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + item := items[i%len(items)] + item.approximateKeys = int64(200000) + item.approximateSize = int64(20) + origin, overlaps, rangeChanged := regions.SetRegion(item) + regions.UpdateSubTree(item, origin, overlaps, rangeChanged) + } +} + +func TestGetRegionSizeByRange(t *testing.T) { + regions := NewRegionsInfo() + nums := 1000010 + for i := 0; i < nums; i++ { + peer := &metapb.Peer{StoreId: 1, Id: uint64(i + 1)} + endKey := []byte(fmt.Sprintf("%20d", i+1)) + if i == nums-1 { + endKey = []byte("") + } + region := NewRegionInfo(&metapb.Region{ + Id: uint64(i + 1), + Peers: []*metapb.Peer{peer}, + StartKey: []byte(fmt.Sprintf("%20d", i)), + EndKey: endKey, + }, peer, SetApproximateSize(10)) + origin, overlaps, rangeChanged := regions.SetRegion(region) + regions.UpdateSubTree(region, origin, overlaps, rangeChanged) + } + totalSize := regions.GetRegionSizeByRange([]byte(""), []byte("")) + require.Equal(t, int64(nums*10), totalSize) + for i := 1; i < 10; i++ { + verifyNum := nums / i + endKey := fmt.Sprintf("%20d", verifyNum) + totalSize := regions.GetRegionSizeByRange([]byte(""), []byte(endKey)) + require.Equal(t, int64(verifyNum*10), totalSize) + } +} + +func BenchmarkRandomSetRegionWithGetRegionSizeByRange(b *testing.B) { + regions := NewRegionsInfo() + var items []*RegionInfo + for i := 0; i < 1000000; i++ { + peer := &metapb.Peer{StoreId: 1, Id: uint64(i + 1)} + region := NewRegionInfo(&metapb.Region{ + Id: uint64(i + 1), + Peers: []*metapb.Peer{peer}, + StartKey: []byte(fmt.Sprintf("%20d", i)), + EndKey: []byte(fmt.Sprintf("%20d", i+1)), + }, peer, SetApproximateSize(10)) + origin, overlaps, rangeChanged := regions.SetRegion(region) + regions.UpdateSubTree(region, origin, overlaps, rangeChanged) + items = append(items, region) + } + b.ResetTimer() + go 
func() { + for { + regions.GetRegionSizeByRange([]byte(""), []byte("")) + time.Sleep(time.Millisecond) + } + }() + for i := 0; i < b.N; i++ { + item := items[i%len(items)] + item.approximateKeys = int64(200000) + origin, overlaps, rangeChanged := regions.SetRegion(item) + regions.UpdateSubTree(item, origin, overlaps, rangeChanged) + } +} + +func BenchmarkRandomSetRegionWithGetRegionSizeByRangeParallel(b *testing.B) { + regions := NewRegionsInfo() + var items []*RegionInfo + for i := 0; i < 1000000; i++ { + peer := &metapb.Peer{StoreId: 1, Id: uint64(i + 1)} + region := NewRegionInfo(&metapb.Region{ + Id: uint64(i + 1), + Peers: []*metapb.Peer{peer}, + StartKey: []byte(fmt.Sprintf("%20d", i)), + EndKey: []byte(fmt.Sprintf("%20d", i+1)), + }, peer) + origin, overlaps, rangeChanged := regions.SetRegion(region) + regions.UpdateSubTree(region, origin, overlaps, rangeChanged) + items = append(items, region) + } + b.ResetTimer() + go func() { + for { + regions.GetRegionSizeByRange([]byte(""), []byte("")) + time.Sleep(time.Millisecond) + } + }() + + b.RunParallel( + func(pb *testing.PB) { + for pb.Next() { + item := items[mrand.Intn(len(items))] + n := item.Clone(SetApproximateSize(20)) + origin, overlaps, rangeChanged := regions.SetRegion(n) + regions.UpdateSubTree(item, origin, overlaps, rangeChanged) + } + }, + ) +} + const keyLength = 100 func randomBytes(n int) []byte { diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 25a47a7fca9..8362ee9f331 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -1846,12 +1846,13 @@ func (c *RaftCluster) checkStores() { if err := c.ReadyToServe(storeID); err != nil { log.Error("change store to serving failed", zap.Stringer("store", store.GetMeta()), + zap.Int("region-count", c.GetTotalRegionCount()), errs.ZapError(err)) } } else if c.IsPrepared() { threshold := c.getThreshold(stores, store) - log.Debug("store serving threshold", zap.Uint64("store-id", storeID), zap.Float64("threshold", threshold)) regionSize := float64(store.GetRegionSize()) + log.Debug("store serving threshold", zap.Uint64("store-id", storeID), zap.Float64("threshold", threshold), zap.Float64("region-size", regionSize)) if regionSize >= threshold { if err := c.ReadyToServe(storeID); err != nil { log.Error("change store to serving failed", diff --git a/tests/server/api/api_test.go b/tests/server/api/api_test.go index ff430f1b848..04bcdc0d461 100644 --- a/tests/server/api/api_test.go +++ b/tests/server/api/api_test.go @@ -914,7 +914,7 @@ func TestPreparingProgress(t *testing.T) { tests.MustPutStore(re, cluster, store) } for i := 0; i < 100; i++ { - tests.MustPutRegion(re, cluster, uint64(i+1), uint64(i)%3+1, []byte(fmt.Sprintf("p%d", i)), []byte(fmt.Sprintf("%d", i+1)), core.SetApproximateSize(10)) + tests.MustPutRegion(re, cluster, uint64(i+1), uint64(i)%3+1, []byte(fmt.Sprintf("%20d", i)), []byte(fmt.Sprintf("%20d", i+1)), core.SetApproximateSize(10)) } // no store preparing output := sendRequest(re, leader.GetAddr()+"/pd/api/v1/stores/progress?action=preparing", http.MethodGet, http.StatusNotFound) @@ -941,8 +941,8 @@ func TestPreparingProgress(t *testing.T) { re.Equal(math.MaxFloat64, p.LeftSeconds) // update size - tests.MustPutRegion(re, cluster, 1000, 4, []byte(fmt.Sprintf("%d", 1000)), []byte(fmt.Sprintf("%d", 1001)), core.SetApproximateSize(10)) - tests.MustPutRegion(re, cluster, 1001, 5, []byte(fmt.Sprintf("%d", 1001)), []byte(fmt.Sprintf("%d", 1002)), core.SetApproximateSize(40)) + tests.MustPutRegion(re, cluster, 1000, 4, 
[]byte(fmt.Sprintf("%20d", 1000)), []byte(fmt.Sprintf("%20d", 1001)), core.SetApproximateSize(10)) + tests.MustPutRegion(re, cluster, 1001, 5, []byte(fmt.Sprintf("%20d", 1001)), []byte(fmt.Sprintf("%20d", 1002)), core.SetApproximateSize(40)) time.Sleep(2 * time.Second) output = sendRequest(re, leader.GetAddr()+"/pd/api/v1/stores/progress?action=preparing", http.MethodGet, http.StatusOK) re.NoError(json.Unmarshal(output, &p)) From 4457ac2717644a39a1ccfaeeb5cfb7ecd0542e99 Mon Sep 17 00:00:00 2001 From: Yongbo Jiang Date: Wed, 8 Nov 2023 12:14:13 +0800 Subject: [PATCH 12/20] mcs/scheduling: fix typo (#7333) ref tikv/pd#5839 Signed-off-by: Cabinfever_B Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/scheduling/server/grpc_service.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/mcs/scheduling/server/grpc_service.go b/pkg/mcs/scheduling/server/grpc_service.go index 79c5c293ee7..b865e917d75 100644 --- a/pkg/mcs/scheduling/server/grpc_service.go +++ b/pkg/mcs/scheduling/server/grpc_service.go @@ -65,7 +65,7 @@ type Service struct { *Server } -// NewService creates a new TSO service. +// NewService creates a new scheduling service. func NewService[T ConfigProvider](svr bs.Server) registry.RegistrableService { server, ok := svr.(*Server) if !ok { @@ -118,7 +118,7 @@ func (s *heartbeatServer) Recv() (*schedulingpb.RegionHeartbeatRequest, error) { return req, nil } -// RegionHeartbeat implements gRPC PDServer. +// RegionHeartbeat implements gRPC SchedulingServer. func (s *Service) RegionHeartbeat(stream schedulingpb.Scheduling_RegionHeartbeatServer) error { var ( server = &heartbeatServer{stream: stream} @@ -168,7 +168,7 @@ func (s *Service) RegionHeartbeat(stream schedulingpb.Scheduling_RegionHeartbeat } } -// StoreHeartbeat implements gRPC PDServer. +// StoreHeartbeat implements gRPC SchedulingServer. func (s *Service) StoreHeartbeat(ctx context.Context, request *schedulingpb.StoreHeartbeatRequest) (*schedulingpb.StoreHeartbeatResponse, error) { c := s.GetCluster() if c == nil { @@ -202,7 +202,7 @@ func (s *Service) SplitRegions(ctx context.Context, request *schedulingpb.SplitR }, nil } -// ScatterRegions implements gRPC PDServer. +// ScatterRegions implements gRPC SchedulingServer. func (s *Service) ScatterRegions(ctx context.Context, request *schedulingpb.ScatterRegionsRequest) (*schedulingpb.ScatterRegionsResponse, error) { c := s.GetCluster() if c == nil { @@ -261,7 +261,7 @@ func (s *Service) GetOperator(ctx context.Context, request *schedulingpb.GetOper }, nil } -// AskBatchSplit implements gRPC PDServer. +// AskBatchSplit implements gRPC SchedulingServer. 
func (s *Service) AskBatchSplit(ctx context.Context, request *schedulingpb.AskBatchSplitRequest) (*schedulingpb.AskBatchSplitResponse, error) { c := s.GetCluster() if c == nil { From a98295c22490b9fdb0e2cb052b68691a9d56f6dc Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Wed, 8 Nov 2023 16:05:14 +0800 Subject: [PATCH 13/20] mcs: fix participant name (#7335) close tikv/pd#7336 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/resourcemanager/server/config.go | 20 ++++++++++++++++++++ pkg/mcs/resourcemanager/server/server.go | 6 +++--- pkg/mcs/scheduling/server/config/config.go | 15 +++++++++++++++ pkg/mcs/scheduling/server/server.go | 8 ++++---- 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/pkg/mcs/resourcemanager/server/config.go b/pkg/mcs/resourcemanager/server/config.go index 3f64b2987fd..10e91612842 100644 --- a/pkg/mcs/resourcemanager/server/config.go +++ b/pkg/mcs/resourcemanager/server/config.go @@ -250,6 +250,26 @@ func (c *Config) adjustLog(meta *configutil.ConfigMetaData) { } } +// GetName returns the Name +func (c *Config) GetName() string { + return c.Name +} + +// GeBackendEndpoints returns the BackendEndpoints +func (c *Config) GeBackendEndpoints() string { + return c.BackendEndpoints +} + +// GetListenAddr returns the ListenAddr +func (c *Config) GetListenAddr() string { + return c.ListenAddr +} + +// GetAdvertiseListenAddr returns the AdvertiseListenAddr +func (c *Config) GetAdvertiseListenAddr() string { + return c.AdvertiseListenAddr +} + // GetTLSConfig returns the TLS config. func (c *Config) GetTLSConfig() *grpcutil.TLSConfig { return &c.Security.TLSConfig diff --git a/pkg/mcs/resourcemanager/server/server.go b/pkg/mcs/resourcemanager/server/server.go index 47248208c8a..7b660c07605 100644 --- a/pkg/mcs/resourcemanager/server/server.go +++ b/pkg/mcs/resourcemanager/server/server.go @@ -296,14 +296,14 @@ func (s *Server) startServer() (err error) { // different service modes provided by the same pd-server binary serverInfo.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix())) - uniqueName := s.cfg.ListenAddr + uniqueName := s.cfg.GetAdvertiseListenAddr() uniqueID := memberutil.GenerateUniqueID(uniqueName) log.Info("joining primary election", zap.String("participant-name", uniqueName), zap.Uint64("participant-id", uniqueID)) s.participant = member.NewParticipant(s.GetClient(), utils.ResourceManagerServiceName) p := &resource_manager.Participant{ Name: uniqueName, Id: uniqueID, // id is unique among all participants - ListenUrls: []string{s.cfg.AdvertiseListenAddr}, + ListenUrls: []string{s.cfg.GetAdvertiseListenAddr()}, } s.participant.InitInfo(p, endpoint.ResourceManagerSvcRootPath(s.clusterID), utils.PrimaryKey, "primary election") @@ -312,7 +312,7 @@ func (s *Server) startServer() (err error) { manager: NewManager[*Server](s), } - if err := s.InitListener(s.GetTLSConfig(), s.cfg.ListenAddr); err != nil { + if err := s.InitListener(s.GetTLSConfig(), s.cfg.GetListenAddr()); err != nil { return err } diff --git a/pkg/mcs/scheduling/server/config/config.go b/pkg/mcs/scheduling/server/config/config.go index 772eab835f1..a211c989c64 100644 --- a/pkg/mcs/scheduling/server/config/config.go +++ b/pkg/mcs/scheduling/server/config/config.go @@ -164,11 +164,26 @@ func (c *Config) adjustLog(meta *configutil.ConfigMetaData) { } } +// GetName returns the Name +func (c *Config) GetName() string { + return c.Name +} + +// GeBackendEndpoints returns the 
BackendEndpoints +func (c *Config) GeBackendEndpoints() string { + return c.BackendEndpoints +} + // GetListenAddr returns the ListenAddr func (c *Config) GetListenAddr() string { return c.ListenAddr } +// GetAdvertiseListenAddr returns the AdvertiseListenAddr +func (c *Config) GetAdvertiseListenAddr() string { + return c.AdvertiseListenAddr +} + // GetTLSConfig returns the TLS config. func (c *Config) GetTLSConfig() *grpcutil.TLSConfig { return &c.Security.TLSConfig diff --git a/pkg/mcs/scheduling/server/server.go b/pkg/mcs/scheduling/server/server.go index 1790cb2b4be..4304ffb218a 100644 --- a/pkg/mcs/scheduling/server/server.go +++ b/pkg/mcs/scheduling/server/server.go @@ -405,21 +405,21 @@ func (s *Server) startServer() (err error) { // different service modes provided by the same pd-server binary serverInfo.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix())) - uniqueName := s.cfg.ListenAddr + uniqueName := s.cfg.GetAdvertiseListenAddr() uniqueID := memberutil.GenerateUniqueID(uniqueName) log.Info("joining primary election", zap.String("participant-name", uniqueName), zap.Uint64("participant-id", uniqueID)) s.participant = member.NewParticipant(s.GetClient(), utils.SchedulingServiceName) p := &schedulingpb.Participant{ Name: uniqueName, Id: uniqueID, // id is unique among all participants - ListenUrls: []string{s.cfg.AdvertiseListenAddr}, + ListenUrls: []string{s.cfg.GetAdvertiseListenAddr()}, } s.participant.InitInfo(p, endpoint.SchedulingSvcRootPath(s.clusterID), utils.PrimaryKey, "primary election") s.service = &Service{Server: s} s.AddServiceReadyCallback(s.startCluster) s.AddServiceExitCallback(s.stopCluster) - if err := s.InitListener(s.GetTLSConfig(), s.cfg.ListenAddr); err != nil { + if err := s.InitListener(s.GetTLSConfig(), s.cfg.GetListenAddr()); err != nil { return err } @@ -443,7 +443,7 @@ func (s *Server) startServer() (err error) { return err } s.serviceRegister = discovery.NewServiceRegister(s.Context(), s.GetClient(), strconv.FormatUint(s.clusterID, 10), - utils.SchedulingServiceName, s.cfg.AdvertiseListenAddr, serializedEntry, discovery.DefaultLeaseInSeconds) + utils.SchedulingServiceName, s.cfg.GetAdvertiseListenAddr(), serializedEntry, discovery.DefaultLeaseInSeconds) if err := s.serviceRegister.Register(); err != nil { log.Error("failed to register the service", zap.String("service-name", utils.SchedulingServiceName), errs.ZapError(err)) return err From d189a42894f5e2d957f9f4da0790cbe1468558da Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Wed, 8 Nov 2023 17:08:12 +0800 Subject: [PATCH 14/20] mcs: solve stream error when forward tso (#7327) close tikv/pd#7320 Signed-off-by: lhy1024 --- pkg/utils/grpcutil/grpcutil.go | 14 +++++++ server/gc_service.go | 11 +----- server/grpc_service.go | 71 +++++++++++++++++----------------- 3 files changed, 50 insertions(+), 46 deletions(-) diff --git a/pkg/utils/grpcutil/grpcutil.go b/pkg/utils/grpcutil/grpcutil.go index ee9d85a4ee1..44d45ff4c70 100644 --- a/pkg/utils/grpcutil/grpcutil.go +++ b/pkg/utils/grpcutil/grpcutil.go @@ -18,7 +18,9 @@ import ( "context" "crypto/tls" "crypto/x509" + "io" "net/url" + "strings" "time" "github.com/pingcap/errors" @@ -28,6 +30,7 @@ import ( "go.etcd.io/etcd/pkg/transport" "go.uber.org/zap" "google.golang.org/grpc" + "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials" "google.golang.org/grpc/metadata" ) @@ -221,3 +224,14 @@ func CheckStream(ctx context.Context, cancel context.CancelFunc, done chan struc } <-done } + +// 
NeedRebuildConnection checks if the error is a connection error. +func NeedRebuildConnection(err error) bool { + return err == io.EOF || + strings.Contains(err.Error(), codes.Unavailable.String()) || // Unavailable indicates the service is currently unavailable. This is a most likely a transient condition. + strings.Contains(err.Error(), codes.DeadlineExceeded.String()) || // DeadlineExceeded means operation expired before completion. + strings.Contains(err.Error(), codes.Internal.String()) || // Internal errors. + strings.Contains(err.Error(), codes.Unknown.String()) || // Unknown error. + strings.Contains(err.Error(), codes.ResourceExhausted.String()) // ResourceExhausted is returned when either the client or the server has exhausted their resources. + // Besides, we don't need to rebuild the connection if the code is Canceled, which means the client cancelled the request. +} diff --git a/server/gc_service.go b/server/gc_service.go index d8a0158920d..90333654e5e 100644 --- a/server/gc_service.go +++ b/server/gc_service.go @@ -26,7 +26,6 @@ import ( "github.com/pingcap/log" "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/storage/endpoint" - "github.com/tikv/pd/pkg/tso" "github.com/tikv/pd/pkg/utils/etcdutil" "github.com/tikv/pd/pkg/utils/tsoutil" "go.etcd.io/etcd/clientv3" @@ -107,15 +106,7 @@ func (s *GrpcServer) UpdateServiceSafePointV2(ctx context.Context, request *pdpb return rsp.(*pdpb.UpdateServiceSafePointV2Response), err } - var ( - nowTSO pdpb.Timestamp - err error - ) - if s.IsAPIServiceMode() { - nowTSO, err = s.getGlobalTSOFromTSOServer(ctx) - } else { - nowTSO, err = s.tsoAllocatorManager.HandleRequest(ctx, tso.GlobalDCLocation, 1) - } + nowTSO, err := s.getGlobalTSO(ctx) if err != nil { return nil, err } diff --git a/server/grpc_service.go b/server/grpc_service.go index 4aa6dc5b1da..05ec38919cb 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -2002,15 +2002,7 @@ func (s *GrpcServer) UpdateServiceGCSafePoint(ctx context.Context, request *pdpb return nil, err } } - var ( - nowTSO pdpb.Timestamp - err error - ) - if s.IsAPIServiceMode() { - nowTSO, err = s.getGlobalTSOFromTSOServer(ctx) - } else { - nowTSO, err = s.tsoAllocatorManager.HandleRequest(ctx, tso.GlobalDCLocation, 1) - } + nowTSO, err := s.getGlobalTSO(ctx) if err != nil { return nil, err } @@ -2608,7 +2600,10 @@ func forwardReportBucketClientToServer(forwardStream pdpb.PD_ReportBucketsClient } } -func (s *GrpcServer) getGlobalTSOFromTSOServer(ctx context.Context) (pdpb.Timestamp, error) { +func (s *GrpcServer) getGlobalTSO(ctx context.Context) (pdpb.Timestamp, error) { + if !s.IsAPIServiceMode() { + return s.tsoAllocatorManager.HandleRequest(ctx, tso.GlobalDCLocation, 1) + } request := &tsopb.TsoRequest{ Header: &tsopb.RequestHeader{ ClusterId: s.clusterID, @@ -2622,9 +2617,28 @@ func (s *GrpcServer) getGlobalTSOFromTSOServer(ctx context.Context) (pdpb.Timest forwardStream tsopb.TSO_TsoClient ts *tsopb.TsoResponse err error + ok bool ) + handleStreamError := func(err error) (needRetry bool) { + if strings.Contains(err.Error(), errs.NotLeaderErr) { + s.tsoPrimaryWatcher.ForceLoad() + log.Warn("force to load tso primary address due to error", zap.Error(err), zap.String("tso-addr", forwardedHost)) + return true + } + if grpcutil.NeedRebuildConnection(err) { + s.tsoClientPool.Lock() + delete(s.tsoClientPool.clients, forwardedHost) + s.tsoClientPool.Unlock() + log.Warn("client connection removed due to error", zap.Error(err), zap.String("tso-addr", forwardedHost)) + return true + } + return false 
+ } for i := 0; i < maxRetryTimesRequestTSOServer; i++ { - forwardedHost, ok := s.GetServicePrimaryAddr(ctx, utils.TSOServiceName) + if i > 0 { + time.Sleep(retryIntervalRequestTSOServer) + } + forwardedHost, ok = s.GetServicePrimaryAddr(ctx, utils.TSOServiceName) if !ok || forwardedHost == "" { return pdpb.Timestamp{}, ErrNotFoundTSOAddr } @@ -2632,32 +2646,25 @@ func (s *GrpcServer) getGlobalTSOFromTSOServer(ctx context.Context) (pdpb.Timest if err != nil { return pdpb.Timestamp{}, err } - err := forwardStream.Send(request) + err = forwardStream.Send(request) if err != nil { - s.tsoClientPool.Lock() - delete(s.tsoClientPool.clients, forwardedHost) - s.tsoClientPool.Unlock() - continue + if needRetry := handleStreamError(err); needRetry { + continue + } + log.Error("send request to tso primary server failed", zap.Error(err), zap.String("tso-addr", forwardedHost)) + return pdpb.Timestamp{}, err } ts, err = forwardStream.Recv() if err != nil { - if strings.Contains(err.Error(), errs.NotLeaderErr) { - s.tsoPrimaryWatcher.ForceLoad() - time.Sleep(retryIntervalRequestTSOServer) - continue - } - if strings.Contains(err.Error(), codes.Unavailable.String()) { - s.tsoClientPool.Lock() - delete(s.tsoClientPool.clients, forwardedHost) - s.tsoClientPool.Unlock() + if needRetry := handleStreamError(err); needRetry { continue } - log.Error("get global tso from tso service primary addr failed", zap.Error(err), zap.String("tso-addr", forwardedHost)) + log.Error("receive response from tso primary server failed", zap.Error(err), zap.String("tso-addr", forwardedHost)) return pdpb.Timestamp{}, err } return *ts.GetTimestamp(), nil } - log.Error("get global tso from tso service primary addr failed after retry", zap.Error(err), zap.String("tso-addr", forwardedHost)) + log.Error("get global tso from tso primary server failed after retry", zap.Error(err), zap.String("tso-addr", forwardedHost)) return pdpb.Timestamp{}, err } @@ -2906,15 +2913,7 @@ func (s *GrpcServer) SetExternalTimestamp(ctx context.Context, request *pdpb.Set return rsp.(*pdpb.SetExternalTimestampResponse), nil } - var ( - nowTSO pdpb.Timestamp - err error - ) - if s.IsAPIServiceMode() { - nowTSO, err = s.getGlobalTSOFromTSOServer(ctx) - } else { - nowTSO, err = s.tsoAllocatorManager.HandleRequest(ctx, tso.GlobalDCLocation, 1) - } + nowTSO, err := s.getGlobalTSO(ctx) if err != nil { return nil, err } From 2c07c241114fe9afabd9927ecbee61c4252f2d8e Mon Sep 17 00:00:00 2001 From: Sparkle <1284531+baurine@users.noreply.github.com> Date: Wed, 8 Nov 2023 18:18:42 +0800 Subject: [PATCH 15/20] chore(dashboard): update tidb dashboard verstion to v2023.11.08.1 (#7339) close tikv/pd#7340 Signed-off-by: baurine <2008.hbl@gmail.com> --- go.mod | 2 +- go.sum | 4 ++-- tests/integrations/client/go.mod | 2 +- tests/integrations/client/go.sum | 4 ++-- tests/integrations/mcs/go.mod | 2 +- tests/integrations/mcs/go.sum | 4 ++-- tests/integrations/tso/go.mod | 2 +- tests/integrations/tso/go.sum | 4 ++-- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index e8da2542be2..0306d70f7a3 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ require ( github.com/pingcap/kvproto v0.0.0-20231018065736-c0689aded40c github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 - github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 + github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537 github.com/prometheus/client_golang v1.11.1 github.com/prometheus/common 
v0.26.0 github.com/sasha-s/go-deadlock v0.2.0 diff --git a/go.sum b/go.sum index 28e210ef1cd..fb178321864 100644 --- a/go.sum +++ b/go.sum @@ -446,8 +446,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 h1:xIeaDUq2ItkYMIgpWXAYKC/N3hs8aurfFvvz79lhHYE= -github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= +github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537 h1:wnHt7ETIB0vm+gbLx8QhcIEmRtrT4QlWlfpcI9vjxOk= +github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= diff --git a/tests/integrations/client/go.mod b/tests/integrations/client/go.mod index b9b868cf8e3..a4aca195f3f 100644 --- a/tests/integrations/client/go.mod +++ b/tests/integrations/client/go.mod @@ -119,7 +119,7 @@ require ( github.com/pingcap/errcode v0.3.0 // indirect github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 // indirect + github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/tests/integrations/client/go.sum b/tests/integrations/client/go.sum index 81fa6fd7b39..ef9c4d2a5f3 100644 --- a/tests/integrations/client/go.sum +++ b/tests/integrations/client/go.sum @@ -410,8 +410,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 h1:xIeaDUq2ItkYMIgpWXAYKC/N3hs8aurfFvvz79lhHYE= -github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= +github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537 h1:wnHt7ETIB0vm+gbLx8QhcIEmRtrT4QlWlfpcI9vjxOk= +github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff 
v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= diff --git a/tests/integrations/mcs/go.mod b/tests/integrations/mcs/go.mod index c2dfdbe96ef..f6df0eb4de0 100644 --- a/tests/integrations/mcs/go.mod +++ b/tests/integrations/mcs/go.mod @@ -119,7 +119,7 @@ require ( github.com/pingcap/errcode v0.3.0 // indirect github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 // indirect + github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/tests/integrations/mcs/go.sum b/tests/integrations/mcs/go.sum index d1b0962ab55..fc1dc1bbea5 100644 --- a/tests/integrations/mcs/go.sum +++ b/tests/integrations/mcs/go.sum @@ -414,8 +414,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 h1:xIeaDUq2ItkYMIgpWXAYKC/N3hs8aurfFvvz79lhHYE= -github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= +github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537 h1:wnHt7ETIB0vm+gbLx8QhcIEmRtrT4QlWlfpcI9vjxOk= +github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= diff --git a/tests/integrations/tso/go.mod b/tests/integrations/tso/go.mod index e5131f15d91..7e833943e6e 100644 --- a/tests/integrations/tso/go.mod +++ b/tests/integrations/tso/go.mod @@ -117,7 +117,7 @@ require ( github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 // indirect + github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/tests/integrations/tso/go.sum b/tests/integrations/tso/go.sum index 576c3e75765..65a7f3e3558 100644 --- a/tests/integrations/tso/go.sum +++ b/tests/integrations/tso/go.sum @@ -408,8 +408,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 
h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9 h1:xIeaDUq2ItkYMIgpWXAYKC/N3hs8aurfFvvz79lhHYE= -github.com/pingcap/tidb-dashboard v0.0.0-20231102083420-865955cd15d9/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= +github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537 h1:wnHt7ETIB0vm+gbLx8QhcIEmRtrT4QlWlfpcI9vjxOk= +github.com/pingcap/tidb-dashboard v0.0.0-20231108071238-7cb8b7ff0537/go.mod h1:EZ90+V5S4TttbYag6oKZ3jcNKRwZe1Mc9vXwOt9JBYw= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= From 0c352271d7413bdf6ac948e11f1a3fb905fe2ccb Mon Sep 17 00:00:00 2001 From: disksing Date: Fri, 10 Nov 2023 12:03:42 +0800 Subject: [PATCH 16/20] dr-autosync: add recover timeout (#6295) ref tikv/pd#4399 Signed-off-by: husharp --- pkg/replication/replication_mode.go | 15 +++++- pkg/replication/replication_mode_test.go | 60 ++++++++++++++++-------- server/config/config.go | 15 +++--- 3 files changed, 62 insertions(+), 28 deletions(-) diff --git a/pkg/replication/replication_mode.go b/pkg/replication/replication_mode.go index 30b34e4596a..9093f911901 100644 --- a/pkg/replication/replication_mode.go +++ b/pkg/replication/replication_mode.go @@ -212,6 +212,7 @@ const ( type drAutoSyncStatus struct { State string `json:"state,omitempty"` StateID uint64 `json:"state_id,omitempty"` + AsyncStartTime *time.Time `json:"async_start,omitempty"` RecoverStartTime *time.Time `json:"recover_start,omitempty"` TotalRegions int `json:"total_regions,omitempty"` SyncedRegions int `json:"synced_regions,omitempty"` @@ -262,7 +263,8 @@ func (m *ModeManager) drSwitchToAsyncWithLock(availableStores []uint64) error { log.Warn("failed to switch to async state", zap.String("replicate-mode", modeDRAutoSync), errs.ZapError(err)) return err } - dr := drAutoSyncStatus{State: drStateAsync, StateID: id, AvailableStores: availableStores} + now := time.Now() + dr := drAutoSyncStatus{State: drStateAsync, StateID: id, AvailableStores: availableStores, AsyncStartTime: &now} if err := m.storage.SaveReplicationStatus(modeDRAutoSync, dr); err != nil { log.Warn("failed to switch to async state", zap.String("replicate-mode", modeDRAutoSync), errs.ZapError(err)) return err @@ -272,6 +274,15 @@ func (m *ModeManager) drSwitchToAsyncWithLock(availableStores []uint64) error { return nil } +func (m *ModeManager) drDurationSinceAsyncStart() time.Duration { + m.RLock() + defer m.RUnlock() + if m.drAutoSync.AsyncStartTime == nil { + return 0 + } + return time.Since(*m.drAutoSync.AsyncStartTime) +} + func (m *ModeManager) drSwitchToSyncRecover() error { m.Lock() defer m.Unlock() @@ -477,7 +488,7 @@ func (m *ModeManager) tickUpdateState() { m.drSwitchToAsync(storeIDs[primaryUp]) } case drStateAsync: - if canSync { + if canSync && m.drDurationSinceAsyncStart() > m.config.DRAutoSync.WaitRecoverTimeout.Duration { m.drSwitchToSyncRecover() break } diff --git a/pkg/replication/replication_mode_test.go b/pkg/replication/replication_mode_test.go index e01fb7a0b9a..5cf9f1a1450 100644 --- a/pkg/replication/replication_mode_test.go +++ b/pkg/replication/replication_mode_test.go @@ -16,6 
+16,7 @@ package replication import ( "context" + "encoding/json" "errors" "fmt" "testing" @@ -159,6 +160,20 @@ func newMockReplicator(ids []uint64) *mockFileReplicator { } } +func assertLastData(t *testing.T, data string, state string, stateID uint64, availableStores []uint64) { + type status struct { + State string `json:"state"` + StateID uint64 `json:"state_id"` + AvailableStores []uint64 `json:"available_stores"` + } + var s status + err := json.Unmarshal([]byte(data), &s) + require.NoError(t, err) + require.Equal(t, state, s.State) + require.Equal(t, stateID, s.StateID) + require.Equal(t, availableStores, s.AvailableStores) +} + func TestStateSwitch(t *testing.T) { re := require.New(t) ctx, cancel := context.WithCancel(context.Background()) @@ -190,7 +205,7 @@ func TestStateSwitch(t *testing.T) { stateID := rep.drAutoSync.StateID re.NotEqual(uint64(0), stateID) rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "sync", stateID, nil) assertStateIDUpdate := func() { re.NotEqual(stateID, rep.drAutoSync.StateID) stateID = rep.drAutoSync.StateID @@ -207,7 +222,7 @@ func TestStateSwitch(t *testing.T) { re.Equal(drStateAsyncWait, rep.drGetState()) assertStateIDUpdate() rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4]}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async_wait", stateID, []uint64{1, 2, 3, 4}) re.False(rep.GetReplicationStatus().GetDrAutoSync().GetPauseRegionSplit()) conf.DRAutoSync.PauseRegionSplit = true @@ -218,7 +233,7 @@ func TestStateSwitch(t *testing.T) { rep.tickUpdateState() assertStateIDUpdate() rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,2,3,4]}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async", stateID, []uint64{1, 2, 3, 4}) // add new store in dr zone. 
cluster.AddLabelsStore(5, 1, map[string]string{"zone": "zone2"}) @@ -268,18 +283,19 @@ func TestStateSwitch(t *testing.T) { rep.tickUpdateState() re.Equal(drStateAsyncWait, rep.drGetState()) assertStateIDUpdate() + rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4]}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async_wait", stateID, []uint64{1, 2, 3, 4}) setStoreState(cluster, "down", "up", "up", "up", "down", "down") rep.tickUpdateState() assertStateIDUpdate() rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[2,3,4]}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async_wait", stateID, []uint64{2, 3, 4}) setStoreState(cluster, "up", "down", "up", "up", "down", "down") rep.tickUpdateState() assertStateIDUpdate() rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,3,4]}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async_wait", stateID, []uint64{1, 3, 4}) // async_wait -> async rep.tickUpdateState() @@ -291,26 +307,32 @@ func TestStateSwitch(t *testing.T) { rep.tickUpdateState() assertStateIDUpdate() rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,3,4]}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async", stateID, []uint64{1, 3, 4}) // async -> async setStoreState(cluster, "up", "up", "up", "up", "down", "down") rep.tickUpdateState() // store 2 won't be available before it syncs status. rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,3,4]}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async", stateID, []uint64{1, 3, 4}) syncStoreStatus(1, 2, 3, 4) rep.tickUpdateState() assertStateIDUpdate() rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,2,3,4]}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async", stateID, []uint64{1, 2, 3, 4}) // async -> sync_recover setStoreState(cluster, "up", "up", "up", "up", "up", "up") rep.tickUpdateState() re.Equal(drStateSyncRecover, rep.drGetState()) assertStateIDUpdate() + rep.drSwitchToAsync([]uint64{1, 2, 3, 4, 5}) + rep.config.DRAutoSync.WaitRecoverTimeout = typeutil.NewDuration(time.Hour) + rep.tickUpdateState() + re.Equal(drStateAsync, rep.drGetState()) // wait recover timeout + + rep.config.DRAutoSync.WaitRecoverTimeout = typeutil.NewDuration(0) setStoreState(cluster, "down", "up", "up", "up", "up", "up") rep.tickUpdateState() re.Equal(drStateSyncRecover, rep.drGetState()) @@ -387,27 +409,27 @@ func TestReplicateState(t *testing.T) { stateID := rep.drAutoSync.StateID // replicate after initialized rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "sync", stateID, nil) // repliate state to new member replicator.memberIDs = append(replicator.memberIDs, 2, 3) rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[2]) - re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[3]) + assertLastData(t, replicator.lastData[2], "sync", stateID, nil) + assertLastData(t, replicator.lastData[3], "sync", stateID, nil) // inject error 
replicator.errors[2] = errors.New("failed to persist") rep.tickUpdateState() // switch async_wait since there is only one zone newStateID := rep.drAutoSync.StateID rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2]}`, newStateID), replicator.lastData[1]) - re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[2]) - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2]}`, newStateID), replicator.lastData[3]) + assertLastData(t, replicator.lastData[1], "async_wait", newStateID, []uint64{1, 2}) + assertLastData(t, replicator.lastData[2], "sync", stateID, nil) + assertLastData(t, replicator.lastData[3], "async_wait", newStateID, []uint64{1, 2}) // clear error, replicate to node 2 next time delete(replicator.errors, 2) rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2]}`, newStateID), replicator.lastData[2]) + assertLastData(t, replicator.lastData[2], "async_wait", newStateID, []uint64{1, 2}) } func TestAsynctimeout(t *testing.T) { @@ -637,7 +659,7 @@ func TestComplexPlacementRules(t *testing.T) { rep.tickUpdateState() re.Equal(drStateAsyncWait, rep.drGetState()) rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4,5,6]}`, rep.drAutoSync.StateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async_wait", rep.drAutoSync.StateID, []uint64{1, 2, 3, 4, 5, 6}) // reset to sync setStoreState(cluster, "up", "up", "up", "up", "up", "up", "up", "up", "up", "up") @@ -698,7 +720,7 @@ func TestComplexPlacementRules2(t *testing.T) { rep.tickUpdateState() re.Equal(drStateAsyncWait, rep.drGetState()) rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4]}`, rep.drAutoSync.StateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async_wait", rep.drAutoSync.StateID, []uint64{1, 2, 3, 4}) } func TestComplexPlacementRules3(t *testing.T) { @@ -737,7 +759,7 @@ func TestComplexPlacementRules3(t *testing.T) { rep.tickUpdateState() re.Equal(drStateAsyncWait, rep.drGetState()) rep.tickReplicateStatus() - re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4]}`, rep.drAutoSync.StateID), replicator.lastData[1]) + assertLastData(t, replicator.lastData[1], "async_wait", rep.drAutoSync.StateID, []uint64{1, 2, 3, 4}) } func genRegions(cluster *mockcluster.Cluster, stateID uint64, n int) []*core.RegionInfo { diff --git a/server/config/config.go b/server/config/config.go index 0485e077c67..da6b0e29e07 100644 --- a/server/config/config.go +++ b/server/config/config.go @@ -831,13 +831,14 @@ func NormalizeReplicationMode(m string) string { // DRAutoSyncReplicationConfig is the configuration for auto sync mode between 2 data centers. 
type DRAutoSyncReplicationConfig struct { - LabelKey string `toml:"label-key" json:"label-key"` - Primary string `toml:"primary" json:"primary"` - DR string `toml:"dr" json:"dr"` - PrimaryReplicas int `toml:"primary-replicas" json:"primary-replicas"` - DRReplicas int `toml:"dr-replicas" json:"dr-replicas"` - WaitStoreTimeout typeutil.Duration `toml:"wait-store-timeout" json:"wait-store-timeout"` - PauseRegionSplit bool `toml:"pause-region-split" json:"pause-region-split,string"` + LabelKey string `toml:"label-key" json:"label-key"` + Primary string `toml:"primary" json:"primary"` + DR string `toml:"dr" json:"dr"` + PrimaryReplicas int `toml:"primary-replicas" json:"primary-replicas"` + DRReplicas int `toml:"dr-replicas" json:"dr-replicas"` + WaitStoreTimeout typeutil.Duration `toml:"wait-store-timeout" json:"wait-store-timeout"` + WaitRecoverTimeout typeutil.Duration `toml:"wait-recover-timeout" json:"wait-recover-timeout"` + PauseRegionSplit bool `toml:"pause-region-split" json:"pause-region-split,string"` } func (c *DRAutoSyncReplicationConfig) adjust(meta *configutil.ConfigMetaData) { From f1cee6c3971e18c6ab201e50555261a8c51c3041 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Fri, 10 Nov 2023 15:48:43 +0800 Subject: [PATCH 17/20] mcs/resourcemanager: delete expire tokenSlot (#7344) close tikv/pd#7346 Signed-off-by: guo-shaoge --- .../resourcemanager/server/token_buckets.go | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pkg/mcs/resourcemanager/server/token_buckets.go b/pkg/mcs/resourcemanager/server/token_buckets.go index a0acba3b54d..05a93c32673 100644 --- a/pkg/mcs/resourcemanager/server/token_buckets.go +++ b/pkg/mcs/resourcemanager/server/token_buckets.go @@ -20,6 +20,8 @@ import ( "github.com/gogo/protobuf/proto" rmpb "github.com/pingcap/kvproto/pkg/resource_manager" + "github.com/pingcap/log" + "go.uber.org/zap" ) const ( @@ -31,6 +33,7 @@ const ( defaultReserveRatio = 0.5 defaultLoanCoefficient = 2 maxAssignTokens = math.MaxFloat64 / 1024 // assume max client connect is 1024 + slotExpireTimeout = 10 * time.Minute ) // GroupTokenBucket is a token bucket for a resource group. @@ -62,6 +65,7 @@ type TokenSlot struct { // tokenCapacity is the number of tokens in the slot. tokenCapacity float64 lastTokenCapacity float64 + lastReqTime time.Time } // GroupTokenBucketState is the running state of TokenBucket. @@ -75,7 +79,8 @@ type GroupTokenBucketState struct { LastUpdate *time.Time `json:"last_update,omitempty"` Initialized bool `json:"initialized"` // settingChanged is used to avoid that the number of tokens returned is jitter because of changing fill rate. - settingChanged bool + settingChanged bool + lastCheckExpireSlot time.Time } // Clone returns the copy of GroupTokenBucketState @@ -95,6 +100,7 @@ func (gts *GroupTokenBucketState) Clone() *GroupTokenBucketState { Initialized: gts.Initialized, tokenSlots: tokenSlots, clientConsumptionTokensSum: gts.clientConsumptionTokensSum, + lastCheckExpireSlot: gts.lastCheckExpireSlot, } } @@ -119,16 +125,18 @@ func (gts *GroupTokenBucketState) balanceSlotTokens( clientUniqueID uint64, settings *rmpb.TokenLimitSettings, requiredToken, elapseTokens float64) { + now := time.Now() slot, exist := gts.tokenSlots[clientUniqueID] if !exist { // Only slots that require a positive number will be considered alive, // but still need to allocate the elapsed tokens as well. 
if requiredToken != 0 { - slot = &TokenSlot{} + slot = &TokenSlot{lastReqTime: now} gts.tokenSlots[clientUniqueID] = slot gts.clientConsumptionTokensSum = 0 } } else { + slot.lastReqTime = now if gts.clientConsumptionTokensSum >= maxAssignTokens { gts.clientConsumptionTokensSum = 0 } @@ -139,6 +147,16 @@ func (gts *GroupTokenBucketState) balanceSlotTokens( } } + if time.Since(gts.lastCheckExpireSlot) >= slotExpireTimeout { + gts.lastCheckExpireSlot = now + for clientUniqueID, slot := range gts.tokenSlots { + if time.Since(slot.lastReqTime) >= slotExpireTimeout { + delete(gts.tokenSlots, clientUniqueID) + log.Info("delete resource group slot because expire", zap.Time("last-req-time", slot.lastReqTime), + zap.Any("expire timeout", slotExpireTimeout), zap.Any("del client id", clientUniqueID), zap.Any("len", len(gts.tokenSlots))) + } + } + } if len(gts.tokenSlots) == 0 { return } @@ -264,6 +282,7 @@ func (gtb *GroupTokenBucket) init(now time.Time, clientID uint64) { lastTokenCapacity: gtb.Tokens, } gtb.LastUpdate = &now + gtb.lastCheckExpireSlot = now gtb.Initialized = true } From b5119ea4bf2c3bc1d94256810c7e3e3670e96f45 Mon Sep 17 00:00:00 2001 From: lucasliang Date: Fri, 10 Nov 2023 16:32:12 +0800 Subject: [PATCH 18/20] scheduler: refine the interval of scheduling tick in evict-slow-trend-scheduler. (#7326) ref tikv/pd#7156 Implement the `GetNextInterval` for `evict-slow-trend-scheduler`, to refine the ticking interval. Default `GetNextInterval` is not appropriate for `evict-slow-trend-scheduler`, as it might delay the checking of other nodes' slowness status. This pr adjusts the ticking interval of the evict-slow-trend-scheduler to optimize its behavior. If a slow node is already identified as a candidate, the next interval is now set to be shorter, ensuring quicker subsequent scheduling. This refinement aims to decrease response time. Signed-off-by: lucasliang Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/schedule/schedulers/evict_slow_trend.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pkg/schedule/schedulers/evict_slow_trend.go b/pkg/schedule/schedulers/evict_slow_trend.go index 3983e9c345d..f31ba420c97 100644 --- a/pkg/schedule/schedulers/evict_slow_trend.go +++ b/pkg/schedule/schedulers/evict_slow_trend.go @@ -108,8 +108,12 @@ func (conf *evictSlowTrendSchedulerConfig) getKeyRangesByID(id uint64) []core.Ke return []core.KeyRange{core.NewKeyRange("", "")} } +func (conf *evictSlowTrendSchedulerConfig) hasEvictedStores() bool { + return len(conf.EvictedStores) > 0 +} + func (conf *evictSlowTrendSchedulerConfig) evictedStore() uint64 { - if len(conf.EvictedStores) == 0 { + if !conf.hasEvictedStores() { return 0 } // If a candidate passes all checks and proved to be slow, it will be @@ -237,6 +241,19 @@ type evictSlowTrendScheduler struct { handler http.Handler } +func (s *evictSlowTrendScheduler) GetNextInterval(interval time.Duration) time.Duration { + var growthType intervalGrowthType + // If it already found a slow node as candidate, the next interval should be shorter + // to make the next scheduling as soon as possible. This adjustment will decrease the + // response time, as heartbeats from other nodes will be received and updated more quickly. 
+ if s.conf.hasEvictedStores() { + growthType = zeroGrowth + } else { + growthType = exponentialGrowth + } + return intervalGrow(s.GetMinInterval(), MaxScheduleInterval, growthType) +} + func (s *evictSlowTrendScheduler) ServeHTTP(w http.ResponseWriter, r *http.Request) { s.handler.ServeHTTP(w, r) } From fe8a393e5cc898ab65c8d683b2f7aaa33252dfc1 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Fri, 10 Nov 2023 16:45:42 +0800 Subject: [PATCH 19/20] mcs: tso service should not forward again (#7348) ref tikv/pd#5836 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/tso/server/grpc_service.go | 34 ------------------------------ pkg/mcs/tso/server/server.go | 3 --- 2 files changed, 37 deletions(-) diff --git a/pkg/mcs/tso/server/grpc_service.go b/pkg/mcs/tso/server/grpc_service.go index 40a308c72f8..9006faf49da 100644 --- a/pkg/mcs/tso/server/grpc_service.go +++ b/pkg/mcs/tso/server/grpc_service.go @@ -28,8 +28,6 @@ import ( bs "github.com/tikv/pd/pkg/basicserver" "github.com/tikv/pd/pkg/mcs/registry" "github.com/tikv/pd/pkg/utils/apiutil" - "github.com/tikv/pd/pkg/utils/grpcutil" - "github.com/tikv/pd/pkg/utils/tsoutil" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -88,21 +86,9 @@ func (s *Service) RegisterRESTHandler(userDefineHandlers map[string]http.Handler // Tso returns a stream of timestamps func (s *Service) Tso(stream tsopb.TSO_TsoServer) error { - var ( - doneCh chan struct{} - errCh chan error - ) ctx, cancel := context.WithCancel(stream.Context()) defer cancel() for { - // Prevent unnecessary performance overhead of the channel. - if errCh != nil { - select { - case err := <-errCh: - return errors.WithStack(err) - default: - } - } request, err := stream.Recv() if err == io.EOF { return nil @@ -111,26 +97,6 @@ func (s *Service) Tso(stream tsopb.TSO_TsoServer) error { return errors.WithStack(err) } - streamCtx := stream.Context() - forwardedHost := grpcutil.GetForwardedHost(streamCtx) - if !s.IsLocalRequest(forwardedHost) { - clientConn, err := s.GetDelegateClient(s.Context(), s.GetTLSConfig(), forwardedHost) - if err != nil { - return errors.WithStack(err) - } - - if errCh == nil { - doneCh = make(chan struct{}) - defer close(doneCh) - errCh = make(chan error) - } - - tsoProtoFactory := s.tsoProtoFactory - tsoRequest := tsoutil.NewTSOProtoRequest(forwardedHost, clientConn, request, stream) - s.tsoDispatcher.DispatchRequest(ctx, tsoRequest, tsoProtoFactory, doneCh, errCh) - continue - } - start := time.Now() // TSO uses leader lease to determine validity. No need to check leader here. if s.IsClosed() { diff --git a/pkg/mcs/tso/server/server.go b/pkg/mcs/tso/server/server.go index 16ef3216c62..1a2430477d8 100644 --- a/pkg/mcs/tso/server/server.go +++ b/pkg/mcs/tso/server/server.go @@ -78,9 +78,6 @@ type Server struct { service *Service keyspaceGroupManager *tso.KeyspaceGroupManager - // tsoDispatcher is used to dispatch the TSO requests to - // the corresponding forwarding TSO channels. 
- tsoDispatcher *tsoutil.TSODispatcher // tsoProtoFactory is the abstract factory for creating tso // related data structures defined in the tso grpc protocol tsoProtoFactory *tsoutil.TSOProtoFactory From afe6afccf9ddbf35c4210d40e00c6d69a030d3b3 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Fri, 10 Nov 2023 17:09:42 +0800 Subject: [PATCH 20/20] mcs: support rules http interface in scheduling server (#7199) ref tikv/pd#5839 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- errors.toml | 20 + pkg/errs/errno.go | 13 +- pkg/mcs/scheduling/server/apis/v1/api.go | 296 ++++++++++++- pkg/schedule/handler/handler.go | 43 ++ pkg/utils/apiutil/serverapi/middleware.go | 5 +- server/api/region_test.go | 12 +- server/api/rule.go | 296 ++++++++----- server/api/server.go | 25 ++ tests/integrations/mcs/scheduling/api_test.go | 102 ++++- tests/pdctl/config/config_test.go | 10 +- {server => tests/server}/api/rule_test.go | 390 ++++++++++++------ 11 files changed, 932 insertions(+), 280 deletions(-) rename {server => tests/server}/api/rule_test.go (67%) diff --git a/errors.toml b/errors.toml index 1d10d40d294..b6123058310 100644 --- a/errors.toml +++ b/errors.toml @@ -551,6 +551,11 @@ error = ''' build rule list failed, %s ''' +["PD:placement:ErrKeyFormat"] +error = ''' +key should be in hex format, %s +''' + ["PD:placement:ErrLoadRule"] error = ''' load rule failed @@ -561,11 +566,21 @@ error = ''' load rule group failed ''' +["PD:placement:ErrPlacementDisabled"] +error = ''' +placement rules feature is disabled +''' + ["PD:placement:ErrRuleContent"] error = ''' invalid rule content, %s ''' +["PD:placement:ErrRuleNotFound"] +error = ''' +rule not found +''' + ["PD:plugin:ErrLoadPlugin"] error = ''' failed to load plugin @@ -616,6 +631,11 @@ error = ''' region %v has abnormal peer ''' +["PD:region:ErrRegionInvalidID"] +error = ''' +invalid region id +''' + ["PD:region:ErrRegionNotAdjacent"] error = ''' two regions are not adjacent diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index e5bac8519be..b8a882cd187 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -102,6 +102,8 @@ var ( // region errors var ( + // ErrRegionInvalidID is error info for region id invalid. + ErrRegionInvalidID = errors.Normalize("invalid region id", errors.RFCCodeText("PD:region:ErrRegionInvalidID")) // ErrRegionNotAdjacent is error info for region not adjacent. ErrRegionNotAdjacent = errors.Normalize("two regions are not adjacent", errors.RFCCodeText("PD:region:ErrRegionNotAdjacent")) // ErrRegionNotFound is error info for region not found. 
@@ -153,10 +155,13 @@ var ( // placement errors var ( - ErrRuleContent = errors.Normalize("invalid rule content, %s", errors.RFCCodeText("PD:placement:ErrRuleContent")) - ErrLoadRule = errors.Normalize("load rule failed", errors.RFCCodeText("PD:placement:ErrLoadRule")) - ErrLoadRuleGroup = errors.Normalize("load rule group failed", errors.RFCCodeText("PD:placement:ErrLoadRuleGroup")) - ErrBuildRuleList = errors.Normalize("build rule list failed, %s", errors.RFCCodeText("PD:placement:ErrBuildRuleList")) + ErrRuleContent = errors.Normalize("invalid rule content, %s", errors.RFCCodeText("PD:placement:ErrRuleContent")) + ErrLoadRule = errors.Normalize("load rule failed", errors.RFCCodeText("PD:placement:ErrLoadRule")) + ErrLoadRuleGroup = errors.Normalize("load rule group failed", errors.RFCCodeText("PD:placement:ErrLoadRuleGroup")) + ErrBuildRuleList = errors.Normalize("build rule list failed, %s", errors.RFCCodeText("PD:placement:ErrBuildRuleList")) + ErrPlacementDisabled = errors.Normalize("placement rules feature is disabled", errors.RFCCodeText("PD:placement:ErrPlacementDisabled")) + ErrKeyFormat = errors.Normalize("key should be in hex format, %s", errors.RFCCodeText("PD:placement:ErrKeyFormat")) + ErrRuleNotFound = errors.Normalize("rule not found", errors.RFCCodeText("PD:placement:ErrRuleNotFound")) ) // region label errors diff --git a/pkg/mcs/scheduling/server/apis/v1/api.go b/pkg/mcs/scheduling/server/apis/v1/api.go index 47fdb95543f..172515d8620 100644 --- a/pkg/mcs/scheduling/server/apis/v1/api.go +++ b/pkg/mcs/scheduling/server/apis/v1/api.go @@ -15,6 +15,7 @@ package apis import ( + "encoding/hex" "net/http" "strconv" "sync" @@ -127,12 +128,6 @@ func (s *Service) RegisterAdminRouter() { router.DELETE("cache/regions/:id", deleteRegionCacheByID) } -// RegisterConfigRouter registers the router of the config handler. -func (s *Service) RegisterConfigRouter() { - router := s.root.Group("config") - router.GET("", getConfig) -} - // RegisterSchedulersRouter registers the router of the schedulers handler. func (s *Service) RegisterSchedulersRouter() { router := s.root.Group("schedulers") @@ -172,6 +167,32 @@ func (s *Service) RegisterOperatorsRouter() { router.GET("/records", getOperatorRecords) } +// RegisterConfigRouter registers the router of the config handler. +func (s *Service) RegisterConfigRouter() { + router := s.root.Group("config") + router.GET("", getConfig) + + rules := router.Group("rules") + rules.GET("", getAllRules) + rules.GET("/group/:group", getRuleByGroup) + rules.GET("/region/:region", getRulesByRegion) + rules.GET("/region/:region/detail", checkRegionPlacementRule) + rules.GET("/key/:key", getRulesByKey) + + // We cannot merge `/rule` and `/rules`, because we allow `group_id` to be "group", + // which is the same as the prefix of `/rules/group/:group`. + rule := router.Group("rule") + rule.GET("/:group/:id", getRuleByGroupAndID) + + groups := router.Group("rule_groups") + groups.GET("", getAllGroupConfigs) + groups.GET("/:id", getRuleGroupConfig) + + placementRule := router.Group("placement-rule") + placementRule.GET("", getPlacementRules) + placementRule.GET("/:group", getPlacementRuleByGroup) +} + // @Tags admin // @Summary Change the log level. // @Produce json @@ -671,3 +692,266 @@ func getHistoryHotRegions(c *gin.Context) { var res storage.HistoryHotRegions c.IndentedJSON(http.StatusOK, res) } + +// @Tags rule +// @Summary List all rules of cluster. 
+// @Produce json +// @Success 200 {array} placement.Rule +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /config/rules [get] +func getAllRules(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + rules := manager.GetAllRules() + c.IndentedJSON(http.StatusOK, rules) +} + +// @Tags rule +// @Summary List all rules of cluster by group. +// @Param group path string true "The name of group" +// @Produce json +// @Success 200 {array} placement.Rule +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /config/rules/group/{group} [get] +func getRuleByGroup(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + group := c.Param("group") + rules := manager.GetRulesByGroup(group) + c.IndentedJSON(http.StatusOK, rules) +} + +// @Tags rule +// @Summary List all rules of cluster by region. +// @Param id path integer true "Region Id" +// @Produce json +// @Success 200 {array} placement.Rule +// @Failure 400 {string} string "The input is invalid." +// @Failure 404 {string} string "The region does not exist." +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /config/rules/region/{region} [get] +func getRulesByRegion(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + regionStr := c.Param("region") + region, code, err := handler.PreCheckForRegion(regionStr) + if err != nil { + c.String(code, err.Error()) + return + } + rules := manager.GetRulesForApplyRegion(region) + c.IndentedJSON(http.StatusOK, rules) +} + +// @Tags rule +// @Summary List rules and matched peers related to the given region. +// @Param id path integer true "Region Id" +// @Produce json +// @Success 200 {object} placement.RegionFit +// @Failure 400 {string} string "The input is invalid." +// @Failure 404 {string} string "The region does not exist." +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." 
+// @Router /config/rules/region/{region}/detail [get] +func checkRegionPlacementRule(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + regionStr := c.Param("region") + region, code, err := handler.PreCheckForRegion(regionStr) + if err != nil { + c.String(code, err.Error()) + return + } + regionFit, err := handler.CheckRegionPlacementRule(region) + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + c.IndentedJSON(http.StatusOK, regionFit) +} + +// @Tags rule +// @Summary List all rules of cluster by key. +// @Param key path string true "The name of key" +// @Produce json +// @Success 200 {array} placement.Rule +// @Failure 400 {string} string "The input is invalid." +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /config/rules/key/{key} [get] +func getRulesByKey(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + keyHex := c.Param("key") + key, err := hex.DecodeString(keyHex) + if err != nil { + c.String(http.StatusBadRequest, errs.ErrKeyFormat.Error()) + return + } + rules := manager.GetRulesByKey(key) + c.IndentedJSON(http.StatusOK, rules) +} + +// @Tags rule +// @Summary Get rule of cluster by group and id. +// @Param group path string true "The name of group" +// @Param id path string true "Rule Id" +// @Produce json +// @Success 200 {object} placement.Rule +// @Failure 404 {string} string "The rule does not exist." +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Router /config/rule/{group}/{id} [get] +func getRuleByGroupAndID(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + group, id := c.Param("group"), c.Param("id") + rule := manager.GetRule(group, id) + if rule == nil { + c.String(http.StatusNotFound, errs.ErrRuleNotFound.Error()) + return + } + c.IndentedJSON(http.StatusOK, rule) +} + +// @Tags rule +// @Summary List all rule group configs. +// @Produce json +// @Success 200 {array} placement.RuleGroup +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /config/rule_groups [get] +func getAllGroupConfigs(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + ruleGroups := manager.GetRuleGroups() + c.IndentedJSON(http.StatusOK, ruleGroups) +} + +// @Tags rule +// @Summary Get rule group config by group id. +// @Param id path string true "Group Id" +// @Produce json +// @Success 200 {object} placement.RuleGroup +// @Failure 404 {string} string "The RuleGroup does not exist." 
+// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /config/rule_groups/{id} [get] +func getRuleGroupConfig(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + id := c.Param("id") + group := manager.GetRuleGroup(id) + if group == nil { + c.String(http.StatusNotFound, errs.ErrRuleNotFound.Error()) + return + } + c.IndentedJSON(http.StatusOK, group) +} + +// @Tags rule +// @Summary List all rules and groups configuration. +// @Produce json +// @Success 200 {array} placement.GroupBundle +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /config/placement-rules [get] +func getPlacementRules(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + bundles := manager.GetAllGroupBundles() + c.IndentedJSON(http.StatusOK, bundles) +} + +// @Tags rule +// @Summary Get group config and all rules belong to the group. +// @Param group path string true "The name of group" +// @Produce json +// @Success 200 {object} placement.GroupBundle +// @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." +// @Router /config/placement-rules/{group} [get] +func getPlacementRuleByGroup(c *gin.Context) { + handler := c.MustGet(handlerKey).(*handler.Handler) + manager, err := handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + c.String(http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + c.String(http.StatusInternalServerError, err.Error()) + return + } + g := c.Param("group") + group := manager.GetGroupBundle(g) + c.IndentedJSON(http.StatusOK, group) +} diff --git a/pkg/schedule/handler/handler.go b/pkg/schedule/handler/handler.go index 45b0eaf502f..3f9f4f96622 100644 --- a/pkg/schedule/handler/handler.go +++ b/pkg/schedule/handler/handler.go @@ -18,6 +18,7 @@ import ( "bytes" "encoding/hex" "net/http" + "strconv" "strings" "time" @@ -1061,3 +1062,45 @@ func (h *Handler) GetHotBuckets(regionIDs ...uint64) (HotBucketsResponse, error) } return ret, nil } + +// GetRuleManager returns the rule manager. +func (h *Handler) GetRuleManager() (*placement.RuleManager, error) { + c := h.GetCluster() + if c == nil { + return nil, errs.ErrNotBootstrapped + } + if !c.GetSharedConfig().IsPlacementRulesEnabled() { + return nil, errs.ErrPlacementDisabled + } + return c.GetRuleManager(), nil +} + +// PreCheckForRegion checks if the region is valid. 
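+// It parses the given region id and returns the region together with the HTTP status
+// code the caller should respond with when the check fails.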
+func (h *Handler) PreCheckForRegion(regionStr string) (*core.RegionInfo, int, error) { + c := h.GetCluster() + if c == nil { + return nil, http.StatusInternalServerError, errs.ErrNotBootstrapped.GenWithStackByArgs() + } + regionID, err := strconv.ParseUint(regionStr, 10, 64) + if err != nil { + return nil, http.StatusBadRequest, errs.ErrRegionInvalidID.FastGenByArgs() + } + region := c.GetRegion(regionID) + if region == nil { + return nil, http.StatusNotFound, errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + return region, http.StatusOK, nil +} + +// CheckRegionPlacementRule checks if the region matches the placement rules. +func (h *Handler) CheckRegionPlacementRule(region *core.RegionInfo) (*placement.RegionFit, error) { + c := h.GetCluster() + if c == nil { + return nil, errs.ErrNotBootstrapped.GenWithStackByArgs() + } + manager, err := h.GetRuleManager() + if err != nil { + return nil, err + } + return manager.FitRegion(c, region), nil +} diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go index 19438ad0f91..2bb742ccbba 100644 --- a/pkg/utils/apiutil/serverapi/middleware.go +++ b/pkg/utils/apiutil/serverapi/middleware.go @@ -117,6 +117,7 @@ func (h *redirector) matchMicroServiceRedirectRules(r *http.Request) (bool, stri r.URL.Path = strings.TrimRight(r.URL.Path, "/") for _, rule := range h.microserviceRedirectRules { if strings.HasPrefix(r.URL.Path, rule.matchPath) && slice.Contains(rule.matchMethods, r.Method) { + origin := r.URL.Path addr, ok := h.s.GetServicePrimaryAddr(r.Context(), rule.targetServiceName) if !ok || addr == "" { log.Warn("failed to get the service primary addr when trying to match redirect rules", @@ -134,8 +135,8 @@ func (h *redirector) matchMicroServiceRedirectRules(r *http.Request) (bool, stri } else { r.URL.Path = rule.targetPath } - log.Debug("redirect to micro service", zap.String("path", r.URL.Path), zap.String("target", addr), - zap.String("method", r.Method)) + log.Debug("redirect to micro service", zap.String("path", r.URL.Path), zap.String("origin-path", origin), + zap.String("target", addr), zap.String("method", r.Method)) return true, addr } } diff --git a/server/api/region_test.go b/server/api/region_test.go index a39a1e5c5fd..379fcf7d463 100644 --- a/server/api/region_test.go +++ b/server/api/region_test.go @@ -241,14 +241,14 @@ func (suite *regionTestSuite) TestRegions() { mustRegionHeartbeat(re, suite.svr, r) } url := fmt.Sprintf("%s/regions", suite.urlPrefix) - RegionsInfo := &RegionsInfo{} - err := tu.ReadGetJSON(re, testDialClient, url, RegionsInfo) + regionsInfo := &RegionsInfo{} + err := tu.ReadGetJSON(re, testDialClient, url, regionsInfo) suite.NoError(err) - suite.Len(regions, RegionsInfo.Count) - sort.Slice(RegionsInfo.Regions, func(i, j int) bool { - return RegionsInfo.Regions[i].ID < RegionsInfo.Regions[j].ID + suite.Len(regions, regionsInfo.Count) + sort.Slice(regionsInfo.Regions, func(i, j int) bool { + return regionsInfo.Regions[i].ID < regionsInfo.Regions[j].ID }) - for i, r := range RegionsInfo.Regions { + for i, r := range regionsInfo.Regions { suite.Equal(regions[i].ID, r.ID) suite.Equal(regions[i].ApproximateSize, r.ApproximateSize) suite.Equal(regions[i].ApproximateKeys, r.ApproximateKeys) diff --git a/server/api/rule.go b/server/api/rule.go index b3a720ece41..77aad42eb42 100644 --- a/server/api/rule.go +++ b/server/api/rule.go @@ -19,30 +19,26 @@ import ( "fmt" "net/http" "net/url" - "strconv" "github.com/gorilla/mux" - "github.com/pingcap/errors" - "github.com/tikv/pd/pkg/core" 
"github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/apiutil" "github.com/tikv/pd/server" - "github.com/tikv/pd/server/cluster" "github.com/unrolled/render" ) -var errPlacementDisabled = errors.New("placement rules feature is disabled") - type ruleHandler struct { + *server.Handler svr *server.Server rd *render.Render } func newRulesHandler(svr *server.Server, rd *render.Render) *ruleHandler { return &ruleHandler{ - svr: svr, - rd: rd, + Handler: svr.GetHandler(), + svr: svr, + rd: rd, } } @@ -51,14 +47,19 @@ func newRulesHandler(svr *server.Server, rd *render.Render) *ruleHandler { // @Produce json // @Success 200 {array} placement.Rule // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rules [get] func (h *ruleHandler) GetAllRules(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } - rules := cluster.GetRuleManager().GetAllRules() + rules := manager.GetAllRules() h.rd.JSON(w, http.StatusOK, rules) } @@ -72,9 +73,13 @@ func (h *ruleHandler) GetAllRules(w http.ResponseWriter, r *http.Request) { // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rules [post] func (h *ruleHandler) SetAllRules(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } var rules []*placement.Rule @@ -87,7 +92,7 @@ func (h *ruleHandler) SetAllRules(w http.ResponseWriter, r *http.Request) { return } } - if err := cluster.GetRuleManager().SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). + if err := manager.SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). SetRules(rules); err != nil { if errs.ErrRuleContent.Equal(err) || errs.ErrHexDecodingString.Equal(err) { h.rd.JSON(w, http.StatusBadRequest, err.Error()) @@ -105,15 +110,20 @@ func (h *ruleHandler) SetAllRules(w http.ResponseWriter, r *http.Request) { // @Produce json // @Success 200 {array} placement.Rule // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." 
// @Router /config/rules/group/{group} [get] func (h *ruleHandler) GetRuleByGroup(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } group := mux.Vars(r)["group"] - rules := cluster.GetRuleManager().GetRulesByGroup(group) + rules := manager.GetRulesByGroup(group) h.rd.JSON(w, http.StatusOK, rules) } @@ -125,13 +135,25 @@ func (h *ruleHandler) GetRuleByGroup(w http.ResponseWriter, r *http.Request) { // @Failure 400 {string} string "The input is invalid." // @Failure 404 {string} string "The region does not exist." // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rules/region/{region} [get] func (h *ruleHandler) GetRulesByRegion(w http.ResponseWriter, r *http.Request) { - cluster, region := h.preCheckForRegionAndRule(w, r) - if cluster == nil || region == nil { + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } - rules := cluster.GetRuleManager().GetRulesForApplyRegion(region) + regionStr := mux.Vars(r)["region"] + region, code, err := h.PreCheckForRegion(regionStr) + if err != nil { + h.rd.JSON(w, code, err.Error()) + return + } + rules := manager.GetRulesForApplyRegion(region) h.rd.JSON(w, http.StatusOK, rules) } @@ -143,34 +165,25 @@ func (h *ruleHandler) GetRulesByRegion(w http.ResponseWriter, r *http.Request) { // @Failure 400 {string} string "The input is invalid." // @Failure 404 {string} string "The region does not exist." // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." 
// @Router /config/rules/region/{region}/detail [get] func (h *ruleHandler) CheckRegionPlacementRule(w http.ResponseWriter, r *http.Request) { - cluster, region := h.preCheckForRegionAndRule(w, r) - if cluster == nil || region == nil { + regionStr := mux.Vars(r)["region"] + region, code, err := h.PreCheckForRegion(regionStr) + if err != nil { + h.rd.JSON(w, code, err.Error()) return } - regionFit := cluster.GetRuleManager().FitRegion(cluster, region) - h.rd.JSON(w, http.StatusOK, regionFit) -} - -func (h *ruleHandler) preCheckForRegionAndRule(w http.ResponseWriter, r *http.Request) (*cluster.RaftCluster, *core.RegionInfo) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) - return cluster, nil + regionFit, err := h.Handler.CheckRegionPlacementRule(region) + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return } - regionStr := mux.Vars(r)["region"] - regionID, err := strconv.ParseUint(regionStr, 10, 64) if err != nil { - h.rd.JSON(w, http.StatusBadRequest, "invalid region id") - return cluster, nil - } - region := cluster.GetRegion(regionID) - if region == nil { - h.rd.JSON(w, http.StatusNotFound, errs.ErrRegionNotFound.FastGenByArgs(regionID).Error()) - return cluster, nil + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) + return } - return cluster, region + h.rd.JSON(w, http.StatusOK, regionFit) } // @Tags rule @@ -180,20 +193,25 @@ func (h *ruleHandler) preCheckForRegionAndRule(w http.ResponseWriter, r *http.Re // @Success 200 {array} placement.Rule // @Failure 400 {string} string "The input is invalid." // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rules/key/{key} [get] func (h *ruleHandler) GetRulesByKey(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } keyHex := mux.Vars(r)["key"] key, err := hex.DecodeString(keyHex) if err != nil { - h.rd.JSON(w, http.StatusBadRequest, "key should be in hex format") + h.rd.JSON(w, http.StatusBadRequest, errs.ErrKeyFormat.FastGenByArgs(err).Error()) return } - rules := cluster.GetRuleManager().GetRulesByKey(key) + rules := manager.GetRulesByKey(key) h.rd.JSON(w, http.StatusOK, rules) } @@ -207,15 +225,19 @@ func (h *ruleHandler) GetRulesByKey(w http.ResponseWriter, r *http.Request) { // @Failure 412 {string} string "Placement rules feature is disabled." 
// @Router /config/rule/{group}/{id} [get] func (h *ruleHandler) GetRuleByGroupAndID(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } group, id := mux.Vars(r)["group"], mux.Vars(r)["id"] - rule := cluster.GetRuleManager().GetRule(group, id) + rule := manager.GetRule(group, id) if rule == nil { - h.rd.JSON(w, http.StatusNotFound, nil) + h.rd.JSON(w, http.StatusNotFound, errs.ErrRuleNotFound.Error()) return } h.rd.JSON(w, http.StatusOK, rule) @@ -232,21 +254,25 @@ func (h *ruleHandler) GetRuleByGroupAndID(w http.ResponseWriter, r *http.Request // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rule [post] func (h *ruleHandler) SetRule(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } var rule placement.Rule if err := apiutil.ReadJSONRespondError(h.rd, w, r.Body, &rule); err != nil { return } - oldRule := cluster.GetRuleManager().GetRule(rule.GroupID, rule.ID) + oldRule := manager.GetRule(rule.GroupID, rule.ID) if err := h.syncReplicateConfigWithDefaultRule(&rule); err != nil { h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } - if err := cluster.GetRuleManager().SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). + if err := manager.SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). SetRule(&rule); err != nil { if errs.ErrRuleContent.Equal(err) || errs.ErrHexDecodingString.Equal(err) { h.rd.JSON(w, http.StatusBadRequest, err.Error()) @@ -255,6 +281,7 @@ func (h *ruleHandler) SetRule(w http.ResponseWriter, r *http.Request) { } return } + cluster := getCluster(r) cluster.AddSuspectKeyRange(rule.StartKey, rule.EndKey) if oldRule != nil { cluster.AddSuspectKeyRange(oldRule.StartKey, oldRule.EndKey) @@ -285,18 +312,23 @@ func (h *ruleHandler) syncReplicateConfigWithDefaultRule(rule *placement.Rule) e // @Failure 500 {string} string "PD server failed to proceed the request." 
// @Router /config/rule/{group}/{id} [delete] func (h *ruleHandler) DeleteRuleByGroup(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } group, id := mux.Vars(r)["group"], mux.Vars(r)["id"] - rule := cluster.GetRuleManager().GetRule(group, id) - if err := cluster.GetRuleManager().DeleteRule(group, id); err != nil { + rule := manager.GetRule(group, id) + if err := manager.DeleteRule(group, id); err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } if rule != nil { + cluster := getCluster(r) cluster.AddSuspectKeyRange(rule.StartKey, rule.EndKey) } @@ -313,16 +345,20 @@ func (h *ruleHandler) DeleteRuleByGroup(w http.ResponseWriter, r *http.Request) // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rules/batch [post] func (h *ruleHandler) BatchRules(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } var opts []placement.RuleOp if err := apiutil.ReadJSONRespondError(h.rd, w, r.Body, &opts); err != nil { return } - if err := cluster.GetRuleManager().SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). + if err := manager.SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). Batch(opts); err != nil { if errs.ErrRuleContent.Equal(err) || errs.ErrHexDecodingString.Equal(err) { h.rd.JSON(w, http.StatusBadRequest, err.Error()) @@ -341,15 +377,20 @@ func (h *ruleHandler) BatchRules(w http.ResponseWriter, r *http.Request) { // @Success 200 {object} placement.RuleGroup // @Failure 404 {string} string "The RuleGroup does not exist." // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rule_group/{id} [get] func (h *ruleHandler) GetGroupConfig(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } id := mux.Vars(r)["id"] - group := cluster.GetRuleManager().GetRuleGroup(id) + group := manager.GetRuleGroup(id) if group == nil { h.rd.JSON(w, http.StatusNotFound, nil) return @@ -368,21 +409,26 @@ func (h *ruleHandler) GetGroupConfig(w http.ResponseWriter, r *http.Request) { // @Failure 500 {string} string "PD server failed to proceed the request." 
// @Router /config/rule_group [post] func (h *ruleHandler) SetGroupConfig(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } var ruleGroup placement.RuleGroup if err := apiutil.ReadJSONRespondError(h.rd, w, r.Body, &ruleGroup); err != nil { return } - if err := cluster.GetRuleManager().SetRuleGroup(&ruleGroup); err != nil { + if err := manager.SetRuleGroup(&ruleGroup); err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } - for _, r := range cluster.GetRuleManager().GetRulesByGroup(ruleGroup.ID) { - cluster.AddSuspectKeyRange(r.StartKey, r.EndKey) + cluster := getCluster(r) + for _, rule := range manager.GetRulesByGroup(ruleGroup.ID) { + cluster.AddSuspectKeyRange(rule.StartKey, rule.EndKey) } h.rd.JSON(w, http.StatusOK, "Update rule group successfully.") } @@ -396,18 +442,23 @@ func (h *ruleHandler) SetGroupConfig(w http.ResponseWriter, r *http.Request) { // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rule_group/{id} [delete] func (h *ruleHandler) DeleteGroupConfig(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } id := mux.Vars(r)["id"] - err := cluster.GetRuleManager().DeleteRuleGroup(id) + err = manager.DeleteRuleGroup(id) if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } - for _, r := range cluster.GetRuleManager().GetRulesByGroup(id) { + cluster := getCluster(r) + for _, r := range manager.GetRulesByGroup(id) { cluster.AddSuspectKeyRange(r.StartKey, r.EndKey) } h.rd.JSON(w, http.StatusOK, "Delete rule group successfully.") @@ -418,14 +469,19 @@ func (h *ruleHandler) DeleteGroupConfig(w http.ResponseWriter, r *http.Request) // @Produce json // @Success 200 {array} placement.RuleGroup // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/rule_groups [get] func (h *ruleHandler) GetAllGroupConfigs(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) return } - ruleGroups := cluster.GetRuleManager().GetRuleGroups() + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) + return + } + ruleGroups := manager.GetRuleGroups() h.rd.JSON(w, http.StatusOK, ruleGroups) } @@ -434,14 +490,19 @@ func (h *ruleHandler) GetAllGroupConfigs(w http.ResponseWriter, r *http.Request) // @Produce json // @Success 200 {array} placement.GroupBundle // @Failure 412 {string} string "Placement rules feature is disabled." 
+// @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/placement-rule [get] func (h *ruleHandler) GetPlacementRules(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } - bundles := cluster.GetRuleManager().GetAllGroupBundles() + bundles := manager.GetAllGroupBundles() h.rd.JSON(w, http.StatusOK, bundles) } @@ -455,9 +516,13 @@ func (h *ruleHandler) GetPlacementRules(w http.ResponseWriter, r *http.Request) // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/placement-rule [post] func (h *ruleHandler) SetPlacementRules(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } var groups []placement.GroupBundle @@ -465,7 +530,7 @@ func (h *ruleHandler) SetPlacementRules(w http.ResponseWriter, r *http.Request) return } _, partial := r.URL.Query()["partial"] - if err := cluster.GetRuleManager().SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). + if err := manager.SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). SetAllGroupBundles(groups, !partial); err != nil { if errs.ErrRuleContent.Equal(err) || errs.ErrHexDecodingString.Equal(err) { h.rd.JSON(w, http.StatusBadRequest, err.Error()) @@ -483,14 +548,20 @@ func (h *ruleHandler) SetPlacementRules(w http.ResponseWriter, r *http.Request) // @Produce json // @Success 200 {object} placement.GroupBundle // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/placement-rule/{group} [get] func (h *ruleHandler) GetPlacementRuleByGroup(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) return } - group := cluster.GetRuleManager().GetGroupBundle(mux.Vars(r)["group"]) + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) + return + } + g := mux.Vars(r)["group"] + group := manager.GetGroupBundle(g) h.rd.JSON(w, http.StatusOK, group) } @@ -502,21 +573,26 @@ func (h *ruleHandler) GetPlacementRuleByGroup(w http.ResponseWriter, r *http.Req // @Success 200 {string} string "Delete group and rules successfully." // @Failure 400 {string} string "Bad request." // @Failure 412 {string} string "Placement rules feature is disabled." +// @Failure 500 {string} string "PD server failed to proceed the request." 
// @Router /config/placement-rule [delete] func (h *ruleHandler) DeletePlacementRuleByGroup(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } group := mux.Vars(r)["group"] - group, err := url.PathUnescape(group) + group, err = url.PathUnescape(group) if err != nil { h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } _, regex := r.URL.Query()["regexp"] - if err := cluster.GetRuleManager().DeleteGroupBundle(group, regex); err != nil { + if err := manager.DeleteGroupBundle(group, regex); err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } @@ -532,9 +608,13 @@ func (h *ruleHandler) DeletePlacementRuleByGroup(w http.ResponseWriter, r *http. // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /config/placement-rule/{group} [post] func (h *ruleHandler) SetPlacementRuleByGroup(w http.ResponseWriter, r *http.Request) { - cluster := getCluster(r) - if !cluster.GetOpts().IsPlacementRulesEnabled() { - h.rd.JSON(w, http.StatusPreconditionFailed, errPlacementDisabled.Error()) + manager, err := h.Handler.GetRuleManager() + if err == errs.ErrPlacementDisabled { + h.rd.JSON(w, http.StatusPreconditionFailed, err.Error()) + return + } + if err != nil { + h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } groupID := mux.Vars(r)["group"] @@ -549,7 +629,7 @@ func (h *ruleHandler) SetPlacementRuleByGroup(w http.ResponseWriter, r *http.Req h.rd.JSON(w, http.StatusBadRequest, fmt.Sprintf("group id %s does not match request URI %s", group.ID, groupID)) return } - if err := cluster.GetRuleManager().SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). + if err := manager.SetKeyType(h.svr.GetConfig().PDServerCfg.KeyType). 
SetGroupBundle(group); err != nil { if errs.ErrRuleContent.Equal(err) || errs.ErrHexDecodingString.Equal(err) { h.rd.JSON(w, http.StatusBadRequest, err.Error()) diff --git a/server/api/server.go b/server/api/server.go index ae877b8407c..77a51eb04e5 100644 --- a/server/api/server.go +++ b/server/api/server.go @@ -84,6 +84,31 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP scheapi.APIPathPrefix+"/hotspot", mcs.SchedulingServiceName, []string{http.MethodGet}), + serverapi.MicroserviceRedirectRule( + prefix+"/config/rules", + scheapi.APIPathPrefix+"/config/rules", + mcs.SchedulingServiceName, + []string{http.MethodGet}), + serverapi.MicroserviceRedirectRule( + prefix+"/config/rule/", + scheapi.APIPathPrefix+"/config/rule", + mcs.SchedulingServiceName, + []string{http.MethodGet}), + serverapi.MicroserviceRedirectRule( + prefix+"/config/rule_group/", + scheapi.APIPathPrefix+"/config/rule_groups", // Note: this is a typo in the original code + mcs.SchedulingServiceName, + []string{http.MethodGet}), + serverapi.MicroserviceRedirectRule( + prefix+"/config/rule_groups", + scheapi.APIPathPrefix+"/config/rule_groups", + mcs.SchedulingServiceName, + []string{http.MethodGet}), + serverapi.MicroserviceRedirectRule( + prefix+"/config/placement-rule", + scheapi.APIPathPrefix+"/config/placement-rule", + mcs.SchedulingServiceName, + []string{http.MethodGet}), // because the writing of all the meta information of the scheduling service is in the API server, // we should not post and delete the scheduler directly in the scheduling service. serverapi.MicroserviceRedirectRule( diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go index 15c66ce5829..cfeaa4db033 100644 --- a/tests/integrations/mcs/scheduling/api_test.go +++ b/tests/integrations/mcs/scheduling/api_test.go @@ -15,10 +15,10 @@ import ( _ "github.com/tikv/pd/pkg/mcs/scheduling/server/apis/v1" "github.com/tikv/pd/pkg/mcs/scheduling/server/config" "github.com/tikv/pd/pkg/schedule/handler" + "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/storage" "github.com/tikv/pd/pkg/utils/apiutil" - "github.com/tikv/pd/pkg/utils/tempurl" "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/tests" ) @@ -43,7 +43,7 @@ func TestAPI(t *testing.T) { suite.Run(t, &apiTestSuite{}) } -func (suite *apiTestSuite) SetupSuite() { +func (suite *apiTestSuite) SetupTest() { ctx, cancel := context.WithCancel(context.Background()) suite.ctx = ctx cluster, err := tests.NewTestAPICluster(suite.ctx, 1) @@ -62,14 +62,19 @@ func (suite *apiTestSuite) SetupSuite() { suite.cleanupFunc = func() { cancel() } + tc, err := tests.NewTestSchedulingCluster(suite.ctx, 2, suite.backendEndpoints) + suite.NoError(err) + suite.cluster.SetSchedulingCluster(tc) + tc.WaitForPrimaryServing(suite.Require()) } -func (suite *apiTestSuite) TearDownSuite() { +func (suite *apiTestSuite) TearDownTest() { suite.cluster.Destroy() suite.cleanupFunc() } func (suite *apiTestSuite) TestGetCheckerByName() { + re := suite.Require() testCases := []struct { name string }{ @@ -81,14 +86,8 @@ func (suite *apiTestSuite) TestGetCheckerByName() { {name: "joint-state"}, } - re := suite.Require() - s, cleanup := tests.StartSingleSchedulingTestServer(suite.ctx, re, suite.backendEndpoints, tempurl.Alloc()) - defer cleanup() - testutil.Eventually(re, func() bool { - return s.IsServing() - }, testutil.WithWaitFor(5*time.Second), testutil.WithTickInterval(50*time.Millisecond)) - addr 
:= s.GetAddr() - urlPrefix := fmt.Sprintf("%s/scheduling/api/v1/checkers", addr) + s := suite.cluster.GetSchedulingPrimaryServer() + urlPrefix := fmt.Sprintf("%s/scheduling/api/v1/checkers", s.GetAddr()) co := s.GetCoordinator() for _, testCase := range testCases { @@ -123,17 +122,12 @@ func (suite *apiTestSuite) TestAPIForward() { re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/utils/apiutil/serverapi/checkHeader")) }() - tc, err := tests.NewTestSchedulingCluster(suite.ctx, 2, suite.backendEndpoints) - re.NoError(err) - defer tc.Destroy() - tc.WaitForPrimaryServing(re) - urlPrefix := fmt.Sprintf("%s/pd/api/v1", suite.backendEndpoints) var slice []string var resp map[string]interface{} // Test opeartor - err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "operators"), &slice, + err := testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "operators"), &slice, testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) re.NoError(err) re.Len(slice, 0) @@ -241,6 +235,80 @@ func (suite *apiTestSuite) TestAPIForward() { err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "hotspot/regions/history"), &history, testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) re.NoError(err) + + // Test rules: only forward `GET` request + var rules []*placement.Rule + tests.MustPutRegion(re, suite.cluster, 2, 1, []byte("a"), []byte("b"), core.SetApproximateSize(60)) + rules = []*placement.Rule{ + { + GroupID: "pd", + ID: "default", + Role: "voter", + Count: 3, + LocationLabels: []string{}, + }, + } + rulesArgs, err := json.Marshal(rules) + suite.NoError(err) + + err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "/config/rules"), &rules, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.CheckPostJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rules"), rulesArgs, + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) + err = testutil.CheckPostJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rules/batch"), rulesArgs, + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) + err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rules/group/pd"), &rules, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rules/region/2"), &rules, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + var fit placement.RegionFit + err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rules/region/2/detail"), &fit, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.ReadGetJSON(re, testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rules/key/0000000000000001"), &rules, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.CheckGetJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rule/pd/2"), nil, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.CheckDelete(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rule/pd/2"), + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) + err = testutil.CheckPostJSON(testDialClient, 
fmt.Sprintf("%s/%s", urlPrefix, "config/rule"), rulesArgs, + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) + err = testutil.CheckGetJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rule_group/pd"), nil, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.CheckDelete(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rule_group/pd"), + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) + err = testutil.CheckPostJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rule_group"), rulesArgs, + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) + err = testutil.CheckGetJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/rule_groups"), nil, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.CheckGetJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/placement-rule"), nil, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.CheckPostJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/placement-rule"), rulesArgs, + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) + err = testutil.CheckGetJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/placement-rule/pd"), nil, + testutil.WithHeader(re, apiutil.ForwardToMicroServiceHeader, "true")) + re.NoError(err) + err = testutil.CheckDelete(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/placement-rule/pd"), + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) + err = testutil.CheckPostJSON(testDialClient, fmt.Sprintf("%s/%s", urlPrefix, "config/placement-rule/pd"), rulesArgs, + testutil.WithoutHeader(re, apiutil.ForwardToMicroServiceHeader)) + re.NoError(err) } func (suite *apiTestSuite) TestConfig() { diff --git a/tests/pdctl/config/config_test.go b/tests/pdctl/config/config_test.go index 26d70bb955f..2cc8427911a 100644 --- a/tests/pdctl/config/config_test.go +++ b/tests/pdctl/config/config_test.go @@ -19,6 +19,7 @@ import ( "encoding/json" "os" "reflect" + "strings" "testing" "time" @@ -409,9 +410,12 @@ func (suite *configTestSuite) checkPlacementRuleGroups(cluster *tests.TestCluste // test show var group placement.RuleGroup - output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "placement-rules", "rule-group", "show", "pd") - re.NoError(err) - re.NoError(json.Unmarshal(output, &group)) + testutil.Eventually(re, func() bool { // wait for the config to be synced to the scheduling server + output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "placement-rules", "rule-group", "show", "pd") + re.NoError(err) + return !strings.Contains(string(output), "404") + }) + re.NoError(json.Unmarshal(output, &group), string(output)) re.Equal(placement.RuleGroup{ID: "pd"}, group) // test set diff --git a/server/api/rule_test.go b/tests/server/api/rule_test.go similarity index 67% rename from server/api/rule_test.go rename to tests/server/api/rule_test.go index d2dc50f1119..3ee3357e031 100644 --- a/server/api/rule_test.go +++ b/tests/server/api/rule_test.go @@ -25,57 +25,37 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/core" + "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/schedule/placement" tu "github.com/tikv/pd/pkg/utils/testutil" - "github.com/tikv/pd/server" 
"github.com/tikv/pd/server/config" + "github.com/tikv/pd/tests" ) type ruleTestSuite struct { suite.Suite - svr *server.Server - cleanup tu.CleanupFunc - urlPrefix string } func TestRuleTestSuite(t *testing.T) { suite.Run(t, new(ruleTestSuite)) } -func (suite *ruleTestSuite) SetupSuite() { - re := suite.Require() - suite.svr, suite.cleanup = mustNewServer(re) - server.MustWaitLeader(re, []*server.Server{suite.svr}) - - addr := suite.svr.GetAddr() - suite.urlPrefix = fmt.Sprintf("%s%s/api/v1/config", addr, apiPrefix) - - mustBootstrapCluster(re, suite.svr) - PDServerCfg := suite.svr.GetConfig().PDServerCfg - PDServerCfg.KeyType = "raw" - err := suite.svr.SetPDServerConfig(PDServerCfg) - suite.NoError(err) - suite.NoError(tu.CheckPostJSON(testDialClient, suite.urlPrefix, []byte(`{"enable-placement-rules":"true"}`), tu.StatusOK(re))) -} - -func (suite *ruleTestSuite) TearDownSuite() { - suite.cleanup() -} - -func (suite *ruleTestSuite) TearDownTest() { - def := placement.GroupBundle{ - ID: "pd", - Rules: []*placement.Rule{ - {GroupID: "pd", ID: "default", Role: "voter", Count: 3}, +func (suite *ruleTestSuite) TestSet() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true }, } - data, err := json.Marshal([]placement.GroupBundle{def}) - suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/placement-rule", data, tu.StatusOK(suite.Require())) - suite.NoError(err) + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) + env.RunTestInTwoModes(suite.checkSet) } -func (suite *ruleTestSuite) TestSet() { +func (suite *ruleTestSuite) checkSet(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + rule := placement.Rule{GroupID: "a", ID: "10", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1} successData, err := json.Marshal(rule) suite.NoError(err) @@ -159,12 +139,12 @@ func (suite *ruleTestSuite) TestSet() { for _, testCase := range testCases { suite.T().Log(testCase.name) // clear suspect keyRanges to prevent test case from others - suite.svr.GetRaftCluster().ClearSuspectKeyRanges() + leaderServer.GetRaftCluster().ClearSuspectKeyRanges() if testCase.success { - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", testCase.rawData, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", testCase.rawData, tu.StatusOK(re)) popKeyRangeMap := map[string]struct{}{} for i := 0; i < len(testCase.popKeyRange)/2; i++ { - v, got := suite.svr.GetRaftCluster().PopOneSuspectKeyRange() + v, got := leaderServer.GetRaftCluster().PopOneSuspectKeyRange() suite.True(got) popKeyRangeMap[hex.EncodeToString(v[0])] = struct{}{} popKeyRangeMap[hex.EncodeToString(v[1])] = struct{}{} @@ -175,7 +155,7 @@ func (suite *ruleTestSuite) TestSet() { suite.True(ok) } } else { - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", testCase.rawData, + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", testCase.rawData, tu.StatusNotOK(re), tu.StringEqual(re, testCase.response)) } @@ -184,11 +164,26 @@ func (suite *ruleTestSuite) TestSet() { } func (suite *ruleTestSuite) TestGet() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), 
opts...) + env.RunTestInTwoModes(suite.checkGet) +} + +func (suite *ruleTestSuite) checkGet(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + rule := placement.Rule{GroupID: "a", ID: "20", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1} data, err := json.Marshal(rule) suite.NoError(err) re := suite.Require() - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", data, tu.StatusOK(re)) suite.NoError(err) testCases := []struct { @@ -213,7 +208,7 @@ func (suite *ruleTestSuite) TestGet() { for _, testCase := range testCases { suite.T().Log(testCase.name) var resp placement.Rule - url := fmt.Sprintf("%s/rule/%s/%s", suite.urlPrefix, testCase.rule.GroupID, testCase.rule.ID) + url := fmt.Sprintf("%s/rule/%s/%s", urlPrefix, testCase.rule.GroupID, testCase.rule.ID) if testCase.found { err = tu.ReadGetJSON(re, testDialClient, url, &resp) suite.compareRule(&resp, &testCase.rule) @@ -225,20 +220,50 @@ func (suite *ruleTestSuite) TestGet() { } func (suite *ruleTestSuite) TestGetAll() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) + env.RunTestInTwoModes(suite.checkGetAll) +} + +func (suite *ruleTestSuite) checkGetAll(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + rule := placement.Rule{GroupID: "b", ID: "20", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1} data, err := json.Marshal(rule) suite.NoError(err) re := suite.Require() - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", data, tu.StatusOK(re)) suite.NoError(err) var resp2 []*placement.Rule - err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/rules", &resp2) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/rules", &resp2) suite.NoError(err) suite.GreaterOrEqual(len(resp2), 1) } func (suite *ruleTestSuite) TestSetAll() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) 
+ env.RunTestInTwoModes(suite.checkSetAll) +} + +func (suite *ruleTestSuite) checkSetAll(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + rule1 := placement.Rule{GroupID: "a", ID: "12", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1} rule2 := placement.Rule{GroupID: "b", ID: "12", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1} rule3 := placement.Rule{GroupID: "a", ID: "12", StartKeyHex: "XXXX", EndKeyHex: "3333", Role: "voter", Count: 1} @@ -247,10 +272,10 @@ func (suite *ruleTestSuite) TestSetAll() { LocationLabels: []string{"host"}} rule6 := placement.Rule{GroupID: "pd", ID: "default", StartKeyHex: "", EndKeyHex: "", Role: "voter", Count: 3} - suite.svr.GetPersistOptions().GetReplicationConfig().LocationLabels = []string{"host"} - defaultRule := suite.svr.GetRaftCluster().GetRuleManager().GetRule("pd", "default") + leaderServer.GetPersistOptions().GetReplicationConfig().LocationLabels = []string{"host"} + defaultRule := leaderServer.GetRaftCluster().GetRuleManager().GetRule("pd", "default") defaultRule.LocationLabels = []string{"host"} - suite.svr.GetRaftCluster().GetRuleManager().SetRule(defaultRule) + leaderServer.GetRaftCluster().GetRuleManager().SetRule(defaultRule) successData, err := json.Marshal([]*placement.Rule{&rule1, &rule2}) suite.NoError(err) @@ -333,13 +358,13 @@ func (suite *ruleTestSuite) TestSetAll() { for _, testCase := range testCases { suite.T().Log(testCase.name) if testCase.success { - err := tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rules", testCase.rawData, tu.StatusOK(re)) + err := tu.CheckPostJSON(testDialClient, urlPrefix+"/rules", testCase.rawData, tu.StatusOK(re)) suite.NoError(err) if testCase.isDefaultRule { - suite.Equal(int(suite.svr.GetPersistOptions().GetReplicationConfig().MaxReplicas), testCase.count) + suite.Equal(int(leaderServer.GetPersistOptions().GetReplicationConfig().MaxReplicas), testCase.count) } } else { - err := tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rules", testCase.rawData, + err := tu.CheckPostJSON(testDialClient, urlPrefix+"/rules", testCase.rawData, tu.StringEqual(re, testCase.response)) suite.NoError(err) } @@ -347,17 +372,32 @@ func (suite *ruleTestSuite) TestSetAll() { } func (suite *ruleTestSuite) TestGetAllByGroup() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) 
+ env.RunTestInTwoModes(suite.checkGetAllByGroup) +} + +func (suite *ruleTestSuite) checkGetAllByGroup(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + re := suite.Require() rule := placement.Rule{GroupID: "c", ID: "20", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1} data, err := json.Marshal(rule) suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", data, tu.StatusOK(re)) suite.NoError(err) rule1 := placement.Rule{GroupID: "c", ID: "30", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1} data, err = json.Marshal(rule1) suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", data, tu.StatusOK(re)) suite.NoError(err) testCases := []struct { @@ -380,7 +420,7 @@ func (suite *ruleTestSuite) TestGetAllByGroup() { for _, testCase := range testCases { suite.T().Log(testCase.name) var resp []*placement.Rule - url := fmt.Sprintf("%s/rules/group/%s", suite.urlPrefix, testCase.groupID) + url := fmt.Sprintf("%s/rules/group/%s", urlPrefix, testCase.groupID) err = tu.ReadGetJSON(re, testDialClient, url, &resp) suite.NoError(err) suite.Len(resp, testCase.count) @@ -392,15 +432,30 @@ func (suite *ruleTestSuite) TestGetAllByGroup() { } func (suite *ruleTestSuite) TestGetAllByRegion() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) + env.RunTestInTwoModes(suite.checkGetAllByRegion) +} + +func (suite *ruleTestSuite) checkGetAllByRegion(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + rule := placement.Rule{GroupID: "e", ID: "20", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1} data, err := json.Marshal(rule) suite.NoError(err) re := suite.Require() - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", data, tu.StatusOK(re)) suite.NoError(err) r := core.NewTestRegionInfo(4, 1, []byte{0x22, 0x22}, []byte{0x33, 0x33}) - mustRegionHeartbeat(re, suite.svr, r) + tests.MustPutRegionInfo(re, cluster, r) testCases := []struct { name string @@ -429,7 +484,7 @@ func (suite *ruleTestSuite) TestGetAllByRegion() { for _, testCase := range testCases { suite.T().Log(testCase.name) var resp []*placement.Rule - url := fmt.Sprintf("%s/rules/region/%s", suite.urlPrefix, testCase.regionID) + url := fmt.Sprintf("%s/rules/region/%s", urlPrefix, testCase.regionID) if testCase.success { err = tu.ReadGetJSON(re, testDialClient, url, &resp) @@ -446,11 +501,26 @@ func (suite *ruleTestSuite) TestGetAllByRegion() { } func (suite *ruleTestSuite) TestGetAllByKey() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) 
+ env.RunTestInTwoModes(suite.checkGetAllByKey) +} + +func (suite *ruleTestSuite) checkGetAllByKey(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + rule := placement.Rule{GroupID: "f", ID: "40", StartKeyHex: "8888", EndKeyHex: "9111", Role: "voter", Count: 1} data, err := json.Marshal(rule) suite.NoError(err) re := suite.Require() - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", data, tu.StatusOK(re)) suite.NoError(err) testCases := []struct { @@ -483,7 +553,7 @@ func (suite *ruleTestSuite) TestGetAllByKey() { for _, testCase := range testCases { suite.T().Log(testCase.name) var resp []*placement.Rule - url := fmt.Sprintf("%s/rules/key/%s", suite.urlPrefix, testCase.key) + url := fmt.Sprintf("%s/rules/key/%s", urlPrefix, testCase.key) if testCase.success { err = tu.ReadGetJSON(re, testDialClient, url, &resp) suite.Len(resp, testCase.respSize) @@ -495,10 +565,25 @@ func (suite *ruleTestSuite) TestGetAllByKey() { } func (suite *ruleTestSuite) TestDelete() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) + env.RunTestInTwoModes(suite.checkDelete) +} + +func (suite *ruleTestSuite) checkDelete(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + rule := placement.Rule{GroupID: "g", ID: "10", StartKeyHex: "8888", EndKeyHex: "9111", Role: "voter", Count: 1} data, err := json.Marshal(rule) suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rule", data, tu.StatusOK(suite.Require())) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", data, tu.StatusOK(suite.Require())) suite.NoError(err) oldStartKey, err := hex.DecodeString(rule.StartKeyHex) suite.NoError(err) @@ -529,15 +614,15 @@ func (suite *ruleTestSuite) TestDelete() { } for _, testCase := range testCases { suite.T().Log(testCase.name) - url := fmt.Sprintf("%s/rule/%s/%s", suite.urlPrefix, testCase.groupID, testCase.id) + url := fmt.Sprintf("%s/rule/%s/%s", urlPrefix, testCase.groupID, testCase.id) // clear suspect keyRanges to prevent test case from others - suite.svr.GetRaftCluster().ClearSuspectKeyRanges() + leaderServer.GetRaftCluster().ClearSuspectKeyRanges() err = tu.CheckDelete(testDialClient, url, tu.StatusOK(suite.Require())) suite.NoError(err) if len(testCase.popKeyRange) > 0 { popKeyRangeMap := map[string]struct{}{} for i := 0; i < len(testCase.popKeyRange)/2; i++ { - v, got := suite.svr.GetRaftCluster().PopOneSuspectKeyRange() + v, got := leaderServer.GetRaftCluster().PopOneSuspectKeyRange() suite.True(got) popKeyRangeMap[hex.EncodeToString(v[0])] = struct{}{} popKeyRangeMap[hex.EncodeToString(v[1])] = struct{}{} @@ -551,16 +636,22 @@ func (suite *ruleTestSuite) TestDelete() { } } -func (suite *ruleTestSuite) compareRule(r1 *placement.Rule, r2 *placement.Rule) { - suite.Equal(r2.GroupID, r1.GroupID) - suite.Equal(r2.ID, r1.ID) - suite.Equal(r2.StartKeyHex, r1.StartKeyHex) - suite.Equal(r2.EndKeyHex, r1.EndKeyHex) - suite.Equal(r2.Role, r1.Role) - suite.Equal(r2.Count, r1.Count) +func (suite *ruleTestSuite) TestBatch() { + opts := []tests.ConfigOption{ + func(conf 
*config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) + env.RunTestInTwoModes(suite.checkBatch) } -func (suite *ruleTestSuite) TestBatch() { +func (suite *ruleTestSuite) checkBatch(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + opt1 := placement.RuleOp{ Action: placement.RuleOpAdd, Rule: &placement.Rule{GroupID: "a", ID: "13", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1}, @@ -670,10 +761,10 @@ func (suite *ruleTestSuite) TestBatch() { for _, testCase := range testCases { suite.T().Log(testCase.name) if testCase.success { - err := tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rules/batch", testCase.rawData, tu.StatusOK(re)) + err := tu.CheckPostJSON(testDialClient, urlPrefix+"/rules/batch", testCase.rawData, tu.StatusOK(re)) suite.NoError(err) } else { - err := tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/rules/batch", testCase.rawData, + err := tu.CheckPostJSON(testDialClient, urlPrefix+"/rules/batch", testCase.rawData, tu.StatusNotOK(re), tu.StringEqual(re, testCase.response)) suite.NoError(err) @@ -682,6 +773,21 @@ func (suite *ruleTestSuite) TestBatch() { } func (suite *ruleTestSuite) TestBundle() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) + env.RunTestInTwoModes(suite.checkBundle) +} + +func (suite *ruleTestSuite) checkBundle(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + re := suite.Require() // GetAll b1 := placement.GroupBundle{ @@ -691,7 +797,7 @@ func (suite *ruleTestSuite) TestBundle() { }, } var bundles []placement.GroupBundle - err := tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/placement-rule", &bundles) + err := tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule", &bundles) suite.NoError(err) suite.Len(bundles, 1) suite.compareBundle(bundles[0], b1) @@ -707,28 +813,28 @@ func (suite *ruleTestSuite) TestBundle() { } data, err := json.Marshal(b2) suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/placement-rule/foo", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/placement-rule/foo", data, tu.StatusOK(re)) suite.NoError(err) // Get var bundle placement.GroupBundle - err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/placement-rule/foo", &bundle) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule/foo", &bundle) suite.NoError(err) suite.compareBundle(bundle, b2) // GetAll again - err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/placement-rule", &bundles) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule", &bundles) suite.NoError(err) suite.Len(bundles, 2) suite.compareBundle(bundles[0], b1) suite.compareBundle(bundles[1], b2) // Delete - err = tu.CheckDelete(testDialClient, suite.urlPrefix+"/placement-rule/pd", tu.StatusOK(suite.Require())) + err = tu.CheckDelete(testDialClient, urlPrefix+"/placement-rule/pd", tu.StatusOK(suite.Require())) suite.NoError(err) // GetAll again - err = tu.ReadGetJSON(re, testDialClient, 
suite.urlPrefix+"/placement-rule", &bundles) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule", &bundles) suite.NoError(err) suite.Len(bundles, 1) suite.compareBundle(bundles[0], b2) @@ -739,11 +845,11 @@ func (suite *ruleTestSuite) TestBundle() { b3 := placement.GroupBundle{ID: "foobar", Index: 100} data, err = json.Marshal([]placement.GroupBundle{b1, b2, b3}) suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/placement-rule", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/placement-rule", data, tu.StatusOK(re)) suite.NoError(err) // GetAll again - err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/placement-rule", &bundles) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule", &bundles) suite.NoError(err) suite.Len(bundles, 3) suite.compareBundle(bundles[0], b2) @@ -751,11 +857,11 @@ func (suite *ruleTestSuite) TestBundle() { suite.compareBundle(bundles[2], b3) // Delete using regexp - err = tu.CheckDelete(testDialClient, suite.urlPrefix+"/placement-rule/"+url.PathEscape("foo.*")+"?regexp", tu.StatusOK(suite.Require())) + err = tu.CheckDelete(testDialClient, urlPrefix+"/placement-rule/"+url.PathEscape("foo.*")+"?regexp", tu.StatusOK(suite.Require())) suite.NoError(err) // GetAll again - err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/placement-rule", &bundles) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule", &bundles) suite.NoError(err) suite.Len(bundles, 1) suite.compareBundle(bundles[0], b1) @@ -770,19 +876,19 @@ func (suite *ruleTestSuite) TestBundle() { } data, err = json.Marshal(b4) suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/placement-rule/"+id, data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/placement-rule/"+id, data, tu.StatusOK(re)) suite.NoError(err) b4.ID = id b4.Rules[0].GroupID = b4.ID // Get - err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/placement-rule/"+id, &bundle) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule/"+id, &bundle) suite.NoError(err) suite.compareBundle(bundle, b4) // GetAll again - err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/placement-rule", &bundles) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule", &bundles) suite.NoError(err) suite.Len(bundles, 2) suite.compareBundle(bundles[0], b1) @@ -798,13 +904,13 @@ func (suite *ruleTestSuite) TestBundle() { } data, err = json.Marshal([]placement.GroupBundle{b1, b4, b5}) suite.NoError(err) - err = tu.CheckPostJSON(testDialClient, suite.urlPrefix+"/placement-rule", data, tu.StatusOK(re)) + err = tu.CheckPostJSON(testDialClient, urlPrefix+"/placement-rule", data, tu.StatusOK(re)) suite.NoError(err) b5.Rules[0].GroupID = b5.ID // GetAll again - err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"/placement-rule", &bundles) + err = tu.ReadGetJSON(re, testDialClient, urlPrefix+"/placement-rule", &bundles) suite.NoError(err) suite.Len(bundles, 3) suite.compareBundle(bundles[0], b1) @@ -813,6 +919,21 @@ func (suite *ruleTestSuite) TestBundle() { } func (suite *ruleTestSuite) TestBundleBadRequest() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.PDServerCfg.KeyType = "raw" + conf.Replication.EnablePlacementRules = true + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) 
+ env.RunTestInTwoModes(suite.checkBundleBadRequest) +} + +func (suite *ruleTestSuite) checkBundleBadRequest(cluster *tests.TestCluster) { + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1/config", pdAddr, apiPrefix) + testCases := []struct { uri string data string @@ -826,7 +947,7 @@ func (suite *ruleTestSuite) TestBundleBadRequest() { {"/placement-rule", `[{"group_id":"foo", "rules": [{"group_id":"bar", "id":"baz", "role":"voter", "count":1}]}]`, false}, } for _, testCase := range testCases { - err := tu.CheckPostJSON(testDialClient, suite.urlPrefix+testCase.uri, []byte(testCase.data), + err := tu.CheckPostJSON(testDialClient, urlPrefix+testCase.uri, []byte(testCase.data), func(_ []byte, code int, _ http.Header) { suite.Equal(testCase.ok, code == http.StatusOK) }) @@ -844,22 +965,42 @@ func (suite *ruleTestSuite) compareBundle(b1, b2 placement.GroupBundle) { } } +func (suite *ruleTestSuite) compareRule(r1 *placement.Rule, r2 *placement.Rule) { + suite.Equal(r2.GroupID, r1.GroupID) + suite.Equal(r2.ID, r1.ID) + suite.Equal(r2.StartKeyHex, r1.StartKeyHex) + suite.Equal(r2.EndKeyHex, r1.EndKeyHex) + suite.Equal(r2.Role, r1.Role) + suite.Equal(r2.Count, r1.Count) +} + type regionRuleTestSuite struct { suite.Suite - svr *server.Server - grpcSvr *server.GrpcServer - cleanup tu.CleanupFunc - urlPrefix string - stores []*metapb.Store - regions []*core.RegionInfo } func TestRegionRuleTestSuite(t *testing.T) { suite.Run(t, new(regionRuleTestSuite)) } -func (suite *regionRuleTestSuite) SetupSuite() { - suite.stores = []*metapb.Store{ +func (suite *regionRuleTestSuite) TestRegionPlacementRule() { + opts := []tests.ConfigOption{ + func(conf *config.Config, serverName string) { + conf.Replication.EnablePlacementRules = true + conf.Replication.MaxReplicas = 1 + }, + } + env := tests.NewSchedulingTestEnvironment(suite.T(), opts...) + // FIXME: enable this test in two modes after we support region label forward. 
+ env.RunTestInPDMode(suite.checkRegionPlacementRule) +} + +func (suite *regionRuleTestSuite) checkRegionPlacementRule(cluster *tests.TestCluster) { + re := suite.Require() + leaderServer := cluster.GetLeaderServer() + pdAddr := leaderServer.GetAddr() + urlPrefix := fmt.Sprintf("%s%s/api/v1", pdAddr, apiPrefix) + + stores := []*metapb.Store{ { Id: 1, Address: "tikv1", @@ -875,49 +1016,30 @@ func (suite *regionRuleTestSuite) SetupSuite() { Version: "2.0.0", }, } - re := suite.Require() - suite.svr, suite.cleanup = mustNewServer(re, func(cfg *config.Config) { - cfg.Replication.EnablePlacementRules = true - cfg.Replication.MaxReplicas = 1 - }) - server.MustWaitLeader(re, []*server.Server{suite.svr}) - - addr := suite.svr.GetAddr() - suite.grpcSvr = &server.GrpcServer{Server: suite.svr} - suite.urlPrefix = fmt.Sprintf("%s%s/api/v1", addr, apiPrefix) - - mustBootstrapCluster(re, suite.svr) - - for _, store := range suite.stores { - mustPutStore(re, suite.svr, store.Id, store.State, store.NodeState, nil) + for _, store := range stores { + tests.MustPutStore(re, cluster, store) } - suite.regions = make([]*core.RegionInfo, 0) + regions := make([]*core.RegionInfo, 0) peers1 := []*metapb.Peer{ {Id: 102, StoreId: 1, Role: metapb.PeerRole_Voter}, {Id: 103, StoreId: 2, Role: metapb.PeerRole_Voter}} - suite.regions = append(suite.regions, core.NewRegionInfo(&metapb.Region{Id: 1, Peers: peers1, RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 1}}, peers1[0], + regions = append(regions, core.NewRegionInfo(&metapb.Region{Id: 1, Peers: peers1, RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 1}}, peers1[0], core.WithStartKey([]byte("abc")), core.WithEndKey([]byte("def")))) peers2 := []*metapb.Peer{ {Id: 104, StoreId: 1, Role: metapb.PeerRole_Voter}, {Id: 105, StoreId: 2, Role: metapb.PeerRole_Learner}} - suite.regions = append(suite.regions, core.NewRegionInfo(&metapb.Region{Id: 2, Peers: peers2, RegionEpoch: &metapb.RegionEpoch{ConfVer: 2, Version: 2}}, peers2[0], + regions = append(regions, core.NewRegionInfo(&metapb.Region{Id: 2, Peers: peers2, RegionEpoch: &metapb.RegionEpoch{ConfVer: 2, Version: 2}}, peers2[0], core.WithStartKey([]byte("ghi")), core.WithEndKey([]byte("jkl")))) peers3 := []*metapb.Peer{ {Id: 106, StoreId: 1, Role: metapb.PeerRole_Voter}, {Id: 107, StoreId: 2, Role: metapb.PeerRole_Learner}} - suite.regions = append(suite.regions, core.NewRegionInfo(&metapb.Region{Id: 3, Peers: peers3, RegionEpoch: &metapb.RegionEpoch{ConfVer: 3, Version: 3}}, peers3[0], + regions = append(regions, core.NewRegionInfo(&metapb.Region{Id: 3, Peers: peers3, RegionEpoch: &metapb.RegionEpoch{ConfVer: 3, Version: 3}}, peers3[0], core.WithStartKey([]byte("mno")), core.WithEndKey([]byte("pqr")))) - for _, rg := range suite.regions { - suite.svr.GetBasicCluster().PutRegion(rg) + for _, rg := range regions { + tests.MustPutRegionInfo(re, cluster, rg) } -} - -func (suite *regionRuleTestSuite) TearDownSuite() { - suite.cleanup() -} -func (suite *regionRuleTestSuite) TestRegionPlacementRule() { - ruleManager := suite.svr.GetRaftCluster().GetRuleManager() + ruleManager := leaderServer.GetRaftCluster().GetRuleManager() ruleManager.SetRule(&placement.Rule{ GroupID: "test", ID: "test2", @@ -934,38 +1056,38 @@ func (suite *regionRuleTestSuite) TestRegionPlacementRule() { Role: placement.Learner, Count: 1, }) - re := suite.Require() - url := fmt.Sprintf("%s/config/rules/region/%d/detail", suite.urlPrefix, 1) fit := &placement.RegionFit{} + + url := fmt.Sprintf("%s/config/rules/region/%d/detail", urlPrefix, 1) err 
:= tu.ReadGetJSON(re, testDialClient, url, fit) + suite.NoError(err) suite.Equal(len(fit.RuleFits), 1) suite.Equal(len(fit.OrphanPeers), 1) - suite.NoError(err) - url = fmt.Sprintf("%s/config/rules/region/%d/detail", suite.urlPrefix, 2) + url = fmt.Sprintf("%s/config/rules/region/%d/detail", urlPrefix, 2) fit = &placement.RegionFit{} err = tu.ReadGetJSON(re, testDialClient, url, fit) + suite.NoError(err) suite.Equal(len(fit.RuleFits), 2) suite.Equal(len(fit.OrphanPeers), 0) - suite.NoError(err) - url = fmt.Sprintf("%s/config/rules/region/%d/detail", suite.urlPrefix, 3) + url = fmt.Sprintf("%s/config/rules/region/%d/detail", urlPrefix, 3) fit = &placement.RegionFit{} err = tu.ReadGetJSON(re, testDialClient, url, fit) + suite.NoError(err) suite.Equal(len(fit.RuleFits), 0) suite.Equal(len(fit.OrphanPeers), 2) - suite.NoError(err) - url = fmt.Sprintf("%s/config/rules/region/%d/detail", suite.urlPrefix, 4) + url = fmt.Sprintf("%s/config/rules/region/%d/detail", urlPrefix, 4) err = tu.CheckGetJSON(testDialClient, url, nil, tu.Status(re, http.StatusNotFound), tu.StringContain( re, "region 4 not found")) suite.NoError(err) - url = fmt.Sprintf("%s/config/rules/region/%s/detail", suite.urlPrefix, "id") + url = fmt.Sprintf("%s/config/rules/region/%s/detail", urlPrefix, "id") err = tu.CheckGetJSON(testDialClient, url, nil, tu.Status(re, http.StatusBadRequest), tu.StringContain( - re, "invalid region id")) + re, errs.ErrRegionInvalidID.Error())) suite.NoError(err) - suite.svr.GetRaftCluster().GetReplicationConfig().EnablePlacementRules = false - url = fmt.Sprintf("%s/config/rules/region/%d/detail", suite.urlPrefix, 1) + leaderServer.GetRaftCluster().GetReplicationConfig().EnablePlacementRules = false + url = fmt.Sprintf("%s/config/rules/region/%d/detail", urlPrefix, 1) err = tu.CheckGetJSON(testDialClient, url, nil, tu.Status(re, http.StatusPreconditionFailed), tu.StringContain( re, "placement rules feature is disabled")) suite.NoError(err)
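
The converted tests above all share one shape: the per-suite server, urlPrefix, and TearDown fixtures are dropped, each Test* method only builds a tests.NewSchedulingTestEnvironment with the placement-rule config options, and the actual assertions move into a check*(cluster *tests.TestCluster) method that derives its URL prefix from the current leader. A minimal sketch of that shape is shown below; it is not part of the patch, the TestExample/checkExample names are hypothetical stand-ins for the real cases, and it assumes the imports and suite type already present in this file.

func (suite *ruleTestSuite) TestExample() { // hypothetical test, for illustration only
	// Build the cluster with the same options the converted tests use.
	opts := []tests.ConfigOption{
		func(conf *config.Config, serverName string) {
			conf.PDServerCfg.KeyType = "raw"
			conf.Replication.EnablePlacementRules = true
		},
	}
	env := tests.NewSchedulingTestEnvironment(suite.T(), opts...)
	// Run the same check once against plain PD and once with the
	// scheduling microservice enabled.
	env.RunTestInTwoModes(suite.checkExample)
}

func (suite *ruleTestSuite) checkExample(cluster *tests.TestCluster) { // hypothetical check body
	re := suite.Require()
	leaderServer := cluster.GetLeaderServer()
	// Derive the URL prefix from the current leader instead of a suite-wide field.
	urlPrefix := fmt.Sprintf("%s%s/api/v1/config", leaderServer.GetAddr(), apiPrefix)

	rule := placement.Rule{GroupID: "a", ID: "example", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1}
	data, err := json.Marshal(rule)
	suite.NoError(err)
	suite.NoError(tu.CheckPostJSON(testDialClient, urlPrefix+"/rule", data, tu.StatusOK(re)))
}

Tests that the scheduling service cannot serve yet, such as the region placement rule case gated by the FIXME about region label forwarding, keep the same structure but call env.RunTestInPDMode instead, so switching them to two modes later is a one-line change once forwarding support lands.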