Merge branch 'master' into support_transfer_primary2

HuSharp · Jul 8, 2024 · 9e3b798 · 9e3b798
2 parents ec8e737 + b35f18d
commit 9e3b798
Show file tree

Hide file tree

Showing 27 changed files with 444 additions and 213 deletions.
diff --git a/Makefile b/Makefile
@@ -254,7 +254,7 @@ basic-test: install-tools
 
 ci-test-job: install-tools dashboard-ui pd-ut
 	@$(FAILPOINT_ENABLE)
-	./scripts/ci-subtask.sh $(JOB_COUNT) $(JOB_INDEX) || { $(FAILPOINT_DISABLE); exit 1; }
+	./scripts/ci-subtask.sh $(JOB_INDEX) || { $(FAILPOINT_DISABLE); exit 1; }
 	@$(FAILPOINT_DISABLE)
 
 TSO_INTEGRATION_TEST_PKGS := $(PD_PKG)/tests/server/tso

diff --git a/client/go.mod b/client/go.mod
@@ -33,6 +33,7 @@ require (
 	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/common v0.46.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
+	github.com/stretchr/objx v0.5.0 // indirect
 	go.uber.org/multierr v1.11.0 // indirect
 	golang.org/x/net v0.23.0 // indirect
 	golang.org/x/sys v0.18.0 // indirect

diff --git a/client/go.sum b/client/go.sum
@@ -68,6 +68,7 @@ github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncj
 github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=

diff --git a/client/metrics.go b/client/metrics.go
@@ -105,7 +105,7 @@ func initMetrics(constLabels prometheus.Labels) {
 			Subsystem:   "request",
 			Name:        "tso_batch_send_latency",
 			ConstLabels: constLabels,
-			Buckets:     prometheus.ExponentialBuckets(1, 2, 34), // 1ns ~ 8s
+			Buckets:     prometheus.ExponentialBuckets(0.0005, 2, 13),
 			Help:        "tso batch send latency",
 		})
 

diff --git a/client/resource_group/controller/config.go b/client/resource_group/controller/config.go
@@ -52,8 +52,10 @@ const (
 	defaultTargetPeriod = 5 * time.Second
 	// defaultMaxWaitDuration is the max duration to wait for the token before throwing error.
 	defaultMaxWaitDuration = 30 * time.Second
+	// defaultLTBTokenRPCMaxDelay is the upper bound of backoff delay for local token bucket RPC.
+	defaultLTBTokenRPCMaxDelay = 1 * time.Second
 	// defaultWaitRetryTimes is the times to retry when waiting for the token.
-	defaultWaitRetryTimes = 10
+	defaultWaitRetryTimes = 20
 	// defaultWaitRetryInterval is the interval to retry when waiting for the token.
 	defaultWaitRetryInterval = 50 * time.Millisecond
 )
@@ -77,23 +79,35 @@ const (
 
 	// Because the resource manager has not been deployed in microservice mode,
 	// do not enable this function.
-	defaultDegradedModeWaitDuration = 0
+	defaultDegradedModeWaitDuration = time.Duration(0)
 	defaultAvgBatchProportion       = 0.7
 )
 
-// Config is the configuration of the resource manager controller which includes some option for client needed.
-type Config struct {
+// TokenRPCParams holds the parameters for the local bucket RPC.
+type TokenRPCParams struct {
+	// WaitRetryInterval is the interval to retry when waiting for the token.
+	WaitRetryInterval Duration `toml:"wait-retry-interval" json:"wait-retry-interval"`
+
+	// WaitRetryTimes is the number of times to retry when waiting for the token.
+	WaitRetryTimes int `toml:"wait-retry-times" json:"wait-retry-times"`
+}
+
+// LocalBucketConfig is the configuration for the local bucket. It is not exported to the server side.
+type LocalBucketConfig struct {
+	TokenRPCParams `toml:"token-rpc-params" json:"token-rpc-params"`
+}
+
+// BaseConfig is the configuration of the resource manager controller, which includes the options needed by the client.
+// TODO: unify the configuration for client and server; the server side is in pkg/mcs/resourcemanager/config.go.
+type BaseConfig struct {
 	// EnableDegradedMode is to control whether resource control client enable degraded mode when server is disconnect.
 	DegradedModeWaitDuration Duration `toml:"degraded-mode-wait-duration" json:"degraded-mode-wait-duration"`
 
 	// LTBMaxWaitDuration is the max wait time duration for local token bucket.
 	LTBMaxWaitDuration Duration `toml:"ltb-max-wait-duration" json:"ltb-max-wait-duration"`
 
-	// WaitRetryInterval is the interval to retry when waiting for the token.
-	WaitRetryInterval Duration `toml:"wait-retry-interval" json:"wait-retry-interval"`
-
-	// WaitRetryTimes is the times to retry when waiting for the token.
-	WaitRetryTimes int `toml:"wait-retry-times" json:"wait-retry-times"`
+	// LTBTokenRPCMaxDelay is the upper bound of backoff delay for local token bucket RPC.
+	LTBTokenRPCMaxDelay Duration `toml:"ltb-token-rpc-max-delay" json:"ltb-token-rpc-max-delay"`
 
 	// RequestUnit is the configuration determines the coefficients of the RRU and WRU cost.
 	// This configuration should be modified carefully.
@@ -103,15 +117,43 @@ type Config struct {
 	EnableControllerTraceLog bool `toml:"enable-controller-trace-log" json:"enable-controller-trace-log,string"`
 }
 
+// Config is the configuration of the resource manager controller.
+type Config struct {
+	BaseConfig
+	LocalBucketConfig
+}
+
+// Adjust adjusts the configuration.
+func (c *Config) Adjust() {
+	// Validate the configuration. TODO: add a separate validation function.
+	if c.BaseConfig.LTBMaxWaitDuration.Duration == 0 {
+		c.BaseConfig.LTBMaxWaitDuration = NewDuration(defaultMaxWaitDuration)
+	}
+	if c.LocalBucketConfig.WaitRetryInterval.Duration == 0 {
+		c.LocalBucketConfig.WaitRetryInterval = NewDuration(defaultWaitRetryInterval)
+	}
+	// Adjust the client settings: recalculate the retry times from LTBTokenRPCMaxDelay and WaitRetryInterval when they are inconsistent.
+	if int(c.BaseConfig.LTBTokenRPCMaxDelay.Duration) != int(c.LocalBucketConfig.WaitRetryInterval.Duration)*c.LocalBucketConfig.WaitRetryTimes {
+		c.LocalBucketConfig.WaitRetryTimes = int(c.BaseConfig.LTBTokenRPCMaxDelay.Duration / c.LocalBucketConfig.WaitRetryInterval.Duration)
+	}
+}
+
 // DefaultConfig returns the default resource manager controller configuration.
 func DefaultConfig() *Config {
 	return &Config{
-		DegradedModeWaitDuration: NewDuration(defaultDegradedModeWaitDuration),
-		LTBMaxWaitDuration:       NewDuration(defaultMaxWaitDuration),
-		WaitRetryInterval:        NewDuration(defaultWaitRetryInterval),
-		WaitRetryTimes:           defaultWaitRetryTimes,
-		RequestUnit:              DefaultRequestUnitConfig(),
-		EnableControllerTraceLog: false,
+		BaseConfig: BaseConfig{
+			DegradedModeWaitDuration: NewDuration(defaultDegradedModeWaitDuration),
+			RequestUnit:              DefaultRequestUnitConfig(),
+			EnableControllerTraceLog: false,
+			LTBMaxWaitDuration:       NewDuration(defaultMaxWaitDuration),
+			LTBTokenRPCMaxDelay:      NewDuration(defaultLTBTokenRPCMaxDelay),
+		},
+		LocalBucketConfig: LocalBucketConfig{
+			TokenRPCParams: TokenRPCParams{
+				WaitRetryInterval: NewDuration(defaultWaitRetryInterval),
+				WaitRetryTimes:    defaultWaitRetryTimes,
+			},
+		},
 	}
 }
 

diff --git a/client/resource_group/controller/controller.go b/client/resource_group/controller/controller.go
@@ -193,6 +193,7 @@ func NewResourceGroupController(
 	log.Info("load resource controller config", zap.Reflect("config", config), zap.Reflect("ru-config", controller.ruConfig))
 	controller.calculators = []ResourceCalculator{newKVCalculator(controller.ruConfig), newSQLCalculator(controller.ruConfig)}
 	controller.safeRuConfig.Store(controller.ruConfig)
+	enableControllerTraceLog.Store(config.EnableControllerTraceLog)
 	return controller, nil
 }
 
@@ -201,12 +202,13 @@ func loadServerConfig(ctx context.Context, provider ResourceGroupProvider) (*Con
 	if err != nil {
 		return nil, err
 	}
+	config := DefaultConfig()
+	defer config.Adjust()
 	kvs := resp.GetKvs()
 	if len(kvs) == 0 {
 		log.Warn("[resource group controller] server does not save config, load config failed")
-		return DefaultConfig(), nil
+		return config, nil
 	}
-	config := DefaultConfig()
 	err = json.Unmarshal(kvs[0].GetValue(), config)
 	if err != nil {
 		return nil, err
@@ -309,7 +311,6 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
 						watchRetryTimer.Reset(watchRetryInterval)
 					}
 				}
-
 			case <-emergencyTokenAcquisitionTicker.C:
 				c.executeOnAllGroups((*groupCostController).resetEmergencyTokenAcquisition)
 			/* channels */
@@ -329,9 +330,7 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
 			case notifyMsg := <-c.lowTokenNotifyChan:
 				c.executeOnAllGroups((*groupCostController).updateRunState)
 				c.executeOnAllGroups((*groupCostController).updateAvgRequestResourcePerSec)
-				if len(c.run.currentRequests) == 0 {
-					c.collectTokenBucketRequests(c.loopCtx, FromLowRU, lowToken /* select low tokens resource group */, notifyMsg)
-				}
+				c.collectTokenBucketRequests(c.loopCtx, FromLowRU, lowToken /* select low tokens resource group */, notifyMsg)
 				if c.run.inDegradedMode {
 					c.executeOnAllGroups((*groupCostController).applyDegradedMode)
 				}
@@ -391,6 +390,7 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
 					if err := json.Unmarshal(item.Kv.Value, config); err != nil {
 						continue
 					}
+					config.Adjust()
 					c.ruConfig = GenerateRUConfig(config)
 
 					// Stay compatible with serverless
@@ -404,7 +404,6 @@ func (c *ResourceGroupsController) Start(ctx context.Context) {
 					}
 					log.Info("load resource controller config after config changed", zap.Reflect("config", config), zap.Reflect("ruConfig", c.ruConfig))
 				}
-
 			case gc := <-c.tokenBucketUpdateChan:
 				go gc.handleTokenBucketUpdateEvent(c.loopCtx)
 			}
@@ -1178,11 +1177,19 @@ func (gc *groupCostController) collectRequestAndConsumption(selectTyp selectType
 			switch selectTyp {
 			case periodicReport:
 				selected = selected || gc.shouldReportConsumption()
+				failpoint.Inject("triggerPeriodicReport", func(val failpoint.Value) {
+					selected = gc.name == val.(string)
+				})
 				fallthrough
 			case lowToken:
 				if counter.limiter.IsLowTokens() {
 					selected = true
 				}
+				failpoint.Inject("triggerLowRUReport", func(val failpoint.Value) {
+					if selectTyp == lowToken {
+						selected = gc.name == val.(string)
+					}
+				})
 			}
 			request := &rmpb.RequestUnitItem{
 				Type:  typ,

diff --git a/client/resource_group/controller/controller_test.go b/client/resource_group/controller/controller_test.go
@@ -24,8 +24,12 @@ import (
 	"testing"
 	"time"
 
+	"github.com/pingcap/failpoint"
+	"github.com/pingcap/kvproto/pkg/meta_storagepb"
 	rmpb "github.com/pingcap/kvproto/pkg/resource_manager"
+	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
+	pd "github.com/tikv/pd/client"
 	"github.com/tikv/pd/client/errs"
 )
 
@@ -132,3 +136,138 @@ func TestResourceGroupThrottledError(t *testing.T) {
 	re.Error(err)
 	re.True(errs.ErrClientResourceGroupThrottled.Equal(err))
 }
+
+// MockResourceGroupProvider is a mock implementation of the ResourceGroupProvider interface.
+type MockResourceGroupProvider struct {
+	mock.Mock
+}
+
+func (m *MockResourceGroupProvider) GetResourceGroup(ctx context.Context, resourceGroupName string, opts ...pd.GetResourceGroupOption) (*rmpb.ResourceGroup, error) {
+	args := m.Called(ctx, resourceGroupName, opts)
+	return args.Get(0).(*rmpb.ResourceGroup), args.Error(1)
+}
+
+func (m *MockResourceGroupProvider) ListResourceGroups(ctx context.Context, opts ...pd.GetResourceGroupOption) ([]*rmpb.ResourceGroup, error) {
+	args := m.Called(ctx, opts)
+	return args.Get(0).([]*rmpb.ResourceGroup), args.Error(1)
+}
+
+func (m *MockResourceGroupProvider) AddResourceGroup(ctx context.Context, metaGroup *rmpb.ResourceGroup) (string, error) {
+	args := m.Called(ctx, metaGroup)
+	return args.String(0), args.Error(1)
+}
+
+func (m *MockResourceGroupProvider) ModifyResourceGroup(ctx context.Context, metaGroup *rmpb.ResourceGroup) (string, error) {
+	args := m.Called(ctx, metaGroup)
+	return args.String(0), args.Error(1)
+}
+
+func (m *MockResourceGroupProvider) DeleteResourceGroup(ctx context.Context, resourceGroupName string) (string, error) {
+	args := m.Called(ctx, resourceGroupName)
+	return args.String(0), args.Error(1)
+}
+
+func (m *MockResourceGroupProvider) AcquireTokenBuckets(ctx context.Context, request *rmpb.TokenBucketsRequest) ([]*rmpb.TokenBucketResponse, error) {
+	args := m.Called(ctx, request)
+	return args.Get(0).([]*rmpb.TokenBucketResponse), args.Error(1)
+}
+
+func (m *MockResourceGroupProvider) LoadResourceGroups(ctx context.Context) ([]*rmpb.ResourceGroup, int64, error) {
+	args := m.Called(ctx)
+	return args.Get(0).([]*rmpb.ResourceGroup), args.Get(1).(int64), args.Error(2)
+}
+
+func (m *MockResourceGroupProvider) Watch(ctx context.Context, key []byte, opts ...pd.OpOption) (chan []*meta_storagepb.Event, error) {
+	args := m.Called(ctx, key, opts)
+	return args.Get(0).(chan []*meta_storagepb.Event), args.Error(1)
+}
+
+func (m *MockResourceGroupProvider) Get(ctx context.Context, key []byte, opts ...pd.OpOption) (*meta_storagepb.GetResponse, error) {
+	args := m.Called(ctx, key, opts)
+	return args.Get(0).(*meta_storagepb.GetResponse), args.Error(1)
+}
+
+func TestControllerWithTwoGroupRequestConcurrency(t *testing.T) {
+	re := require.New(t)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	mockProvider := new(MockResourceGroupProvider)
+
+	mockProvider.On("Get", mock.Anything, mock.Anything, mock.Anything).Return(&meta_storagepb.GetResponse{}, nil)
+	// LoadResourceGroups
+	mockProvider.On("LoadResourceGroups", mock.Anything).Return([]*rmpb.ResourceGroup{}, int64(0), nil)
+	// Watch
+	mockProvider.On("Watch", mock.Anything, mock.Anything, mock.Anything).Return(make(chan []*meta_storagepb.Event), nil)
+
+	re.NoError(failpoint.Enable("github.com/tikv/pd/client/resource_group/controller/triggerPeriodicReport", fmt.Sprintf("return(\"%s\")", "default")))
+	defer failpoint.Disable("github.com/tikv/pd/client/resource_group/controller/triggerPeriodicReport")
+	re.NoError(failpoint.Enable("github.com/tikv/pd/client/resource_group/controller/triggerLowRUReport", fmt.Sprintf("return(\"%s\")", "test-group")))
+	defer failpoint.Disable("github.com/tikv/pd/client/resource_group/controller/triggerLowRUReport")
+
+	controller, _ := NewResourceGroupController(ctx, 1, mockProvider, nil)
+	controller.Start(ctx)
+
+	defaultResourceGroup := &rmpb.ResourceGroup{Name: "default", Mode: rmpb.GroupMode_RUMode, RUSettings: &rmpb.GroupRequestUnitSettings{RU: &rmpb.TokenBucket{Settings: &rmpb.TokenLimitSettings{FillRate: 1000000}}}}
+	testResourceGroup := &rmpb.ResourceGroup{Name: "test-group", Mode: rmpb.GroupMode_RUMode, RUSettings: &rmpb.GroupRequestUnitSettings{RU: &rmpb.TokenBucket{Settings: &rmpb.TokenLimitSettings{FillRate: 1000000}}}}
+	mockProvider.On("GetResourceGroup", mock.Anything, "default", mock.Anything).Return(defaultResourceGroup, nil)
+	mockProvider.On("GetResourceGroup", mock.Anything, "test-group", mock.Anything).Return(testResourceGroup, nil)
+
+	c1, err := controller.tryGetResourceGroup(ctx, "default")
+	re.NoError(err)
+	re.Equal(defaultResourceGroup, c1.meta)
+
+	c2, err := controller.tryGetResourceGroup(ctx, "test-group")
+	re.NoError(err)
+	re.Equal(testResourceGroup, c2.meta)
+
+	var expectResp []*rmpb.TokenBucketResponse
+	recTestGroupAcquireTokenRequest := make(chan bool)
+	mockProvider.On("AcquireTokenBuckets", mock.Anything, mock.Anything).Run(func(args mock.Arguments) {
+		request := args.Get(1).(*rmpb.TokenBucketsRequest)
+		var responses []*rmpb.TokenBucketResponse
+		for _, req := range request.Requests {
+			if req.ResourceGroupName == "default" {
+				// Stall the default group request so it gets no response during the test, which means `len(c.run.currentRequests) != 0` always holds.
+				time.Sleep(100 * time.Second)
+				responses = append(responses, &rmpb.TokenBucketResponse{
+					ResourceGroupName: "default",
+					GrantedRUTokens: []*rmpb.GrantedRUTokenBucket{
+						{
+							GrantedTokens: &rmpb.TokenBucket{
+								Tokens: 100000,
+							},
+						},
+					},
+				})
+			} else {
+				responses = append(responses, &rmpb.TokenBucketResponse{
+					ResourceGroupName: req.ResourceGroupName,
+					GrantedRUTokens: []*rmpb.GrantedRUTokenBucket{
+						{
+							GrantedTokens: &rmpb.TokenBucket{
+								Tokens: 100000,
+							},
+						},
+					},
+				})
+			}
+		}
+		// receive test-group request
+		if len(request.Requests) == 1 && request.Requests[0].ResourceGroupName == "test-group" {
+			recTestGroupAcquireTokenRequest <- true
+		}
+		expectResp = responses
+	}).Return(expectResp, nil)
+	// Wait for the default group to request tokens via the periodic report.
+	time.Sleep(2 * time.Second)
+	counter := c2.run.requestUnitTokens[0]
+	counter.limiter.mu.Lock()
+	counter.limiter.notify()
+	counter.limiter.mu.Unlock()
+	select {
+	case res := <-recTestGroupAcquireTokenRequest:
+		re.True(res)
+	case <-time.After(5 * time.Second):
+		re.Fail("timeout")
+	}
+}