From c1eabda686e539e4bf13c166fda6b2ac29cced39 Mon Sep 17 00:00:00 2001 From: Yongbo Jiang Date: Tue, 12 Mar 2024 17:53:09 +0800 Subject: [PATCH 1/5] client: fix scheme when tls config not match (#7901) close tikv/pd#7900, close tikv/pd#7902 Signed-off-by: Cabinfever_B Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- client/client.go | 21 +- client/grpcutil/grpcutil.go | 4 +- client/http/client.go | 12 +- client/meta_storage_client.go | 4 +- client/mock_pd_service_discovery.go | 16 +- client/pd_service_discovery.go | 301 +++++++++--------- client/pd_service_discovery_test.go | 78 +++-- client/resource_manager_client.go | 2 +- client/tso_client.go | 68 ++-- client/tso_dispatcher.go | 60 ++-- client/tso_service_discovery.go | 218 ++++++------- client/tso_stream.go | 34 +- pkg/utils/grpcutil/grpcutil.go | 4 +- tests/integrations/client/client_test.go | 17 +- tests/integrations/client/http_client_test.go | 2 +- .../resourcemanager/resource_manager_test.go | 2 +- 16 files changed, 446 insertions(+), 397 deletions(-) diff --git a/client/client.go b/client/client.go index 81bf809ef4d9..e2ceb41cfd27 100644 --- a/client/client.go +++ b/client/client.go @@ -76,9 +76,9 @@ type Client interface { GetClusterID(ctx context.Context) uint64 // GetAllMembers gets the members Info from PD GetAllMembers(ctx context.Context) ([]*pdpb.Member, error) - // GetLeaderAddr returns current leader's address. It returns "" before + // GetLeaderURL returns current leader's URL. It returns "" before // syncing leader from server. - GetLeaderAddr() string + GetLeaderURL() string // GetRegion gets a region and its leader Peer from PD by key. // The region may expire after split. Caller is responsible for caching and // taking care of region change. @@ -575,7 +575,7 @@ func (c *client) setup() error { } // Register callbacks - c.pdSvcDiscovery.AddServingAddrSwitchedCallback(c.scheduleUpdateTokenConnection) + c.pdSvcDiscovery.AddServingURLSwitchedCallback(c.scheduleUpdateTokenConnection) // Create dispatchers c.createTokenDispatcher() @@ -680,9 +680,9 @@ func (c *client) GetClusterID(context.Context) uint64 { return c.pdSvcDiscovery.GetClusterID() } -// GetLeaderAddr returns the leader address. -func (c *client) GetLeaderAddr() string { - return c.pdSvcDiscovery.GetServingAddr() +// GetLeaderURL returns the leader URL. +func (c *client) GetLeaderURL() string { + return c.pdSvcDiscovery.GetServingURL() } // GetServiceDiscovery returns the client-side service discovery object @@ -1402,9 +1402,14 @@ func IsLeaderChange(err error) bool { strings.Contains(errMsg, errs.NotServedErr) } +const ( + httpSchemePrefix = "http://" + httpsSchemePrefix = "https://" +) + func trimHTTPPrefix(str string) string { - str = strings.TrimPrefix(str, "http://") - str = strings.TrimPrefix(str, "https://") + str = strings.TrimPrefix(str, httpSchemePrefix) + str = strings.TrimPrefix(str, httpsSchemePrefix) return str } diff --git a/client/grpcutil/grpcutil.go b/client/grpcutil/grpcutil.go index 070cdf7822f6..fb9e84f0ca1a 100644 --- a/client/grpcutil/grpcutil.go +++ b/client/grpcutil/grpcutil.go @@ -84,8 +84,8 @@ func GetClientConn(ctx context.Context, addr string, tlsCfg *tls.Config, do ...g // BuildForwardContext creates a context with receiver metadata information. // It is used in client side. 
-func BuildForwardContext(ctx context.Context, addr string) context.Context { - md := metadata.Pairs(ForwardMetadataKey, addr) +func BuildForwardContext(ctx context.Context, url string) context.Context { + md := metadata.Pairs(ForwardMetadataKey, url) return metadata.NewOutgoingContext(ctx, md) } diff --git a/client/http/client.go b/client/http/client.go index 5ac00a8a43b3..18802346a4c5 100644 --- a/client/http/client.go +++ b/client/http/client.go @@ -130,13 +130,13 @@ func (ci *clientInner) requestWithRetry( return errs.ErrClientNoAvailableMember } for _, cli := range clients { - addr := cli.GetHTTPAddress() - statusCode, err = ci.doRequest(ctx, addr, reqInfo, headerOpts...) + url := cli.GetURL() + statusCode, err = ci.doRequest(ctx, url, reqInfo, headerOpts...) if err == nil || noNeedRetry(statusCode) { return err } - log.Debug("[pd] request addr failed", - zap.String("source", ci.source), zap.Bool("is-leader", cli.IsConnectedToLeader()), zap.String("addr", addr), zap.Error(err)) + log.Debug("[pd] request url failed", + zap.String("source", ci.source), zap.Bool("is-leader", cli.IsConnectedToLeader()), zap.String("url", url), zap.Error(err)) } return err } @@ -160,19 +160,19 @@ func noNeedRetry(statusCode int) bool { func (ci *clientInner) doRequest( ctx context.Context, - addr string, reqInfo *requestInfo, + url string, reqInfo *requestInfo, headerOpts ...HeaderOption, ) (int, error) { var ( source = ci.source callerID = reqInfo.callerID name = reqInfo.name - url = reqInfo.getURL(addr) method = reqInfo.method body = reqInfo.body res = reqInfo.res respHandler = reqInfo.respHandler ) + url = reqInfo.getURL(url) logFields := []zap.Field{ zap.String("source", source), zap.String("name", name), diff --git a/client/meta_storage_client.go b/client/meta_storage_client.go index 8b158af22128..fe7e8a33e932 100644 --- a/client/meta_storage_client.go +++ b/client/meta_storage_client.go @@ -124,7 +124,7 @@ func (c *client) Put(ctx context.Context, key, value []byte, opts ...OpOption) ( Lease: options.lease, PrevKv: options.prevKv, } - ctx = grpcutil.BuildForwardContext(ctx, c.GetLeaderAddr()) + ctx = grpcutil.BuildForwardContext(ctx, c.GetLeaderURL()) cli := c.metaStorageClient() if cli == nil { cancel() @@ -162,7 +162,7 @@ func (c *client) Get(ctx context.Context, key []byte, opts ...OpOption) (*meta_s Limit: options.limit, Revision: options.revision, } - ctx = grpcutil.BuildForwardContext(ctx, c.GetLeaderAddr()) + ctx = grpcutil.BuildForwardContext(ctx, c.GetLeaderURL()) cli := c.metaStorageClient() if cli == nil { cancel() diff --git a/client/mock_pd_service_discovery.go b/client/mock_pd_service_discovery.go index 10f7f0801068..b33c8405af9e 100644 --- a/client/mock_pd_service_discovery.go +++ b/client/mock_pd_service_discovery.go @@ -41,7 +41,7 @@ func NewMockPDServiceDiscovery(urls []string, tlsCfg *tls.Config) *mockPDService func (m *mockPDServiceDiscovery) Init() error { m.clients = make([]ServiceClient, 0, len(m.urls)) for _, url := range m.urls { - m.clients = append(m.clients, newPDServiceClient(url, url, m.tlsCfg, nil, false)) + m.clients = append(m.clients, newPDServiceClient(url, url, nil, false)) } return nil } @@ -62,13 +62,13 @@ func (m *mockPDServiceDiscovery) GetKeyspaceGroupID() uint32 func (m *mockPDServiceDiscovery) GetServiceURLs() []string { return nil } func (m *mockPDServiceDiscovery) GetServingEndpointClientConn() *grpc.ClientConn { return nil } func (m *mockPDServiceDiscovery) GetClientConns() *sync.Map { return nil } -func (m *mockPDServiceDiscovery) GetServingAddr() string 
{ return "" } -func (m *mockPDServiceDiscovery) GetBackupAddrs() []string { return nil } +func (m *mockPDServiceDiscovery) GetServingURL() string { return "" } +func (m *mockPDServiceDiscovery) GetBackupURLs() []string { return nil } func (m *mockPDServiceDiscovery) GetServiceClient() ServiceClient { return nil } -func (m *mockPDServiceDiscovery) GetOrCreateGRPCConn(addr string) (*grpc.ClientConn, error) { +func (m *mockPDServiceDiscovery) GetOrCreateGRPCConn(url string) (*grpc.ClientConn, error) { return nil, nil } -func (m *mockPDServiceDiscovery) ScheduleCheckMemberChanged() {} -func (m *mockPDServiceDiscovery) CheckMemberChanged() error { return nil } -func (m *mockPDServiceDiscovery) AddServingAddrSwitchedCallback(callbacks ...func()) {} -func (m *mockPDServiceDiscovery) AddServiceAddrsSwitchedCallback(callbacks ...func()) {} +func (m *mockPDServiceDiscovery) ScheduleCheckMemberChanged() {} +func (m *mockPDServiceDiscovery) CheckMemberChanged() error { return nil } +func (m *mockPDServiceDiscovery) AddServingURLSwitchedCallback(callbacks ...func()) {} +func (m *mockPDServiceDiscovery) AddServiceURLsSwitchedCallback(callbacks ...func()) {} diff --git a/client/pd_service_discovery.go b/client/pd_service_discovery.go index 5d9105e76815..bf627d76ac2e 100644 --- a/client/pd_service_discovery.go +++ b/client/pd_service_discovery.go @@ -17,7 +17,7 @@ package pd import ( "context" "crypto/tls" - "fmt" + "net/url" "reflect" "sort" "strings" @@ -87,15 +87,15 @@ type ServiceDiscovery interface { // which is the leader in a quorum-based cluster or the primary in a primary/secondary // configured cluster. GetServingEndpointClientConn() *grpc.ClientConn - // GetClientConns returns the mapping {addr -> a gRPC connection} + // GetClientConns returns the mapping {URL -> a gRPC connection} GetClientConns() *sync.Map - // GetServingAddr returns the serving endpoint which is the leader in a quorum-based cluster + // GetServingURL returns the serving endpoint which is the leader in a quorum-based cluster // or the primary in a primary/secondary configured cluster. - GetServingAddr() string - // GetBackupAddrs gets the addresses of the current reachable backup service + GetServingURL() string + // GetBackupURLs gets the URLs of the current reachable backup service // endpoints. Backup service endpoints are followers in a quorum-based cluster or // secondaries in a primary/secondary configured cluster. - GetBackupAddrs() []string + GetBackupURLs() []string // GetServiceClient tries to get the leader/primary ServiceClient. // If the leader ServiceClient meets network problem, // it returns a follower/secondary ServiceClient which can forward the request to leader. @@ -103,8 +103,8 @@ type ServiceDiscovery interface { // GetAllServiceClients tries to get all ServiceClient. // If the leader is not nil, it will put the leader service client first in the slice. GetAllServiceClients() []ServiceClient - // GetOrCreateGRPCConn returns the corresponding grpc client connection of the given addr - GetOrCreateGRPCConn(addr string) (*grpc.ClientConn, error) + // GetOrCreateGRPCConn returns the corresponding grpc client connection of the given url. + GetOrCreateGRPCConn(url string) (*grpc.ClientConn, error) // ScheduleCheckMemberChanged is used to trigger a check to see if there is any membership change // among the leader/followers in a quorum-based cluster or among the primary/secondaries in a // primary/secondary configured cluster. 
@@ -112,22 +112,20 @@ type ServiceDiscovery interface { // CheckMemberChanged immediately check if there is any membership change among the leader/followers // in a quorum-based cluster or among the primary/secondaries in a primary/secondary configured cluster. CheckMemberChanged() error - // AddServingAddrSwitchedCallback adds callbacks which will be called when the leader + // AddServingURLSwitchedCallback adds callbacks which will be called when the leader // in a quorum-based cluster or the primary in a primary/secondary configured cluster // is switched. - AddServingAddrSwitchedCallback(callbacks ...func()) - // AddServiceAddrsSwitchedCallback adds callbacks which will be called when any leader/follower + AddServingURLSwitchedCallback(callbacks ...func()) + // AddServiceURLsSwitchedCallback adds callbacks which will be called when any leader/follower // in a quorum-based cluster or any primary/secondary in a primary/secondary configured cluster // is changed. - AddServiceAddrsSwitchedCallback(callbacks ...func()) + AddServiceURLsSwitchedCallback(callbacks ...func()) } // ServiceClient is an interface that defines a set of operations for a raw PD gRPC client to specific PD server. type ServiceClient interface { - // GetAddress returns the address information of the PD server. - GetAddress() string - // GetHTTPAddress returns the address with HTTP scheme of the PD server. - GetHTTPAddress() string + // GetURL returns the client url of the PD/etcd server. + GetURL() string // GetClientConn returns the gRPC connection of the service client GetClientConn() *grpc.ClientConn // BuildGRPCTargetContext builds a context object with a gRPC context. @@ -149,43 +147,23 @@ var ( ) type pdServiceClient struct { - addr string - httpAddress string - conn *grpc.ClientConn - isLeader bool - leaderAddr string + url string + conn *grpc.ClientConn + isLeader bool + leaderURL string networkFailure atomic.Bool } -func newPDServiceClient(addr, leaderAddr string, tlsCfg *tls.Config, conn *grpc.ClientConn, isLeader bool) ServiceClient { - var httpAddress string - if tlsCfg == nil { - if strings.HasPrefix(addr, httpsScheme) { - addr = strings.TrimPrefix(addr, httpsScheme) - httpAddress = fmt.Sprintf("%s%s", httpScheme, addr) - } else if strings.HasPrefix(addr, httpScheme) { - httpAddress = addr - } else { - httpAddress = fmt.Sprintf("%s://%s", httpScheme, addr) - } - } else { - if strings.HasPrefix(addr, httpsScheme) { - httpAddress = addr - } else if strings.HasPrefix(addr, httpScheme) { - addr = strings.TrimPrefix(addr, httpScheme) - httpAddress = fmt.Sprintf("%s%s", httpsScheme, addr) - } else { - httpAddress = fmt.Sprintf("%s://%s", httpsScheme, addr) - } - } - +// NOTE: In the current implementation, the URL passed in is bound to have a scheme, +// because it is processed in `newPDServiceDiscovery`, and the url returned by etcd member owns the sheme. +// When testing, the URL is also bound to have a scheme. +func newPDServiceClient(url, leaderURL string, conn *grpc.ClientConn, isLeader bool) ServiceClient { cli := &pdServiceClient{ - addr: addr, - httpAddress: httpAddress, - conn: conn, - isLeader: isLeader, - leaderAddr: leaderAddr, + url: url, + conn: conn, + isLeader: isLeader, + leaderURL: leaderURL, } if conn == nil { cli.networkFailure.Store(true) @@ -193,20 +171,12 @@ func newPDServiceClient(addr, leaderAddr string, tlsCfg *tls.Config, conn *grpc. return cli } -// GetAddress implements ServiceClient. 
-func (c *pdServiceClient) GetAddress() string { - if c == nil { - return "" - } - return c.addr -} - -// GetHTTPAddress implements ServiceClient. -func (c *pdServiceClient) GetHTTPAddress() string { +// GetURL implements ServiceClient. +func (c *pdServiceClient) GetURL() string { if c == nil { return "" } - return c.httpAddress + return c.url } // BuildGRPCTargetContext implements ServiceClient. @@ -215,7 +185,7 @@ func (c *pdServiceClient) BuildGRPCTargetContext(ctx context.Context, toLeader b return ctx } if toLeader { - return grpcutil.BuildForwardContext(ctx, c.leaderAddr) + return grpcutil.BuildForwardContext(ctx, c.leaderURL) } return grpcutil.BuildFollowerHandleContext(ctx) } @@ -243,7 +213,7 @@ func (c *pdServiceClient) checkNetworkAvailable(ctx context.Context) { healthCli := healthpb.NewHealthClient(c.conn) resp, err := healthCli.Check(ctx, &healthpb.HealthCheckRequest{Service: ""}) failpoint.Inject("unreachableNetwork1", func(val failpoint.Value) { - if val, ok := val.(string); (ok && val == c.GetAddress()) || !ok { + if val, ok := val.(string); (ok && val == c.GetURL()) || !ok { resp = nil err = status.New(codes.Unavailable, "unavailable").Err() } @@ -412,16 +382,16 @@ func (c *pdServiceBalancer) get() (ret ServiceClient) { } type updateKeyspaceIDFunc func() error -type tsoLocalServAddrsUpdatedFunc func(map[string]string) error -type tsoGlobalServAddrUpdatedFunc func(string) error +type tsoLocalServURLsUpdatedFunc func(map[string]string) error +type tsoGlobalServURLUpdatedFunc func(string) error type tsoAllocatorEventSource interface { - // SetTSOLocalServAddrsUpdatedCallback adds a callback which will be called when the local tso + // SetTSOLocalServURLsUpdatedCallback adds a callback which will be called when the local tso // allocator leader list is updated. - SetTSOLocalServAddrsUpdatedCallback(callback tsoLocalServAddrsUpdatedFunc) - // SetTSOGlobalServAddrUpdatedCallback adds a callback which will be called when the global tso + SetTSOLocalServURLsUpdatedCallback(callback tsoLocalServURLsUpdatedFunc) + // SetTSOGlobalServURLUpdatedCallback adds a callback which will be called when the global tso // allocator leader is updated. - SetTSOGlobalServAddrUpdatedCallback(callback tsoGlobalServAddrUpdatedFunc) + SetTSOGlobalServURLUpdatedCallback(callback tsoGlobalServURLUpdatedFunc) } var ( @@ -442,10 +412,10 @@ type pdServiceDiscovery struct { all atomic.Value // Store as []pdServiceClient apiCandidateNodes [apiKindCount]*pdServiceBalancer // PD follower URLs. Only for tso. - followerAddresses atomic.Value // Store as []string + followerURLs atomic.Value // Store as []string clusterID uint64 - // addr -> a gRPC connection + // url -> a gRPC connection clientConns sync.Map // Store as map[string]*grpc.ClientConn // serviceModeUpdateCb will be called when the service mode gets updated @@ -456,11 +426,11 @@ type pdServiceDiscovery struct { // leader and followers membersChangedCbs []func() // tsoLocalAllocLeadersUpdatedCb will be called when the local tso allocator - // leader list is updated. The input is a map {DC Location -> Leader Addr} - tsoLocalAllocLeadersUpdatedCb tsoLocalServAddrsUpdatedFunc + // leader list is updated. The input is a map {DC Location -> Leader URL} + tsoLocalAllocLeadersUpdatedCb tsoLocalServURLsUpdatedFunc // tsoGlobalAllocLeaderUpdatedCb will be called when the global tso allocator // leader is updated. 
- tsoGlobalAllocLeaderUpdatedCb tsoGlobalServAddrUpdatedFunc + tsoGlobalAllocLeaderUpdatedCb tsoGlobalServURLUpdatedFunc checkMembershipCh chan struct{} @@ -506,7 +476,7 @@ func newPDServiceDiscovery( tlsCfg: tlsCfg, option: option, } - urls = addrsToUrls(urls) + urls = addrsToURLs(urls, tlsCfg) pdsd.urls.Store(urls) return pdsd } @@ -701,17 +671,17 @@ func (c *pdServiceDiscovery) discoverMicroservice(svcType serviceType) (urls []s case apiService: urls = c.GetServiceURLs() case tsoService: - leaderAddr := c.getLeaderAddr() - if len(leaderAddr) > 0 { - clusterInfo, err := c.getClusterInfo(c.ctx, leaderAddr, c.option.timeout) + leaderURL := c.getLeaderURL() + if len(leaderURL) > 0 { + clusterInfo, err := c.getClusterInfo(c.ctx, leaderURL, c.option.timeout) if err != nil { log.Error("[pd] failed to get cluster info", - zap.String("leader-addr", leaderAddr), errs.ZapError(err)) + zap.String("leader-url", leaderURL), errs.ZapError(err)) return nil, err } urls = clusterInfo.TsoUrls } else { - err = errors.New("failed to get leader addr") + err = errors.New("failed to get leader url") return nil, err } default: @@ -731,26 +701,26 @@ func (c *pdServiceDiscovery) GetServiceURLs() []string { // which is the leader in a quorum-based cluster or the primary in a primary/secondary // configured cluster. func (c *pdServiceDiscovery) GetServingEndpointClientConn() *grpc.ClientConn { - if cc, ok := c.clientConns.Load(c.getLeaderAddr()); ok { + if cc, ok := c.clientConns.Load(c.getLeaderURL()); ok { return cc.(*grpc.ClientConn) } return nil } -// GetClientConns returns the mapping {addr -> a gRPC connection} +// GetClientConns returns the mapping {URL -> a gRPC connection} func (c *pdServiceDiscovery) GetClientConns() *sync.Map { return &c.clientConns } -// GetServingAddr returns the leader address -func (c *pdServiceDiscovery) GetServingAddr() string { - return c.getLeaderAddr() +// GetServingURL returns the leader url +func (c *pdServiceDiscovery) GetServingURL() string { + return c.getLeaderURL() } -// GetBackupAddrs gets the addresses of the current reachable followers +// GetBackupURLs gets the URLs of the current reachable followers // in a quorum-based cluster. Used for tso currently. -func (c *pdServiceDiscovery) GetBackupAddrs() []string { - return c.getFollowerAddrs() +func (c *pdServiceDiscovery) GetBackupURLs() []string { + return c.getFollowerURLs() } // getLeaderServiceClient returns the leader ServiceClient. @@ -776,7 +746,7 @@ func (c *pdServiceDiscovery) GetServiceClient() ServiceClient { leaderClient := c.getLeaderServiceClient() if c.option.enableForwarding && !leaderClient.Available() { if followerClient := c.getServiceClientByKind(forwardAPIKind); followerClient != nil { - log.Debug("[pd] use follower client", zap.String("addr", followerClient.GetAddress())) + log.Debug("[pd] use follower client", zap.String("url", followerClient.GetURL())) return followerClient } } @@ -811,46 +781,46 @@ func (c *pdServiceDiscovery) CheckMemberChanged() error { return c.updateMember() } -// AddServingAddrSwitchedCallback adds callbacks which will be called +// AddServingURLSwitchedCallback adds callbacks which will be called // when the leader is switched. -func (c *pdServiceDiscovery) AddServingAddrSwitchedCallback(callbacks ...func()) { +func (c *pdServiceDiscovery) AddServingURLSwitchedCallback(callbacks ...func()) { c.leaderSwitchedCbs = append(c.leaderSwitchedCbs, callbacks...) 
} -// AddServiceAddrsSwitchedCallback adds callbacks which will be called when +// AddServiceURLsSwitchedCallback adds callbacks which will be called when // any leader/follower is changed. -func (c *pdServiceDiscovery) AddServiceAddrsSwitchedCallback(callbacks ...func()) { +func (c *pdServiceDiscovery) AddServiceURLsSwitchedCallback(callbacks ...func()) { c.membersChangedCbs = append(c.membersChangedCbs, callbacks...) } -// SetTSOLocalServAddrsUpdatedCallback adds a callback which will be called when the local tso +// SetTSOLocalServURLsUpdatedCallback adds a callback which will be called when the local tso // allocator leader list is updated. -func (c *pdServiceDiscovery) SetTSOLocalServAddrsUpdatedCallback(callback tsoLocalServAddrsUpdatedFunc) { +func (c *pdServiceDiscovery) SetTSOLocalServURLsUpdatedCallback(callback tsoLocalServURLsUpdatedFunc) { c.tsoLocalAllocLeadersUpdatedCb = callback } -// SetTSOGlobalServAddrUpdatedCallback adds a callback which will be called when the global tso +// SetTSOGlobalServURLUpdatedCallback adds a callback which will be called when the global tso // allocator leader is updated. -func (c *pdServiceDiscovery) SetTSOGlobalServAddrUpdatedCallback(callback tsoGlobalServAddrUpdatedFunc) { - addr := c.getLeaderAddr() - if len(addr) > 0 { - callback(addr) +func (c *pdServiceDiscovery) SetTSOGlobalServURLUpdatedCallback(callback tsoGlobalServURLUpdatedFunc) { + url := c.getLeaderURL() + if len(url) > 0 { + callback(url) } c.tsoGlobalAllocLeaderUpdatedCb = callback } -// getLeaderAddr returns the leader address. -func (c *pdServiceDiscovery) getLeaderAddr() string { - return c.getLeaderServiceClient().GetAddress() +// getLeaderURL returns the leader URL. +func (c *pdServiceDiscovery) getLeaderURL() string { + return c.getLeaderServiceClient().GetURL() } -// getFollowerAddrs returns the follower address. -func (c *pdServiceDiscovery) getFollowerAddrs() []string { - followerAddrs := c.followerAddresses.Load() - if followerAddrs == nil { +// getFollowerURLs returns the follower URLs. +func (c *pdServiceDiscovery) getFollowerURLs() []string { + followerURLs := c.followerURLs.Load() + if followerURLs == nil { return []string{} } - return followerAddrs.([]string) + return followerURLs.([]string) } func (c *pdServiceDiscovery) initClusterID() error { @@ -884,12 +854,12 @@ func (c *pdServiceDiscovery) initClusterID() error { } func (c *pdServiceDiscovery) checkServiceModeChanged() error { - leaderAddr := c.getLeaderAddr() - if len(leaderAddr) == 0 { + leaderURL := c.getLeaderURL() + if len(leaderURL) == 0 { return errors.New("no leader found") } - clusterInfo, err := c.getClusterInfo(c.ctx, leaderAddr, c.option.timeout) + clusterInfo, err := c.getClusterInfo(c.ctx, leaderURL, c.option.timeout) if err != nil { if strings.Contains(err.Error(), "Unimplemented") { // If the method is not supported, we set it to pd mode. 
@@ -928,7 +898,7 @@ func (c *pdServiceDiscovery) updateMember() error { var errTSO error if err == nil { if members.GetLeader() == nil || len(members.GetLeader().GetClientUrls()) == 0 { - err = errs.ErrClientGetLeader.FastGenByArgs("leader address doesn't exist") + err = errs.ErrClientGetLeader.FastGenByArgs("leader url doesn't exist") } // Still need to update TsoAllocatorLeaders, even if there is no PD leader errTSO = c.switchTSOAllocatorLeaders(members.GetTsoAllocatorLeaders()) @@ -936,8 +906,8 @@ func (c *pdServiceDiscovery) updateMember() error { // Failed to get members if err != nil { - log.Info("[pd] cannot update member from this address", - zap.String("address", url), + log.Info("[pd] cannot update member from this url", + zap.String("url", url), errs.ZapError(err)) select { case <-c.ctx.Done(): @@ -1020,68 +990,67 @@ func (c *pdServiceDiscovery) updateURLs(members []*pdpb.Member) { log.Info("[pd] update member urls", zap.Strings("old-urls", oldURLs), zap.Strings("new-urls", urls)) } -func (c *pdServiceDiscovery) switchLeader(addrs []string) (bool, error) { - // FIXME: How to safely compare leader urls? For now, only allows one client url. - addr := addrs[0] +func (c *pdServiceDiscovery) switchLeader(url string) (bool, error) { oldLeader := c.getLeaderServiceClient() - if addr == oldLeader.GetAddress() && oldLeader.GetClientConn() != nil { + if url == oldLeader.GetURL() && oldLeader.GetClientConn() != nil { return false, nil } - newConn, err := c.GetOrCreateGRPCConn(addr) + newConn, err := c.GetOrCreateGRPCConn(url) // If gRPC connect is created successfully or leader is new, still saves. - if addr != oldLeader.GetAddress() || newConn != nil { + if url != oldLeader.GetURL() || newConn != nil { // Set PD leader and Global TSO Allocator (which is also the PD leader) - leaderClient := newPDServiceClient(addr, addr, c.tlsCfg, newConn, true) + leaderClient := newPDServiceClient(url, url, newConn, true) c.leader.Store(leaderClient) } // Run callbacks if c.tsoGlobalAllocLeaderUpdatedCb != nil { - if err := c.tsoGlobalAllocLeaderUpdatedCb(addr); err != nil { + if err := c.tsoGlobalAllocLeaderUpdatedCb(url); err != nil { return true, err } } for _, cb := range c.leaderSwitchedCbs { cb() } - log.Info("[pd] switch leader", zap.String("new-leader", addr), zap.String("old-leader", oldLeader.GetAddress())) + log.Info("[pd] switch leader", zap.String("new-leader", url), zap.String("old-leader", oldLeader.GetURL())) return true, err } -func (c *pdServiceDiscovery) updateFollowers(members []*pdpb.Member, leader *pdpb.Member) (changed bool) { +func (c *pdServiceDiscovery) updateFollowers(members []*pdpb.Member, leaderID uint64, leaderURL string) (changed bool) { followers := make(map[string]*pdServiceClient) c.followers.Range(func(key, value any) bool { followers[key.(string)] = value.(*pdServiceClient) return true }) - var followerAddrs []string + var followerURLs []string for _, member := range members { - if member.GetMemberId() != leader.GetMemberId() { + if member.GetMemberId() != leaderID { if len(member.GetClientUrls()) > 0 { - followerAddrs = append(followerAddrs, member.GetClientUrls()...) + // Now we don't apply ServiceClient for TSO Follower Proxy, so just keep the all URLs. + followerURLs = append(followerURLs, member.GetClientUrls()...) // FIXME: How to safely compare urls(also for leader)? For now, only allows one client url. 
- addr := member.GetClientUrls()[0] - if client, ok := c.followers.Load(addr); ok { + url := pickMatchedURL(member.GetClientUrls(), c.tlsCfg) + if client, ok := c.followers.Load(url); ok { if client.(*pdServiceClient).GetClientConn() == nil { - conn, err := c.GetOrCreateGRPCConn(addr) + conn, err := c.GetOrCreateGRPCConn(url) if err != nil || conn == nil { - log.Warn("[pd] failed to connect follower", zap.String("follower", addr), errs.ZapError(err)) + log.Warn("[pd] failed to connect follower", zap.String("follower", url), errs.ZapError(err)) continue } - follower := newPDServiceClient(addr, leader.GetClientUrls()[0], c.tlsCfg, conn, false) - c.followers.Store(addr, follower) + follower := newPDServiceClient(url, leaderURL, conn, false) + c.followers.Store(url, follower) changed = true } - delete(followers, addr) + delete(followers, url) } else { changed = true - conn, err := c.GetOrCreateGRPCConn(addr) - follower := newPDServiceClient(addr, leader.GetClientUrls()[0], c.tlsCfg, conn, false) + conn, err := c.GetOrCreateGRPCConn(url) + follower := newPDServiceClient(url, leaderURL, conn, false) if err != nil || conn == nil { - log.Warn("[pd] failed to connect follower", zap.String("follower", addr), errs.ZapError(err)) + log.Warn("[pd] failed to connect follower", zap.String("follower", url), errs.ZapError(err)) } - c.followers.LoadOrStore(addr, follower) + c.followers.LoadOrStore(url, follower) } } } @@ -1092,13 +1061,15 @@ func (c *pdServiceDiscovery) updateFollowers(members []*pdpb.Member, leader *pdp c.followers.Delete(key) } } - c.followerAddresses.Store(followerAddrs) + c.followerURLs.Store(followerURLs) return } func (c *pdServiceDiscovery) updateServiceClient(members []*pdpb.Member, leader *pdpb.Member) error { - leaderChanged, err := c.switchLeader(leader.GetClientUrls()) - followerChanged := c.updateFollowers(members, leader) + // FIXME: How to safely compare leader urls? For now, only allows one client url. + leaderURL := pickMatchedURL(leader.GetClientUrls(), c.tlsCfg) + leaderChanged, err := c.switchLeader(leaderURL) + followerChanged := c.updateFollowers(members, leader.GetMemberId(), leaderURL) // don't need to recreate balancer if no changess. if !followerChanged && !leaderChanged { return err @@ -1145,20 +1116,54 @@ func (c *pdServiceDiscovery) switchTSOAllocatorLeaders(allocatorMap map[string]* return nil } -// GetOrCreateGRPCConn returns the corresponding grpc client connection of the given addr -func (c *pdServiceDiscovery) GetOrCreateGRPCConn(addr string) (*grpc.ClientConn, error) { - return grpcutil.GetOrCreateGRPCConn(c.ctx, &c.clientConns, addr, c.tlsCfg, c.option.gRPCDialOptions...) +// GetOrCreateGRPCConn returns the corresponding grpc client connection of the given URL. +func (c *pdServiceDiscovery) GetOrCreateGRPCConn(url string) (*grpc.ClientConn, error) { + return grpcutil.GetOrCreateGRPCConn(c.ctx, &c.clientConns, url, c.tlsCfg, c.option.gRPCDialOptions...) } -func addrsToUrls(addrs []string) []string { +func addrsToURLs(addrs []string, tlsCfg *tls.Config) []string { // Add default schema "http://" to addrs. 
urls := make([]string, 0, len(addrs)) for _, addr := range addrs { - if strings.Contains(addr, "://") { - urls = append(urls, addr) - } else { - urls = append(urls, "http://"+addr) - } + urls = append(urls, modifyURLScheme(addr, tlsCfg)) } return urls } + +func modifyURLScheme(uStr string, tlsCfg *tls.Config) string { + u, err := url.Parse(uStr) + if err != nil { + if tlsCfg != nil { + return httpsSchemePrefix + uStr + } + return httpSchemePrefix + uStr + } + if tlsCfg != nil { + u.Scheme = httpsScheme + } else { + u.Scheme = httpScheme + } + return u.String() +} + +// pickMatchedURL picks the matched URL based on the TLS config. +// Note: please make sure the URLs are valid. +func pickMatchedURL(urls []string, tlsCfg *tls.Config) string { + for _, uStr := range urls { + u, err := url.Parse(uStr) + if err != nil { + continue + } + if tlsCfg != nil && u.Scheme == httpsScheme { + return uStr + } + if tlsCfg == nil && u.Scheme == httpScheme { + return uStr + } + } + ret := modifyURLScheme(urls[0], tlsCfg) + log.Warn("[pd] no matched url found", zap.Strings("urls", urls), + zap.Bool("tls-enabled", tlsCfg != nil), + zap.String("attempted-url", ret)) + return ret +} diff --git a/client/pd_service_discovery_test.go b/client/pd_service_discovery_test.go index 226d407b56b4..2373fc4c3049 100644 --- a/client/pd_service_discovery_test.go +++ b/client/pd_service_discovery_test.go @@ -140,10 +140,16 @@ func (suite *serviceClientTestSuite) SetupSuite() { leaderConn, err1 := grpc.Dial(suite.leaderServer.addr, grpc.WithTransportCredentials(insecure.NewCredentials())) followerConn, err2 := grpc.Dial(suite.followerServer.addr, grpc.WithTransportCredentials(insecure.NewCredentials())) if err1 == nil && err2 == nil { - suite.followerClient = newPDServiceClient(suite.followerServer.addr, suite.leaderServer.addr, nil, followerConn, false) - suite.leaderClient = newPDServiceClient(suite.leaderServer.addr, suite.leaderServer.addr, nil, leaderConn, true) + suite.followerClient = newPDServiceClient( + modifyURLScheme(suite.followerServer.addr, nil), + modifyURLScheme(suite.leaderServer.addr, nil), + followerConn, false) + suite.leaderClient = newPDServiceClient( + modifyURLScheme(suite.leaderServer.addr, nil), + modifyURLScheme(suite.leaderServer.addr, nil), + leaderConn, true) suite.followerServer.server.leaderConn = suite.leaderClient.GetClientConn() - suite.followerServer.server.leaderAddr = suite.leaderClient.GetAddress() + suite.followerServer.server.leaderAddr = suite.leaderClient.GetURL() return } time.Sleep(50 * time.Millisecond) @@ -166,16 +172,14 @@ func (suite *serviceClientTestSuite) TearDownSuite() { func (suite *serviceClientTestSuite) TestServiceClient() { re := suite.Require() - leaderAddress := suite.leaderServer.addr - followerAddress := suite.followerServer.addr + leaderAddress := modifyURLScheme(suite.leaderServer.addr, nil) + followerAddress := modifyURLScheme(suite.followerServer.addr, nil) follower := suite.followerClient leader := suite.leaderClient - re.Equal(follower.GetAddress(), followerAddress) - re.Equal(leader.GetAddress(), leaderAddress) - re.Equal(follower.GetHTTPAddress(), "http://"+followerAddress) - re.Equal(leader.GetHTTPAddress(), "http://"+leaderAddress) + re.Equal(follower.GetURL(), followerAddress) + re.Equal(leader.GetURL(), leaderAddress) re.True(follower.Available()) re.True(leader.Available()) @@ -301,18 +305,48 @@ func (suite *serviceClientTestSuite) TestServiceClientBalancer() { re.Equal(int32(5), suite.followerServer.server.getForwardCount()) } -func TestHTTPScheme(t 
*testing.T) { +func TestServiceClientScheme(t *testing.T) { re := require.New(t) - cli := newPDServiceClient("127.0.0.1:2379", "127.0.0.1:2379", nil, nil, false) - re.Equal("http://127.0.0.1:2379", cli.GetHTTPAddress()) - cli = newPDServiceClient("https://127.0.0.1:2379", "127.0.0.1:2379", nil, nil, false) - re.Equal("http://127.0.0.1:2379", cli.GetHTTPAddress()) - cli = newPDServiceClient("http://127.0.0.1:2379", "127.0.0.1:2379", nil, nil, false) - re.Equal("http://127.0.0.1:2379", cli.GetHTTPAddress()) - cli = newPDServiceClient("127.0.0.1:2379", "127.0.0.1:2379", &tls.Config{}, nil, false) - re.Equal("https://127.0.0.1:2379", cli.GetHTTPAddress()) - cli = newPDServiceClient("https://127.0.0.1:2379", "127.0.0.1:2379", &tls.Config{}, nil, false) - re.Equal("https://127.0.0.1:2379", cli.GetHTTPAddress()) - cli = newPDServiceClient("http://127.0.0.1:2379", "127.0.0.1:2379", &tls.Config{}, nil, false) - re.Equal("https://127.0.0.1:2379", cli.GetHTTPAddress()) + cli := newPDServiceClient(modifyURLScheme("127.0.0.1:2379", nil), modifyURLScheme("127.0.0.1:2379", nil), nil, false) + re.Equal("http://127.0.0.1:2379", cli.GetURL()) + cli = newPDServiceClient(modifyURLScheme("https://127.0.0.1:2379", nil), modifyURLScheme("127.0.0.1:2379", nil), nil, false) + re.Equal("http://127.0.0.1:2379", cli.GetURL()) + cli = newPDServiceClient(modifyURLScheme("http://127.0.0.1:2379", nil), modifyURLScheme("127.0.0.1:2379", nil), nil, false) + re.Equal("http://127.0.0.1:2379", cli.GetURL()) + cli = newPDServiceClient(modifyURLScheme("127.0.0.1:2379", &tls.Config{}), modifyURLScheme("127.0.0.1:2379", &tls.Config{}), nil, false) + re.Equal("https://127.0.0.1:2379", cli.GetURL()) + cli = newPDServiceClient(modifyURLScheme("https://127.0.0.1:2379", &tls.Config{}), modifyURLScheme("127.0.0.1:2379", &tls.Config{}), nil, false) + re.Equal("https://127.0.0.1:2379", cli.GetURL()) + cli = newPDServiceClient(modifyURLScheme("http://127.0.0.1:2379", &tls.Config{}), modifyURLScheme("127.0.0.1:2379", &tls.Config{}), nil, false) + re.Equal("https://127.0.0.1:2379", cli.GetURL()) +} + +func TestSchemeFunction(t *testing.T) { + re := require.New(t) + tlsCfg := &tls.Config{} + re.Equal("https://127.0.0.1:2379", modifyURLScheme("https://127.0.0.1:2379", tlsCfg)) + re.Equal("https://127.0.0.1:2379", modifyURLScheme("http://127.0.0.1:2379", tlsCfg)) + re.Equal("https://127.0.0.1:2379", modifyURLScheme("127.0.0.1:2379", tlsCfg)) + re.Equal("http://127.0.0.1:2379", modifyURLScheme("https://127.0.0.1:2379", nil)) + re.Equal("http://127.0.0.1:2379", modifyURLScheme("http://127.0.0.1:2379", nil)) + re.Equal("http://127.0.0.1:2379", modifyURLScheme("127.0.0.1:2379", nil)) + + urls := []string{ + "http://127.0.0.1:2379", + "https://127.0.0.1:2379", + } + re.Equal("https://127.0.0.1:2379", pickMatchedURL(urls, tlsCfg)) + urls = []string{ + "http://127.0.0.1:2379", + } + re.Equal("https://127.0.0.1:2379", pickMatchedURL(urls, tlsCfg)) + urls = []string{ + "http://127.0.0.1:2379", + "https://127.0.0.1:2379", + } + re.Equal("http://127.0.0.1:2379", pickMatchedURL(urls, nil)) + urls = []string{ + "https://127.0.0.1:2379", + } + re.Equal("http://127.0.0.1:2379", pickMatchedURL(urls, nil)) } diff --git a/client/resource_manager_client.go b/client/resource_manager_client.go index 433d17ceeee6..872b241cfe72 100644 --- a/client/resource_manager_client.go +++ b/client/resource_manager_client.go @@ -74,7 +74,7 @@ func WithRUStats(op *GetResourceGroupOp) { // resourceManagerClient gets the ResourceManager client of current PD leader. 
func (c *client) resourceManagerClient() (rmpb.ResourceManagerClient, error) { - cc, err := c.pdSvcDiscovery.GetOrCreateGRPCConn(c.GetLeaderAddr()) + cc, err := c.pdSvcDiscovery.GetOrCreateGRPCConn(c.GetLeaderURL()) if err != nil { return nil, err } diff --git a/client/tso_client.go b/client/tso_client.go index 465db1dbd5f9..158d84e043a8 100644 --- a/client/tso_client.go +++ b/client/tso_client.go @@ -74,9 +74,9 @@ type tsoClient struct { tsoStreamBuilderFactory // tsoAllocators defines the mapping {dc-location -> TSO allocator leader URL} tsoAllocators sync.Map // Store as map[string]string - // tsoAllocServingAddrSwitchedCallback will be called when any global/local + // tsoAllocServingURLSwitchedCallback will be called when any global/local // tso allocator leader is switched. - tsoAllocServingAddrSwitchedCallback []func() + tsoAllocServingURLSwitchedCallback []func() // tsoDispatcher is used to dispatch different TSO requests to // the corresponding dc-location TSO channel. @@ -109,9 +109,9 @@ func newTSOClient( } eventSrc := svcDiscovery.(tsoAllocatorEventSource) - eventSrc.SetTSOLocalServAddrsUpdatedCallback(c.updateTSOLocalServAddrs) - eventSrc.SetTSOGlobalServAddrUpdatedCallback(c.updateTSOGlobalServAddr) - c.svcDiscovery.AddServiceAddrsSwitchedCallback(c.scheduleUpdateTSOConnectionCtxs) + eventSrc.SetTSOLocalServURLsUpdatedCallback(c.updateTSOLocalServURLs) + eventSrc.SetTSOGlobalServURLUpdatedCallback(c.updateTSOGlobalServURL) + c.svcDiscovery.AddServiceURLsSwitchedCallback(c.scheduleUpdateTSOConnectionCtxs) return c } @@ -155,8 +155,8 @@ func (c *tsoClient) GetTSOAllocators() *sync.Map { return &c.tsoAllocators } -// GetTSOAllocatorServingAddrByDCLocation returns the tso allocator of the given dcLocation -func (c *tsoClient) GetTSOAllocatorServingAddrByDCLocation(dcLocation string) (string, bool) { +// GetTSOAllocatorServingURLByDCLocation returns the tso allocator of the given dcLocation +func (c *tsoClient) GetTSOAllocatorServingURLByDCLocation(dcLocation string) (string, bool) { url, exist := c.tsoAllocators.Load(dcLocation) if !exist { return "", false @@ -179,13 +179,13 @@ func (c *tsoClient) GetTSOAllocatorClientConnByDCLocation(dcLocation string) (*g return cc.(*grpc.ClientConn), url.(string) } -// AddTSOAllocatorServingAddrSwitchedCallback adds callbacks which will be called +// AddTSOAllocatorServingURLSwitchedCallback adds callbacks which will be called // when any global/local tso allocator service endpoint is switched. -func (c *tsoClient) AddTSOAllocatorServingAddrSwitchedCallback(callbacks ...func()) { - c.tsoAllocServingAddrSwitchedCallback = append(c.tsoAllocServingAddrSwitchedCallback, callbacks...) +func (c *tsoClient) AddTSOAllocatorServingURLSwitchedCallback(callbacks ...func()) { + c.tsoAllocServingURLSwitchedCallback = append(c.tsoAllocServingURLSwitchedCallback, callbacks...) 
} -func (c *tsoClient) updateTSOLocalServAddrs(allocatorMap map[string]string) error { +func (c *tsoClient) updateTSOLocalServURLs(allocatorMap map[string]string) error { if len(allocatorMap) == 0 { return nil } @@ -193,31 +193,31 @@ func (c *tsoClient) updateTSOLocalServAddrs(allocatorMap map[string]string) erro updated := false // Switch to the new one - for dcLocation, addr := range allocatorMap { - if len(addr) == 0 { + for dcLocation, url := range allocatorMap { + if len(url) == 0 { continue } - oldAddr, exist := c.GetTSOAllocatorServingAddrByDCLocation(dcLocation) - if exist && addr == oldAddr { + oldURL, exist := c.GetTSOAllocatorServingURLByDCLocation(dcLocation) + if exist && url == oldURL { continue } updated = true - if _, err := c.svcDiscovery.GetOrCreateGRPCConn(addr); err != nil { - log.Warn("[tso] failed to connect dc tso allocator serving address", + if _, err := c.svcDiscovery.GetOrCreateGRPCConn(url); err != nil { + log.Warn("[tso] failed to connect dc tso allocator serving url", zap.String("dc-location", dcLocation), - zap.String("serving-address", addr), + zap.String("serving-url", url), errs.ZapError(err)) return err } - c.tsoAllocators.Store(dcLocation, addr) - log.Info("[tso] switch dc tso local allocator serving address", + c.tsoAllocators.Store(dcLocation, url) + log.Info("[tso] switch dc tso local allocator serving url", zap.String("dc-location", dcLocation), - zap.String("new-address", addr), - zap.String("old-address", oldAddr)) + zap.String("new-url", url), + zap.String("old-url", oldURL)) } // Garbage collection of the old TSO allocator primaries - c.gcAllocatorServingAddr(allocatorMap) + c.gcAllocatorServingURL(allocatorMap) if updated { c.scheduleCheckTSODispatcher() @@ -226,16 +226,16 @@ func (c *tsoClient) updateTSOLocalServAddrs(allocatorMap map[string]string) erro return nil } -func (c *tsoClient) updateTSOGlobalServAddr(addr string) error { - c.tsoAllocators.Store(globalDCLocation, addr) - log.Info("[tso] switch dc tso global allocator serving address", +func (c *tsoClient) updateTSOGlobalServURL(url string) error { + c.tsoAllocators.Store(globalDCLocation, url) + log.Info("[tso] switch dc tso global allocator serving url", zap.String("dc-location", globalDCLocation), - zap.String("new-address", addr)) + zap.String("new-url", url)) c.scheduleCheckTSODispatcher() return nil } -func (c *tsoClient) gcAllocatorServingAddr(curAllocatorMap map[string]string) { +func (c *tsoClient) gcAllocatorServingURL(curAllocatorMap map[string]string) { // Clean up the old TSO allocators c.tsoAllocators.Range(func(dcLocationKey, _ any) bool { dcLocation := dcLocationKey.(string) @@ -255,24 +255,24 @@ func (c *tsoClient) gcAllocatorServingAddr(curAllocatorMap map[string]string) { // backup service endpoints randomly. Backup service endpoints are followers in a // quorum-based cluster or secondaries in a primary/secondary configured cluster. 
func (c *tsoClient) backupClientConn() (*grpc.ClientConn, string) { - addrs := c.svcDiscovery.GetBackupAddrs() - if len(addrs) < 1 { + urls := c.svcDiscovery.GetBackupURLs() + if len(urls) < 1 { return nil, "" } var ( cc *grpc.ClientConn err error ) - for i := 0; i < len(addrs); i++ { - addr := addrs[rand.Intn(len(addrs))] - if cc, err = c.svcDiscovery.GetOrCreateGRPCConn(addr); err != nil { + for i := 0; i < len(urls); i++ { + url := urls[rand.Intn(len(urls))] + if cc, err = c.svcDiscovery.GetOrCreateGRPCConn(url); err != nil { continue } healthCtx, healthCancel := context.WithTimeout(c.ctx, c.option.timeout) resp, err := healthpb.NewHealthClient(cc).Check(healthCtx, &healthpb.HealthCheckRequest{Service: ""}) healthCancel() if err == nil && resp.GetStatus() == healthpb.HealthCheckResponse_SERVING { - return cc, addr + return cc, url } } return nil, "" diff --git a/client/tso_dispatcher.go b/client/tso_dispatcher.go index 3159a77d1355..defe7de2afd5 100644 --- a/client/tso_dispatcher.go +++ b/client/tso_dispatcher.go @@ -250,12 +250,12 @@ func (c *tsoClient) tsoDispatcherCheckLoop() { func (c *tsoClient) checkAllocator( dispatcherCtx context.Context, forwardCancel context.CancelFunc, - dc, forwardedHostTrim, addrTrim, url string, + dc, forwardedHostTrim, addr, url string, updateAndClear func(newAddr string, connectionCtx *tsoConnectionContext)) { defer func() { // cancel the forward stream forwardCancel() - requestForwarded.WithLabelValues(forwardedHostTrim, addrTrim).Set(0) + requestForwarded.WithLabelValues(forwardedHostTrim, addr).Set(0) }() cc, u := c.GetTSOAllocatorClientConnByDCLocation(dc) var healthCli healthpb.HealthClient @@ -343,12 +343,12 @@ func (c *tsoClient) handleDispatcher( dc string, tbc *tsoBatchController) { var ( - err error - streamAddr string - stream tsoStream - streamCtx context.Context - cancel context.CancelFunc - // addr -> connectionContext + err error + streamURL string + stream tsoStream + streamCtx context.Context + cancel context.CancelFunc + // url -> connectionContext connectionCtxs sync.Map ) defer func() { @@ -448,7 +448,7 @@ tsoBatchLoop: for { connectionCtx := c.chooseStream(&connectionCtxs) if connectionCtx != nil { - streamAddr, stream, streamCtx, cancel = connectionCtx.streamAddr, connectionCtx.stream, connectionCtx.ctx, connectionCtx.cancel + streamURL, stream, streamCtx, cancel = connectionCtx.streamURL, connectionCtx.stream, connectionCtx.ctx, connectionCtx.cancel } // Check stream and retry if necessary. if stream == nil { @@ -475,9 +475,9 @@ tsoBatchLoop: } select { case <-streamCtx.Done(): - log.Info("[tso] tso stream is canceled", zap.String("dc", dc), zap.String("stream-addr", streamAddr)) + log.Info("[tso] tso stream is canceled", zap.String("dc", dc), zap.String("stream-url", streamURL)) // Set `stream` to nil and remove this stream from the `connectionCtxs` due to being canceled. - connectionCtxs.Delete(streamAddr) + connectionCtxs.Delete(streamURL) cancel() stream = nil continue @@ -510,10 +510,10 @@ tsoBatchLoop: c.svcDiscovery.ScheduleCheckMemberChanged() log.Error("[tso] getTS error after processing requests", zap.String("dc-location", dc), - zap.String("stream-addr", streamAddr), + zap.String("stream-url", streamURL), zap.Error(errs.ErrClientGetTSO.FastGenByArgs(err.Error()))) // Set `stream` to nil and remove this stream from the `connectionCtxs` due to error. 
- connectionCtxs.Delete(streamAddr) + connectionCtxs.Delete(streamURL) cancel() stream = nil // Because ScheduleCheckMemberChanged is asynchronous, if the leader changes, we better call `updateMember` ASAP. @@ -557,7 +557,7 @@ func (c *tsoClient) chooseStream(connectionCtxs *sync.Map) (connectionCtx *tsoCo } type tsoConnectionContext struct { - streamAddr string + streamURL string // Current stream to send gRPC requests, pdpb.PD_TsoClient for a leader/follower in the PD cluster, // or tsopb.TSO_TsoClient for a primary/secondary in the TSO cluster stream tsoStream @@ -594,16 +594,16 @@ func (c *tsoClient) tryConnectToTSO( url string cc *grpc.ClientConn ) - updateAndClear := func(newAddr string, connectionCtx *tsoConnectionContext) { - if cc, loaded := connectionCtxs.LoadOrStore(newAddr, connectionCtx); loaded { + updateAndClear := func(newURL string, connectionCtx *tsoConnectionContext) { + if cc, loaded := connectionCtxs.LoadOrStore(newURL, connectionCtx); loaded { // If the previous connection still exists, we should close it first. cc.(*tsoConnectionContext).cancel() - connectionCtxs.Store(newAddr, connectionCtx) + connectionCtxs.Store(newURL, connectionCtx) } - connectionCtxs.Range(func(addr, cc any) bool { - if addr.(string) != newAddr { + connectionCtxs.Range(func(url, cc any) bool { + if url.(string) != newURL { cc.(*tsoConnectionContext).cancel() - connectionCtxs.Delete(addr) + connectionCtxs.Delete(url) } return true }) @@ -650,10 +650,10 @@ func (c *tsoClient) tryConnectToTSO( if networkErrNum == maxRetryTimes { // encounter the network error - backupClientConn, addr := c.backupClientConn() + backupClientConn, backupURL := c.backupClientConn() if backupClientConn != nil { - log.Info("[tso] fall back to use follower to forward tso stream", zap.String("dc", dc), zap.String("addr", addr)) - forwardedHost, ok := c.GetTSOAllocatorServingAddrByDCLocation(dc) + log.Info("[tso] fall back to use follower to forward tso stream", zap.String("dc", dc), zap.String("follower-url", backupURL)) + forwardedHost, ok := c.GetTSOAllocatorServingURLByDCLocation(dc) if !ok { return errors.Errorf("cannot find the allocator leader in %s", dc) } @@ -664,11 +664,11 @@ func (c *tsoClient) tryConnectToTSO( stream, err = c.tsoStreamBuilderFactory.makeBuilder(backupClientConn).build(cctx, cancel, c.option.timeout) if err == nil { forwardedHostTrim := trimHTTPPrefix(forwardedHost) - addrTrim := trimHTTPPrefix(addr) + addr := trimHTTPPrefix(backupURL) // the goroutine is used to check the network and change back to the original stream - go c.checkAllocator(dispatcherCtx, cancel, dc, forwardedHostTrim, addrTrim, url, updateAndClear) - requestForwarded.WithLabelValues(forwardedHostTrim, addrTrim).Set(1) - updateAndClear(addr, &tsoConnectionContext{addr, stream, cctx, cancel}) + go c.checkAllocator(dispatcherCtx, cancel, dc, forwardedHostTrim, addr, url, updateAndClear) + requestForwarded.WithLabelValues(forwardedHostTrim, addr).Set(1) + updateAndClear(backupURL, &tsoConnectionContext{backupURL, stream, cctx, cancel}) return nil } cancel() @@ -707,8 +707,8 @@ func (c *tsoClient) getAllTSOStreamBuilders() map[string]tsoStreamBuilder { // a TSO proxy to reduce the pressure of the main serving service endpoint. 
func (c *tsoClient) tryConnectToTSOWithProxy(dispatcherCtx context.Context, dc string, connectionCtxs *sync.Map) error { tsoStreamBuilders := c.getAllTSOStreamBuilders() - leaderAddr := c.svcDiscovery.GetServingAddr() - forwardedHost, ok := c.GetTSOAllocatorServingAddrByDCLocation(dc) + leaderAddr := c.svcDiscovery.GetServingURL() + forwardedHost, ok := c.GetTSOAllocatorServingURLByDCLocation(dc) if !ok { return errors.Errorf("cannot find the allocator leader in %s", dc) } @@ -779,7 +779,7 @@ func (c *tsoClient) processRequests( // `logical` is the largest ts's logical part here, we need to do the subtracting before we finish each TSO request. firstLogical := tsoutil.AddLogical(logical, -count+1, suffixBits) curTSOInfo := &tsoInfo{ - tsoServer: stream.getServerAddr(), + tsoServer: stream.getServerURL(), reqKeyspaceGroupID: reqKeyspaceGroupID, respKeyspaceGroupID: respKeyspaceGroupID, respReceivedAt: time.Now(), diff --git a/client/tso_service_discovery.go b/client/tso_service_discovery.go index 03638a1161c2..f6c46346d5d3 100644 --- a/client/tso_service_discovery.go +++ b/client/tso_service_discovery.go @@ -58,45 +58,45 @@ var _ tsoAllocatorEventSource = (*tsoServiceDiscovery)(nil) type keyspaceGroupSvcDiscovery struct { sync.RWMutex group *tsopb.KeyspaceGroup - // primaryAddr is the primary serving address - primaryAddr string - // secondaryAddrs are TSO secondary serving addresses - secondaryAddrs []string - // addrs are the primary/secondary serving addresses - addrs []string + // primaryURL is the primary serving URL + primaryURL string + // secondaryURLs are TSO secondary serving URL + secondaryURLs []string + // urls are the primary/secondary serving URL + urls []string } func (k *keyspaceGroupSvcDiscovery) update( keyspaceGroup *tsopb.KeyspaceGroup, - newPrimaryAddr string, - secondaryAddrs, addrs []string, -) (oldPrimaryAddr string, primarySwitched, secondaryChanged bool) { + newPrimaryURL string, + secondaryURLs, urls []string, +) (oldPrimaryURL string, primarySwitched, secondaryChanged bool) { k.Lock() defer k.Unlock() - // If the new primary address is empty, we don't switch the primary address. - oldPrimaryAddr = k.primaryAddr - if len(newPrimaryAddr) > 0 { - primarySwitched = !strings.EqualFold(oldPrimaryAddr, newPrimaryAddr) - k.primaryAddr = newPrimaryAddr + // If the new primary URL is empty, we don't switch the primary URL. + oldPrimaryURL = k.primaryURL + if len(newPrimaryURL) > 0 { + primarySwitched = !strings.EqualFold(oldPrimaryURL, newPrimaryURL) + k.primaryURL = newPrimaryURL } - if !reflect.DeepEqual(k.secondaryAddrs, secondaryAddrs) { - k.secondaryAddrs = secondaryAddrs + if !reflect.DeepEqual(k.secondaryURLs, secondaryURLs) { + k.secondaryURLs = secondaryURLs secondaryChanged = true } k.group = keyspaceGroup - k.addrs = addrs + k.urls = urls return } // tsoServerDiscovery is for discovering the serving endpoints of the TSO servers -// TODO: dynamically update the TSO server addresses in the case of TSO server failover +// TODO: dynamically update the TSO server URLs in the case of TSO server failover // and scale-out/in. 
type tsoServerDiscovery struct { sync.RWMutex - addrs []string + urls []string // used for round-robin load balancing selectIdx int // failureCount counts the consecutive failures for communicating with the tso servers @@ -107,7 +107,7 @@ func (t *tsoServerDiscovery) countFailure() bool { t.Lock() defer t.Unlock() t.failureCount++ - return t.failureCount >= len(t.addrs) + return t.failureCount >= len(t.urls) } func (t *tsoServerDiscovery) resetFailure() { @@ -133,14 +133,14 @@ type tsoServiceDiscovery struct { // keyspaceGroupSD is for discovering the serving endpoints of the keyspace group keyspaceGroupSD *keyspaceGroupSvcDiscovery - // addr -> a gRPC connection + // URL -> a gRPC connection clientConns sync.Map // Store as map[string]*grpc.ClientConn // localAllocPrimariesUpdatedCb will be called when the local tso allocator primary list is updated. - // The input is a map {DC Location -> Leader Addr} - localAllocPrimariesUpdatedCb tsoLocalServAddrsUpdatedFunc + // The input is a map {DC Location -> Leader URL} + localAllocPrimariesUpdatedCb tsoLocalServURLsUpdatedFunc // globalAllocPrimariesUpdatedCb will be called when the local tso allocator primary list is updated. - globalAllocPrimariesUpdatedCb tsoGlobalServAddrUpdatedFunc + globalAllocPrimariesUpdatedCb tsoGlobalServURLUpdatedFunc checkMembershipCh chan struct{} @@ -173,11 +173,11 @@ func newTSOServiceDiscovery( } c.keyspaceID.Store(keyspaceID) c.keyspaceGroupSD = &keyspaceGroupSvcDiscovery{ - primaryAddr: "", - secondaryAddrs: make([]string, 0), - addrs: make([]string, 0), + primaryURL: "", + secondaryURLs: make([]string, 0), + urls: make([]string, 0), } - c.tsoServerDiscovery = &tsoServerDiscovery{addrs: make([]string, 0)} + c.tsoServerDiscovery = &tsoServerDiscovery{urls: make([]string, 0)} // Start with the default keyspace group. The actual keyspace group, to which the keyspace belongs, // will be discovered later. c.defaultDiscoveryKey = fmt.Sprintf(tsoSvcDiscoveryFormat, clusterID, defaultKeySpaceGroupID) @@ -288,44 +288,44 @@ func (c *tsoServiceDiscovery) GetKeyspaceGroupID() uint32 { return c.keyspaceGroupSD.group.Id } -// GetServiceURLs returns the URLs of the tso primary/secondary addresses of this keyspace group. +// GetServiceURLs returns the URLs of the tso primary/secondary URL of this keyspace group. // For testing use. It should only be called when the client is closed. func (c *tsoServiceDiscovery) GetServiceURLs() []string { c.keyspaceGroupSD.RLock() defer c.keyspaceGroupSD.RUnlock() - return c.keyspaceGroupSD.addrs + return c.keyspaceGroupSD.urls } -// GetServingAddr returns the grpc client connection of the serving endpoint +// GetServingURL returns the grpc client connection of the serving endpoint // which is the primary in a primary/secondary configured cluster. func (c *tsoServiceDiscovery) GetServingEndpointClientConn() *grpc.ClientConn { - if cc, ok := c.clientConns.Load(c.getPrimaryAddr()); ok { + if cc, ok := c.clientConns.Load(c.getPrimaryURL()); ok { return cc.(*grpc.ClientConn) } return nil } -// GetClientConns returns the mapping {addr -> a gRPC connection} +// GetClientConns returns the mapping {URL -> a gRPC connection} func (c *tsoServiceDiscovery) GetClientConns() *sync.Map { return &c.clientConns } -// GetServingAddr returns the serving endpoint which is the primary in a +// GetServingURL returns the serving endpoint which is the primary in a // primary/secondary configured cluster. 
-func (c *tsoServiceDiscovery) GetServingAddr() string { - return c.getPrimaryAddr() +func (c *tsoServiceDiscovery) GetServingURL() string { + return c.getPrimaryURL() } -// GetBackupAddrs gets the addresses of the current reachable and healthy +// GetBackupURLs gets the URLs of the current reachable and healthy // backup service endpoints. Backup service endpoints are secondaries in // a primary/secondary configured cluster. -func (c *tsoServiceDiscovery) GetBackupAddrs() []string { - return c.getSecondaryAddrs() +func (c *tsoServiceDiscovery) GetBackupURLs() []string { + return c.getSecondaryURLs() } -// GetOrCreateGRPCConn returns the corresponding grpc client connection of the given addr. -func (c *tsoServiceDiscovery) GetOrCreateGRPCConn(addr string) (*grpc.ClientConn, error) { - return grpcutil.GetOrCreateGRPCConn(c.ctx, &c.clientConns, addr, c.tlsCfg, c.option.gRPCDialOptions...) +// GetOrCreateGRPCConn returns the corresponding grpc client connection of the given URL. +func (c *tsoServiceDiscovery) GetOrCreateGRPCConn(url string) (*grpc.ClientConn, error) { + return grpcutil.GetOrCreateGRPCConn(c.ctx, &c.clientConns, url, c.tlsCfg, c.option.gRPCDialOptions...) } // ScheduleCheckMemberChanged is used to trigger a check to see if there is any change in service endpoints. @@ -347,28 +347,28 @@ func (c *tsoServiceDiscovery) CheckMemberChanged() error { return nil } -// AddServingAddrSwitchedCallback adds callbacks which will be called when the primary in +// AddServingURLSwitchedCallback adds callbacks which will be called when the primary in // a primary/secondary configured cluster is switched. -func (c *tsoServiceDiscovery) AddServingAddrSwitchedCallback(callbacks ...func()) { +func (c *tsoServiceDiscovery) AddServingURLSwitchedCallback(callbacks ...func()) { } -// AddServiceAddrsSwitchedCallback adds callbacks which will be called when any primary/secondary +// AddServiceURLsSwitchedCallback adds callbacks which will be called when any primary/secondary // in a primary/secondary configured cluster is changed. -func (c *tsoServiceDiscovery) AddServiceAddrsSwitchedCallback(callbacks ...func()) { +func (c *tsoServiceDiscovery) AddServiceURLsSwitchedCallback(callbacks ...func()) { } -// SetTSOLocalServAddrsUpdatedCallback adds a callback which will be called when the local tso +// SetTSOLocalServURLsUpdatedCallback adds a callback which will be called when the local tso // allocator leader list is updated. -func (c *tsoServiceDiscovery) SetTSOLocalServAddrsUpdatedCallback(callback tsoLocalServAddrsUpdatedFunc) { +func (c *tsoServiceDiscovery) SetTSOLocalServURLsUpdatedCallback(callback tsoLocalServURLsUpdatedFunc) { c.localAllocPrimariesUpdatedCb = callback } -// SetTSOGlobalServAddrUpdatedCallback adds a callback which will be called when the global tso +// SetTSOGlobalServURLUpdatedCallback adds a callback which will be called when the global tso // allocator leader is updated. -func (c *tsoServiceDiscovery) SetTSOGlobalServAddrUpdatedCallback(callback tsoGlobalServAddrUpdatedFunc) { - addr := c.getPrimaryAddr() - if len(addr) > 0 { - callback(addr) +func (c *tsoServiceDiscovery) SetTSOGlobalServURLUpdatedCallback(callback tsoGlobalServURLUpdatedFunc) { + url := c.getPrimaryURL() + if len(url) > 0 { + callback(url) } c.globalAllocPrimariesUpdatedCb = callback } @@ -383,18 +383,18 @@ func (c *tsoServiceDiscovery) GetAllServiceClients() []ServiceClient { return c.apiSvcDiscovery.GetAllServiceClients() } -// getPrimaryAddr returns the primary address. 
-func (c *tsoServiceDiscovery) getPrimaryAddr() string { +// getPrimaryURL returns the primary URL. +func (c *tsoServiceDiscovery) getPrimaryURL() string { c.keyspaceGroupSD.RLock() defer c.keyspaceGroupSD.RUnlock() - return c.keyspaceGroupSD.primaryAddr + return c.keyspaceGroupSD.primaryURL } -// getSecondaryAddrs returns the secondary addresses. -func (c *tsoServiceDiscovery) getSecondaryAddrs() []string { +// getSecondaryURLs returns the secondary URLs. +func (c *tsoServiceDiscovery) getSecondaryURLs() []string { c.keyspaceGroupSD.RLock() defer c.keyspaceGroupSD.RUnlock() - return c.keyspaceGroupSD.secondaryAddrs + return c.keyspaceGroupSD.secondaryURLs } func (c *tsoServiceDiscovery) afterPrimarySwitched(oldPrimary, newPrimary string) error { @@ -411,9 +411,9 @@ func (c *tsoServiceDiscovery) afterPrimarySwitched(oldPrimary, newPrimary string } func (c *tsoServiceDiscovery) updateMember() error { - // The keyspace membership or the primary serving address of the keyspace group, to which this + // The keyspace membership or the primary serving URL of the keyspace group, to which this // keyspace belongs, might have been changed. We need to query tso servers to get the latest info. - tsoServerAddr, err := c.getTSOServer(c.apiSvcDiscovery) + tsoServerURL, err := c.getTSOServer(c.apiSvcDiscovery) if err != nil { log.Error("[tso] failed to get tso server", errs.ZapError(err)) return err @@ -421,41 +421,41 @@ func (c *tsoServiceDiscovery) updateMember() error { keyspaceID := c.GetKeyspaceID() var keyspaceGroup *tsopb.KeyspaceGroup - if len(tsoServerAddr) > 0 { - keyspaceGroup, err = c.findGroupByKeyspaceID(keyspaceID, tsoServerAddr, updateMemberTimeout) + if len(tsoServerURL) > 0 { + keyspaceGroup, err = c.findGroupByKeyspaceID(keyspaceID, tsoServerURL, updateMemberTimeout) if err != nil { if c.tsoServerDiscovery.countFailure() { log.Error("[tso] failed to find the keyspace group", zap.Uint32("keyspace-id-in-request", keyspaceID), - zap.String("tso-server-addr", tsoServerAddr), + zap.String("tso-server-url", tsoServerURL), errs.ZapError(err)) } return err } c.tsoServerDiscovery.resetFailure() } else { - // There is no error but no tso server address found, which means + // There is no error but no tso server URL found, which means // the server side hasn't been upgraded to the version that // processes and returns GetClusterInfoResponse.TsoUrls. In this case, - // we fall back to the old way of discovering the tso primary addresses + // we fall back to the old way of discovering the tso primary URL // from etcd directly. 
c.printFallbackLogOnce.Do(func() { - log.Warn("[tso] no tso server address found,"+ + log.Warn("[tso] no tso server URL found,"+ " fallback to the legacy path to discover from etcd directly", zap.Uint32("keyspace-id-in-request", keyspaceID), - zap.String("tso-server-addr", tsoServerAddr), + zap.String("tso-server-url", tsoServerURL), zap.String("discovery-key", c.defaultDiscoveryKey)) }) - addrs, err := c.discoverWithLegacyPath() + urls, err := c.discoverWithLegacyPath() if err != nil { return err } - if len(addrs) == 0 { - return errors.New("no tso server address found") + if len(urls) == 0 { + return errors.New("no tso server url found") } - members := make([]*tsopb.KeyspaceGroupMember, 0, len(addrs)) - for _, addr := range addrs { - members = append(members, &tsopb.KeyspaceGroupMember{Address: addr}) + members := make([]*tsopb.KeyspaceGroupMember, 0, len(urls)) + for _, url := range urls { + members = append(members, &tsopb.KeyspaceGroupMember{Address: url}) } members[0].IsPrimary = true keyspaceGroup = &tsopb.KeyspaceGroup{ @@ -472,49 +472,49 @@ func (c *tsoServiceDiscovery) updateMember() error { zap.Uint32("old-keyspace-group-id", oldGroupID)) } - // Initialize the serving addresses from the returned keyspace group info. - primaryAddr := "" - secondaryAddrs := make([]string, 0) - addrs := make([]string, 0, len(keyspaceGroup.Members)) + // Initialize the serving URL from the returned keyspace group info. + primaryURL := "" + secondaryURLs := make([]string, 0) + urls := make([]string, 0, len(keyspaceGroup.Members)) for _, m := range keyspaceGroup.Members { - addrs = append(addrs, m.Address) + urls = append(urls, m.Address) if m.IsPrimary { - primaryAddr = m.Address + primaryURL = m.Address } else { - secondaryAddrs = append(secondaryAddrs, m.Address) + secondaryURLs = append(secondaryURLs, m.Address) } } - // If the primary address is not empty, we need to create a grpc connection to it, and do it + // If the primary URL is not empty, we need to create a grpc connection to it, and do it // out of the critical section of the keyspace group service discovery. 
- if len(primaryAddr) > 0 { - if primarySwitched := !strings.EqualFold(primaryAddr, c.getPrimaryAddr()); primarySwitched { - if _, err := c.GetOrCreateGRPCConn(primaryAddr); err != nil { + if len(primaryURL) > 0 { + if primarySwitched := !strings.EqualFold(primaryURL, c.getPrimaryURL()); primarySwitched { + if _, err := c.GetOrCreateGRPCConn(primaryURL); err != nil { log.Warn("[tso] failed to connect the next primary", zap.Uint32("keyspace-id-in-request", keyspaceID), - zap.String("tso-server-addr", tsoServerAddr), - zap.String("next-primary", primaryAddr), errs.ZapError(err)) + zap.String("tso-server-url", tsoServerURL), + zap.String("next-primary", primaryURL), errs.ZapError(err)) return err } } } oldPrimary, primarySwitched, _ := - c.keyspaceGroupSD.update(keyspaceGroup, primaryAddr, secondaryAddrs, addrs) + c.keyspaceGroupSD.update(keyspaceGroup, primaryURL, secondaryURLs, urls) if primarySwitched { log.Info("[tso] updated keyspace group service discovery info", zap.Uint32("keyspace-id-in-request", keyspaceID), - zap.String("tso-server-addr", tsoServerAddr), + zap.String("tso-server-url", tsoServerURL), zap.String("keyspace-group-service", keyspaceGroup.String())) - if err := c.afterPrimarySwitched(oldPrimary, primaryAddr); err != nil { + if err := c.afterPrimarySwitched(oldPrimary, primaryURL); err != nil { return err } } - // Even if the primary address is empty, we still updated other returned info above, including the - // keyspace group info and the secondary addresses. - if len(primaryAddr) == 0 { - return errors.New("no primary address found") + // Even if the primary URL is empty, we still updated other returned info above, including the + // keyspace group info and the secondary url. + if len(primaryURL) == 0 { + return errors.New("no primary URL found") } return nil @@ -523,7 +523,7 @@ func (c *tsoServiceDiscovery) updateMember() error { // Query the keyspace group info from the tso server by the keyspace ID. The server side will return // the info of the keyspace group to which this keyspace belongs. 
func (c *tsoServiceDiscovery) findGroupByKeyspaceID( - keyspaceID uint32, tsoSrvAddr string, timeout time.Duration, + keyspaceID uint32, tsoSrvURL string, timeout time.Duration, ) (*tsopb.KeyspaceGroup, error) { failpoint.Inject("unexpectedCallOfFindGroupByKeyspaceID", func(val failpoint.Value) { keyspaceToCheck, ok := val.(int) @@ -534,7 +534,7 @@ func (c *tsoServiceDiscovery) findGroupByKeyspaceID( ctx, cancel := context.WithTimeout(c.ctx, timeout) defer cancel() - cc, err := c.GetOrCreateGRPCConn(tsoSrvAddr) + cc, err := c.GetOrCreateGRPCConn(tsoSrvURL) if err != nil { return nil, err } @@ -572,40 +572,40 @@ func (c *tsoServiceDiscovery) getTSOServer(sd ServiceDiscovery) (string, error) defer c.Unlock() var ( - addrs []string - err error + urls []string + err error ) t := c.tsoServerDiscovery - if len(t.addrs) == 0 || t.failureCount == len(t.addrs) { - addrs, err = sd.(*pdServiceDiscovery).discoverMicroservice(tsoService) + if len(t.urls) == 0 || t.failureCount == len(t.urls) { + urls, err = sd.(*pdServiceDiscovery).discoverMicroservice(tsoService) if err != nil { return "", err } failpoint.Inject("serverReturnsNoTSOAddrs", func() { - log.Info("[failpoint] injected error: server returns no tso addrs") - addrs = nil + log.Info("[failpoint] injected error: server returns no tso URLs") + urls = nil }) - if len(addrs) == 0 { - // There is no error but no tso server address found, which means + if len(urls) == 0 { + // There is no error but no tso server url found, which means // the server side hasn't been upgraded to the version that // processes and returns GetClusterInfoResponse.TsoUrls. Return here // and handle the fallback logic outside of this function. return "", nil } - log.Info("update tso server addresses", zap.Strings("addrs", addrs)) + log.Info("update tso server URLs", zap.Strings("urls", urls)) - t.addrs = addrs + t.urls = urls t.selectIdx = 0 t.failureCount = 0 } // Pick a TSO server in a round-robin way. 
- tsoServerAddr := t.addrs[t.selectIdx] + tsoServerURL := t.urls[t.selectIdx] t.selectIdx++ - t.selectIdx %= len(t.addrs) + t.selectIdx %= len(t.urls) - return tsoServerAddr, nil + return tsoServerURL, nil } func (c *tsoServiceDiscovery) discoverWithLegacyPath() ([]string, error) { diff --git a/client/tso_stream.go b/client/tso_stream.go index e3203818938a..acefa19d21cd 100644 --- a/client/tso_stream.go +++ b/client/tso_stream.go @@ -35,13 +35,13 @@ type tsoStreamBuilderFactory interface { type pdTSOStreamBuilderFactory struct{} func (f *pdTSOStreamBuilderFactory) makeBuilder(cc *grpc.ClientConn) tsoStreamBuilder { - return &pdTSOStreamBuilder{client: pdpb.NewPDClient(cc), serverAddr: cc.Target()} + return &pdTSOStreamBuilder{client: pdpb.NewPDClient(cc), serverURL: cc.Target()} } type tsoTSOStreamBuilderFactory struct{} func (f *tsoTSOStreamBuilderFactory) makeBuilder(cc *grpc.ClientConn) tsoStreamBuilder { - return &tsoTSOStreamBuilder{client: tsopb.NewTSOClient(cc), serverAddr: cc.Target()} + return &tsoTSOStreamBuilder{client: tsopb.NewTSOClient(cc), serverURL: cc.Target()} } // TSO Stream Builder @@ -51,8 +51,8 @@ type tsoStreamBuilder interface { } type pdTSOStreamBuilder struct { - serverAddr string - client pdpb.PDClient + serverURL string + client pdpb.PDClient } func (b *pdTSOStreamBuilder) build(ctx context.Context, cancel context.CancelFunc, timeout time.Duration) (tsoStream, error) { @@ -62,14 +62,14 @@ func (b *pdTSOStreamBuilder) build(ctx context.Context, cancel context.CancelFun stream, err := b.client.Tso(ctx) done <- struct{}{} if err == nil { - return &pdTSOStream{stream: stream, serverAddr: b.serverAddr}, nil + return &pdTSOStream{stream: stream, serverURL: b.serverURL}, nil } return nil, err } type tsoTSOStreamBuilder struct { - serverAddr string - client tsopb.TSOClient + serverURL string + client tsopb.TSOClient } func (b *tsoTSOStreamBuilder) build( @@ -81,7 +81,7 @@ func (b *tsoTSOStreamBuilder) build( stream, err := b.client.Tso(ctx) done <- struct{}{} if err == nil { - return &tsoTSOStream{stream: stream, serverAddr: b.serverAddr}, nil + return &tsoTSOStream{stream: stream, serverURL: b.serverURL}, nil } return nil, err } @@ -102,7 +102,7 @@ func checkStreamTimeout(ctx context.Context, cancel context.CancelFunc, done cha // TSO Stream type tsoStream interface { - getServerAddr() string + getServerURL() string // processRequests processes TSO requests in streaming mode to get timestamps processRequests( clusterID uint64, keyspaceID, keyspaceGroupID uint32, dcLocation string, @@ -111,12 +111,12 @@ type tsoStream interface { } type pdTSOStream struct { - serverAddr string - stream pdpb.PD_TsoClient + serverURL string + stream pdpb.PD_TsoClient } -func (s *pdTSOStream) getServerAddr() string { - return s.serverAddr +func (s *pdTSOStream) getServerURL() string { + return s.serverURL } func (s *pdTSOStream) processRequests( @@ -165,12 +165,12 @@ func (s *pdTSOStream) processRequests( } type tsoTSOStream struct { - serverAddr string - stream tsopb.TSO_TsoClient + serverURL string + stream tsopb.TSO_TsoClient } -func (s *tsoTSOStream) getServerAddr() string { - return s.serverAddr +func (s *tsoTSOStream) getServerURL() string { + return s.serverURL } func (s *tsoTSOStream) processRequests( diff --git a/pkg/utils/grpcutil/grpcutil.go b/pkg/utils/grpcutil/grpcutil.go index 3b233956fa82..9b8cc2feb493 100644 --- a/pkg/utils/grpcutil/grpcutil.go +++ b/pkg/utils/grpcutil/grpcutil.go @@ -169,8 +169,8 @@ func GetClientConn(ctx context.Context, addr string, tlsCfg *tls.Config, 
do ...g // BuildForwardContext creates a context with receiver metadata information. // It is used in client side. -func BuildForwardContext(ctx context.Context, addr string) context.Context { - md := metadata.Pairs(ForwardMetadataKey, addr) +func BuildForwardContext(ctx context.Context, url string) context.Context { + md := metadata.Pairs(ForwardMetadataKey, url) return metadata.NewOutgoingContext(ctx, md) } diff --git a/tests/integrations/client/client_test.go b/tests/integrations/client/client_test.go index ca6776aec712..0daa270b2fab 100644 --- a/tests/integrations/client/client_test.go +++ b/tests/integrations/client/client_test.go @@ -109,7 +109,12 @@ func TestClientLeaderChange(t *testing.T) { defer cluster.Destroy() endpoints := runServer(re, cluster) - cli := setupCli(re, ctx, endpoints) + endpointsWithWrongURL := append([]string{}, endpoints...) + // inject wrong http scheme + for i := range endpointsWithWrongURL { + endpointsWithWrongURL[i] = "https://" + strings.TrimPrefix(endpointsWithWrongURL[i], "http://") + } + cli := setupCli(re, ctx, endpointsWithWrongURL) defer cli.Close() innerCli, ok := cli.(interface{ GetServiceDiscovery() pd.ServiceDiscovery }) re.True(ok) @@ -127,14 +132,14 @@ func TestClientLeaderChange(t *testing.T) { re.True(cluster.CheckTSOUnique(ts1)) leader := cluster.GetLeader() - waitLeader(re, innerCli.GetServiceDiscovery(), cluster.GetServer(leader).GetConfig().ClientUrls) + waitLeader(re, innerCli.GetServiceDiscovery(), cluster.GetServer(leader)) err = cluster.GetServer(leader).Stop() re.NoError(err) leader = cluster.WaitLeader() re.NotEmpty(leader) - waitLeader(re, innerCli.GetServiceDiscovery(), cluster.GetServer(leader).GetConfig().ClientUrls) + waitLeader(re, innerCli.GetServiceDiscovery(), cluster.GetServer(leader)) // Check TS won't fall back after leader changed. testutil.Eventually(re, func() bool { @@ -955,10 +960,10 @@ func setupCli(re *require.Assertions, ctx context.Context, endpoints []string, o return cli } -func waitLeader(re *require.Assertions, cli pd.ServiceDiscovery, leader string) { +func waitLeader(re *require.Assertions, cli pd.ServiceDiscovery, leader *tests.TestServer) { testutil.Eventually(re, func() bool { cli.ScheduleCheckMemberChanged() - return cli.GetServingAddr() == leader + return cli.GetServingURL() == leader.GetConfig().ClientUrls && leader.GetAddr() == cli.GetServingURL() }) } @@ -1853,7 +1858,7 @@ func (suite *clientTestSuite) TestMemberUpdateBackOff() { re.True(ok) leader := cluster.GetLeader() - waitLeader(re, innerCli.GetServiceDiscovery(), cluster.GetServer(leader).GetConfig().ClientUrls) + waitLeader(re, innerCli.GetServiceDiscovery(), cluster.GetServer(leader)) memberID := cluster.GetServer(leader).GetLeader().GetMemberId() re.NoError(failpoint.Enable("github.com/tikv/pd/server/leaderLoopCheckAgain", fmt.Sprintf("return(\"%d\")", memberID))) diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index bcd747d85499..f53174d8089c 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -733,7 +733,7 @@ func (suite *httpClientTestSuite) TestRedirectWithMetrics() { re.Equal(float64(2), out.Counter.GetValue()) c.Close() - leader := sd.GetServingAddr() + leader := sd.GetServingURL() httpClient = pd.NewHTTPClientWithRequestChecker(func(req *http.Request) error { // mock leader success. 
if !strings.Contains(leader, req.Host) { diff --git a/tests/integrations/mcs/resourcemanager/resource_manager_test.go b/tests/integrations/mcs/resourcemanager/resource_manager_test.go index 074988d9abad..aa7a264f5e69 100644 --- a/tests/integrations/mcs/resourcemanager/resource_manager_test.go +++ b/tests/integrations/mcs/resourcemanager/resource_manager_test.go @@ -140,7 +140,7 @@ func (suite *resourceManagerClientTestSuite) waitLeader(re *require.Assertions, re.NotNil(innerCli) testutil.Eventually(re, func() bool { innerCli.GetServiceDiscovery().ScheduleCheckMemberChanged() - return innerCli.GetServiceDiscovery().GetServingAddr() == leaderAddr + return innerCli.GetServiceDiscovery().GetServingURL() == leaderAddr }) } From 96590de4ee2d9d6df0d2a212df22a05f7532a78b Mon Sep 17 00:00:00 2001 From: ShuNing Date: Tue, 12 Mar 2024 20:24:39 +0800 Subject: [PATCH 2/5] *: add region heartbeat duration breakdown metrics (#7871) close tikv/pd#7868 *: add region heartbeat duration breakdown metrics - add a tracer during the heartbeat process - statistic the lock wait time in RegionsInfo Signed-off-by: nolouch Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/core/metrics.go | 256 ++++++++++++++++++++++++ pkg/core/region.go | 90 ++++++++- pkg/core/region_test.go | 4 +- pkg/mcs/scheduling/server/cluster.go | 28 ++- pkg/schedule/config/config.go | 25 ++- pkg/syncer/client.go | 3 +- server/cluster/cluster.go | 19 +- server/cluster/cluster_test.go | 103 +++++----- server/cluster/cluster_worker.go | 9 +- server/cluster/scheduling_controller.go | 2 + server/grpc_service.go | 2 - 11 files changed, 458 insertions(+), 83 deletions(-) create mode 100644 pkg/core/metrics.go diff --git a/pkg/core/metrics.go b/pkg/core/metrics.go new file mode 100644 index 000000000000..e6f3535b1d74 --- /dev/null +++ b/pkg/core/metrics.go @@ -0,0 +1,256 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package core + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + "go.uber.org/zap" +) + +var ( + // HeartbeatBreakdownHandleDurationSum is the summary of the processing time of handle the heartbeat stage. + HeartbeatBreakdownHandleDurationSum = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "core", + Name: "region_heartbeat_breakdown_handle_duration_seconds_sum", + Help: "Bucketed histogram of processing time (s) of handle the heartbeat stage.", + }, []string{"name"}) + + // HeartbeatBreakdownHandleCount is the summary of the processing count of handle the heartbeat stage. + HeartbeatBreakdownHandleCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "core", + Name: "region_heartbeat_breakdown_handle_duration_seconds_count", + Help: "Bucketed histogram of processing count of handle the heartbeat stage.", + }, []string{"name"}) + // AcquireRegionsLockWaitDurationSum is the summary of the processing time of waiting for acquiring regions lock. 
+ AcquireRegionsLockWaitDurationSum = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "core", + Name: "acquire_regions_lock_wait_duration_seconds_sum", + Help: "Bucketed histogram of processing time (s) of waiting for acquiring regions lock.", + }, []string{"type"}) + // AcquireRegionsLockWaitCount is the summary of the processing count of waiting for acquiring regions lock. + AcquireRegionsLockWaitCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "core", + Name: "acquire_regions_lock_wait_duration_seconds_count", + Help: "Bucketed histogram of processing count of waiting for acquiring regions lock.", + }, []string{"name"}) + + // lock statistics + waitRegionsLockDurationSum = AcquireRegionsLockWaitDurationSum.WithLabelValues("WaitRegionsLock") + waitRegionsLockCount = AcquireRegionsLockWaitCount.WithLabelValues("WaitRegionsLock") + waitSubRegionsLockDurationSum = AcquireRegionsLockWaitDurationSum.WithLabelValues("WaitSubRegionsLock") + waitSubRegionsLockCount = AcquireRegionsLockWaitCount.WithLabelValues("WaitSubRegionsLock") + + // heartbeat breakdown statistics + preCheckDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("PreCheck") + preCheckCount = HeartbeatBreakdownHandleCount.WithLabelValues("PreCheck") + asyncHotStatsDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("AsyncHotStatsDuration") + asyncHotStatsCount = HeartbeatBreakdownHandleCount.WithLabelValues("AsyncHotStatsDuration") + regionGuideDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("RegionGuide") + regionGuideCount = HeartbeatBreakdownHandleCount.WithLabelValues("RegionGuide") + checkOverlapsDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("SaveCache_CheckOverlaps") + checkOverlapsCount = HeartbeatBreakdownHandleCount.WithLabelValues("SaveCache_CheckOverlaps") + validateRegionDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("SaveCache_InvalidRegion") + validateRegionCount = HeartbeatBreakdownHandleCount.WithLabelValues("SaveCache_InvalidRegion") + setRegionDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("SaveCache_SetRegion") + setRegionCount = HeartbeatBreakdownHandleCount.WithLabelValues("SaveCache_SetRegion") + updateSubTreeDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("SaveCache_UpdateSubTree") + updateSubTreeCount = HeartbeatBreakdownHandleCount.WithLabelValues("SaveCache_UpdateSubTree") + regionCollectDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("CollectRegionStats") + regionCollectCount = HeartbeatBreakdownHandleCount.WithLabelValues("CollectRegionStats") + otherDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("Other") + otherCount = HeartbeatBreakdownHandleCount.WithLabelValues("Other") +) + +func init() { + prometheus.MustRegister(HeartbeatBreakdownHandleDurationSum) + prometheus.MustRegister(HeartbeatBreakdownHandleCount) + prometheus.MustRegister(AcquireRegionsLockWaitDurationSum) + prometheus.MustRegister(AcquireRegionsLockWaitCount) +} + +type saveCacheStats struct { + startTime time.Time + lastCheckTime time.Time + checkOverlapsDuration time.Duration + validateRegionDuration time.Duration + setRegionDuration time.Duration + updateSubTreeDuration time.Duration +} + +// RegionHeartbeatProcessTracer is used to trace the process of handling region heartbeat. 
+type RegionHeartbeatProcessTracer interface { + Begin() + OnPreCheckFinished() + OnAsyncHotStatsFinished() + OnRegionGuideFinished() + OnSaveCacheBegin() + OnSaveCacheFinished() + OnCheckOverlapsFinished() + OnValidateRegionFinished() + OnSetRegionFinished() + OnUpdateSubTreeFinished() + OnCollectRegionStatsFinished() + OnAllStageFinished() + LogFields() []zap.Field +} + +type noopHeartbeatProcessTracer struct{} + +// NewNoopHeartbeatProcessTracer returns a noop heartbeat process tracer. +func NewNoopHeartbeatProcessTracer() RegionHeartbeatProcessTracer { + return &noopHeartbeatProcessTracer{} +} + +func (n *noopHeartbeatProcessTracer) Begin() {} +func (n *noopHeartbeatProcessTracer) OnPreCheckFinished() {} +func (n *noopHeartbeatProcessTracer) OnAsyncHotStatsFinished() {} +func (n *noopHeartbeatProcessTracer) OnRegionGuideFinished() {} +func (n *noopHeartbeatProcessTracer) OnSaveCacheBegin() {} +func (n *noopHeartbeatProcessTracer) OnSaveCacheFinished() {} +func (n *noopHeartbeatProcessTracer) OnCheckOverlapsFinished() {} +func (n *noopHeartbeatProcessTracer) OnValidateRegionFinished() {} +func (n *noopHeartbeatProcessTracer) OnSetRegionFinished() {} +func (n *noopHeartbeatProcessTracer) OnUpdateSubTreeFinished() {} +func (n *noopHeartbeatProcessTracer) OnCollectRegionStatsFinished() {} +func (n *noopHeartbeatProcessTracer) OnAllStageFinished() {} +func (n *noopHeartbeatProcessTracer) LogFields() []zap.Field { + return nil +} + +type regionHeartbeatProcessTracer struct { + startTime time.Time + lastCheckTime time.Time + preCheckDuration time.Duration + asyncHotStatsDuration time.Duration + regionGuideDuration time.Duration + saveCacheStats saveCacheStats + OtherDuration time.Duration +} + +// NewHeartbeatProcessTracer returns a heartbeat process tracer. 
+func NewHeartbeatProcessTracer() RegionHeartbeatProcessTracer { + return ®ionHeartbeatProcessTracer{} +} + +func (h *regionHeartbeatProcessTracer) Begin() { + now := time.Now() + h.startTime = now + h.lastCheckTime = now +} + +func (h *regionHeartbeatProcessTracer) OnPreCheckFinished() { + now := time.Now() + h.preCheckDuration = now.Sub(h.lastCheckTime) + h.lastCheckTime = now + preCheckDurationSum.Add(h.preCheckDuration.Seconds()) + preCheckCount.Inc() +} + +func (h *regionHeartbeatProcessTracer) OnAsyncHotStatsFinished() { + now := time.Now() + h.asyncHotStatsDuration = now.Sub(h.lastCheckTime) + h.lastCheckTime = now + asyncHotStatsDurationSum.Add(h.preCheckDuration.Seconds()) + asyncHotStatsCount.Inc() +} + +func (h *regionHeartbeatProcessTracer) OnRegionGuideFinished() { + now := time.Now() + h.regionGuideDuration = now.Sub(h.lastCheckTime) + h.lastCheckTime = now + regionGuideDurationSum.Add(h.regionGuideDuration.Seconds()) + regionGuideCount.Inc() +} + +func (h *regionHeartbeatProcessTracer) OnSaveCacheBegin() { + now := time.Now() + h.saveCacheStats.startTime = now + h.saveCacheStats.lastCheckTime = now + h.lastCheckTime = now +} + +func (h *regionHeartbeatProcessTracer) OnSaveCacheFinished() { + // update the outer checkpoint time + h.lastCheckTime = time.Now() +} + +func (h *regionHeartbeatProcessTracer) OnCollectRegionStatsFinished() { + now := time.Now() + regionCollectDurationSum.Add(now.Sub(h.lastCheckTime).Seconds()) + regionCollectCount.Inc() + h.lastCheckTime = now +} + +func (h *regionHeartbeatProcessTracer) OnCheckOverlapsFinished() { + now := time.Now() + h.saveCacheStats.checkOverlapsDuration = now.Sub(h.lastCheckTime) + h.saveCacheStats.lastCheckTime = now + checkOverlapsDurationSum.Add(h.saveCacheStats.checkOverlapsDuration.Seconds()) + checkOverlapsCount.Inc() +} + +func (h *regionHeartbeatProcessTracer) OnValidateRegionFinished() { + now := time.Now() + h.saveCacheStats.validateRegionDuration = now.Sub(h.saveCacheStats.lastCheckTime) + h.saveCacheStats.lastCheckTime = now + validateRegionDurationSum.Add(h.saveCacheStats.validateRegionDuration.Seconds()) + validateRegionCount.Inc() +} + +func (h *regionHeartbeatProcessTracer) OnSetRegionFinished() { + now := time.Now() + h.saveCacheStats.setRegionDuration = now.Sub(h.saveCacheStats.lastCheckTime) + h.saveCacheStats.lastCheckTime = now + setRegionDurationSum.Add(h.saveCacheStats.setRegionDuration.Seconds()) + setRegionCount.Inc() +} + +func (h *regionHeartbeatProcessTracer) OnUpdateSubTreeFinished() { + now := time.Now() + h.saveCacheStats.updateSubTreeDuration = now.Sub(h.saveCacheStats.lastCheckTime) + h.saveCacheStats.lastCheckTime = now + updateSubTreeDurationSum.Add(h.saveCacheStats.updateSubTreeDuration.Seconds()) + updateSubTreeCount.Inc() +} + +func (h *regionHeartbeatProcessTracer) OnAllStageFinished() { + now := time.Now() + h.OtherDuration = now.Sub(h.lastCheckTime) + otherDurationSum.Add(h.OtherDuration.Seconds()) + otherCount.Inc() +} + +func (h *regionHeartbeatProcessTracer) LogFields() []zap.Field { + return []zap.Field{ + zap.Duration("pre-check-duration", h.preCheckDuration), + zap.Duration("async-hot-stats-duration", h.asyncHotStatsDuration), + zap.Duration("region-guide-duration", h.regionGuideDuration), + zap.Duration("check-overlaps-duration", h.saveCacheStats.checkOverlapsDuration), + zap.Duration("validate-region-duration", h.saveCacheStats.validateRegionDuration), + zap.Duration("set-region-duration", h.saveCacheStats.setRegionDuration), + zap.Duration("update-sub-tree-duration", 
h.saveCacheStats.updateSubTreeDuration), + zap.Duration("other-duration", h.OtherDuration), + } +} diff --git a/pkg/core/region.go b/pkg/core/region.go index ee03b5143ce4..f7a4ef5f0fd5 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -824,12 +824,49 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { } } +// RWLockStats is a read-write lock with statistics. +type RWLockStats struct { + syncutil.RWMutex + totalWaitTime int64 + lockCount int64 + lastLockCount int64 + lastTotalWaitTime int64 +} + +// Lock locks the lock and records the waiting time. +func (l *RWLockStats) Lock() { + startTime := time.Now() + l.RWMutex.Lock() + elapsed := time.Since(startTime).Nanoseconds() + atomic.AddInt64(&l.totalWaitTime, elapsed) + atomic.AddInt64(&l.lockCount, 1) +} + +// Unlock unlocks the lock. +func (l *RWLockStats) Unlock() { + l.RWMutex.Unlock() +} + +// RLock locks the lock for reading and records the waiting time. +func (l *RWLockStats) RLock() { + startTime := time.Now() + l.RWMutex.RLock() + elapsed := time.Since(startTime).Nanoseconds() + atomic.AddInt64(&l.totalWaitTime, elapsed) + atomic.AddInt64(&l.lockCount, 1) +} + +// RUnlock unlocks the lock for reading. +func (l *RWLockStats) RUnlock() { + l.RWMutex.RUnlock() +} + // RegionsInfo for export type RegionsInfo struct { - t syncutil.RWMutex + t RWLockStats tree *regionTree regions map[uint64]*regionItem // regionID -> regionInfo - st syncutil.RWMutex + st RWLockStats subRegions map[uint64]*regionItem // regionID -> regionInfo leaders map[uint64]*regionTree // storeID -> sub regionTree followers map[uint64]*regionTree // storeID -> sub regionTree @@ -896,33 +933,38 @@ func (r *RegionsInfo) PutRegion(region *RegionInfo) []*RegionInfo { } // PreCheckPutRegion checks if the region is valid to put. -func (r *RegionsInfo) PreCheckPutRegion(region *RegionInfo) (*RegionInfo, []*regionItem, error) { - origin, overlaps := r.GetRelevantRegions(region) +func (r *RegionsInfo) PreCheckPutRegion(region *RegionInfo, trace RegionHeartbeatProcessTracer) (*RegionInfo, []*regionItem, error) { + origin, overlaps := r.GetRelevantRegions(region, trace) err := check(region, origin, overlaps) return origin, overlaps, err } // AtomicCheckAndPutRegion checks if the region is valid to put, if valid then put. -func (r *RegionsInfo) AtomicCheckAndPutRegion(region *RegionInfo) ([]*RegionInfo, error) { +func (r *RegionsInfo) AtomicCheckAndPutRegion(region *RegionInfo, trace RegionHeartbeatProcessTracer) ([]*RegionInfo, error) { r.t.Lock() var ols []*regionItem origin := r.getRegionLocked(region.GetID()) if origin == nil || !bytes.Equal(origin.GetStartKey(), region.GetStartKey()) || !bytes.Equal(origin.GetEndKey(), region.GetEndKey()) { ols = r.tree.overlaps(®ionItem{RegionInfo: region}) } + trace.OnCheckOverlapsFinished() err := check(region, origin, ols) if err != nil { r.t.Unlock() + trace.OnValidateRegionFinished() return nil, err } + trace.OnValidateRegionFinished() origin, overlaps, rangeChanged := r.setRegionLocked(region, true, ols...) r.t.Unlock() + trace.OnSetRegionFinished() r.UpdateSubTree(region, origin, overlaps, rangeChanged) + trace.OnUpdateSubTreeFinished() return overlaps, nil } // GetRelevantRegions returns the relevant regions for a given region. 
-func (r *RegionsInfo) GetRelevantRegions(region *RegionInfo) (origin *RegionInfo, overlaps []*regionItem) { +func (r *RegionsInfo) GetRelevantRegions(region *RegionInfo, trace RegionHeartbeatProcessTracer) (origin *RegionInfo, overlaps []*regionItem) { r.t.RLock() defer r.t.RUnlock() origin = r.getRegionLocked(region.GetID()) @@ -1653,6 +1695,42 @@ func (r *RegionsInfo) GetRegionSizeByRange(startKey, endKey []byte) int64 { return size } +// metrics default poll interval +const defaultPollInterval = 15 * time.Second + +// CollectWaitLockMetrics collects the metrics of waiting time for lock +func (r *RegionsInfo) CollectWaitLockMetrics() { + regionsLockTotalWaitTime := atomic.LoadInt64(&r.t.totalWaitTime) + regionsLockCount := atomic.LoadInt64(&r.t.lockCount) + + lastRegionsLockTotalWaitTime := atomic.LoadInt64(&r.t.lastTotalWaitTime) + lastsRegionsLockCount := atomic.LoadInt64(&r.t.lastLockCount) + + subRegionsLockTotalWaitTime := atomic.LoadInt64(&r.st.totalWaitTime) + subRegionsLockCount := atomic.LoadInt64(&r.st.lockCount) + + lastSubRegionsLockTotalWaitTime := atomic.LoadInt64(&r.st.lastTotalWaitTime) + lastSubRegionsLockCount := atomic.LoadInt64(&r.st.lastLockCount) + + // update last metrics + atomic.StoreInt64(&r.t.lastTotalWaitTime, regionsLockTotalWaitTime) + atomic.StoreInt64(&r.t.lastLockCount, regionsLockCount) + atomic.StoreInt64(&r.st.lastTotalWaitTime, subRegionsLockTotalWaitTime) + atomic.StoreInt64(&r.st.lastLockCount, subRegionsLockCount) + + // skip invalid situation like initial status + if lastRegionsLockTotalWaitTime == 0 || lastsRegionsLockCount == 0 || lastSubRegionsLockTotalWaitTime == 0 || lastSubRegionsLockCount == 0 || + regionsLockTotalWaitTime-lastRegionsLockTotalWaitTime < 0 || regionsLockTotalWaitTime-lastRegionsLockTotalWaitTime > int64(defaultPollInterval) || + subRegionsLockTotalWaitTime-lastSubRegionsLockTotalWaitTime < 0 || subRegionsLockTotalWaitTime-lastSubRegionsLockTotalWaitTime > int64(defaultPollInterval) { + return + } + + waitRegionsLockDurationSum.Add(time.Duration(regionsLockTotalWaitTime - lastRegionsLockTotalWaitTime).Seconds()) + waitRegionsLockCount.Add(float64(regionsLockCount - lastsRegionsLockCount)) + waitSubRegionsLockDurationSum.Add(time.Duration(subRegionsLockTotalWaitTime - lastSubRegionsLockTotalWaitTime).Seconds()) + waitSubRegionsLockCount.Add(float64(subRegionsLockCount - lastSubRegionsLockCount)) +} + // GetAdjacentRegions returns region's info that is adjacent with specific region func (r *RegionsInfo) GetAdjacentRegions(region *RegionInfo) (*RegionInfo, *RegionInfo) { r.t.RLock() diff --git a/pkg/core/region_test.go b/pkg/core/region_test.go index 1e3b6073dda8..3c6536a6a773 100644 --- a/pkg/core/region_test.go +++ b/pkg/core/region_test.go @@ -459,9 +459,9 @@ func TestSetRegionConcurrence(t *testing.T) { regions := NewRegionsInfo() region := NewTestRegionInfo(1, 1, []byte("a"), []byte("b")) go func() { - regions.AtomicCheckAndPutRegion(region) + regions.AtomicCheckAndPutRegion(region, NewNoopHeartbeatProcessTracer()) }() - regions.AtomicCheckAndPutRegion(region) + regions.AtomicCheckAndPutRegion(region, NewNoopHeartbeatProcessTracer()) re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/core/UpdateSubTree")) } diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 9e75057621ec..1b915b6874d2 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -500,6 +500,8 @@ func (c *Cluster) collectMetrics() { c.labelStats.Collect() // collect hot cache 
metrics c.hotStat.CollectMetrics() + // collect the lock metrics + c.RegionsInfo.CollectWaitLockMetrics() } func (c *Cluster) resetMetrics() { @@ -536,28 +538,36 @@ func (c *Cluster) IsBackgroundJobsRunning() bool { // HandleRegionHeartbeat processes RegionInfo reports from client. func (c *Cluster) HandleRegionHeartbeat(region *core.RegionInfo) error { - if err := c.processRegionHeartbeat(region); err != nil { + tracer := core.NewNoopHeartbeatProcessTracer() + if c.persistConfig.GetScheduleConfig().EnableHeartbeatBreakdownMetrics { + tracer = core.NewHeartbeatProcessTracer() + } + tracer.Begin() + if err := c.processRegionHeartbeat(region, tracer); err != nil { + tracer.OnAllStageFinished() return err } - + tracer.OnAllStageFinished() c.coordinator.GetOperatorController().Dispatch(region, operator.DispatchFromHeartBeat, c.coordinator.RecordOpStepWithTTL) return nil } // processRegionHeartbeat updates the region information. -func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo) error { - origin, _, err := c.PreCheckPutRegion(region) +func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo, tracer core.RegionHeartbeatProcessTracer) error { + origin, _, err := c.PreCheckPutRegion(region, tracer) + tracer.OnPreCheckFinished() if err != nil { return err } region.Inherit(origin, c.GetStoreConfig().IsEnableRegionBucket()) cluster.HandleStatsAsync(c, region) - + tracer.OnAsyncHotStatsFinished() hasRegionStats := c.regionStats != nil // Save to storage if meta is updated, except for flashback. // Save to cache if meta or leader is updated, or contains any down/pending peer. _, saveCache, _ := core.GenerateRegionGuideFunc(true)(region, origin) + if !saveCache { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. @@ -566,21 +576,23 @@ func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo) error { } return nil } - + tracer.OnSaveCacheBegin() var overlaps []*core.RegionInfo if saveCache { // To prevent a concurrent heartbeat of another region from overriding the up-to-date region info by a stale one, // check its validation again here. // // However, it can't solve the race condition of concurrent heartbeats from the same region. 
- if overlaps, err = c.AtomicCheckAndPutRegion(region); err != nil { + if overlaps, err = c.AtomicCheckAndPutRegion(region, tracer); err != nil { + tracer.OnSaveCacheFinished() return err } cluster.HandleOverlaps(c, overlaps) } - + tracer.OnSaveCacheFinished() cluster.Collect(c, region, c.GetRegionStores(region), hasRegionStats) + tracer.OnCollectRegionStatsFinished() return nil } diff --git a/pkg/schedule/config/config.go b/pkg/schedule/config/config.go index 528f3a611c9b..56038ddcb098 100644 --- a/pkg/schedule/config/config.go +++ b/pkg/schedule/config/config.go @@ -49,14 +49,15 @@ const ( defaultSlowStoreEvictingAffectedStoreRatioThreshold = 0.3 defaultMaxMovableHotPeerSize = int64(512) - defaultEnableJointConsensus = true - defaultEnableTiKVSplitRegion = true - defaultEnableCrossTableMerge = true - defaultEnableDiagnostic = true - defaultStrictlyMatchLabel = false - defaultEnablePlacementRules = true - defaultEnableWitness = false - defaultHaltScheduling = false + defaultEnableJointConsensus = true + defaultEnableTiKVSplitRegion = true + defaultEnableHeartbeatBreakdownMetrics = true + defaultEnableCrossTableMerge = true + defaultEnableDiagnostic = true + defaultStrictlyMatchLabel = false + defaultEnablePlacementRules = true + defaultEnableWitness = false + defaultHaltScheduling = false defaultRegionScoreFormulaVersion = "v2" defaultLeaderSchedulePolicy = "count" @@ -263,6 +264,9 @@ type ScheduleConfig struct { // on ebs-based BR we need to disable it with TTL EnableTiKVSplitRegion bool `toml:"enable-tikv-split-region" json:"enable-tikv-split-region,string"` + // EnableHeartbeatBreakdownMetrics is the option to enable heartbeat stats metrics. + EnableHeartbeatBreakdownMetrics bool `toml:"enable-heartbeat-breakdown-metrics" json:"enable-heartbeat-breakdown-metrics,string"` + // Schedulers support for loading customized schedulers Schedulers SchedulerConfigs `toml:"schedulers" json:"schedulers-v2"` // json v2 is for the sake of compatible upgrade @@ -373,6 +377,11 @@ func (c *ScheduleConfig) Adjust(meta *configutil.ConfigMetaData, reloading bool) if !meta.IsDefined("enable-tikv-split-region") { c.EnableTiKVSplitRegion = defaultEnableTiKVSplitRegion } + + if !meta.IsDefined("enable-heartbeat-breakdown-metrics") { + c.EnableHeartbeatBreakdownMetrics = defaultEnableHeartbeatBreakdownMetrics + } + if !meta.IsDefined("enable-cross-table-merge") { c.EnableCrossTableMerge = defaultEnableCrossTableMerge } diff --git a/pkg/syncer/client.go b/pkg/syncer/client.go index cd9a87aaf546..ffbd71d2f1ea 100644 --- a/pkg/syncer/client.go +++ b/pkg/syncer/client.go @@ -200,7 +200,8 @@ func (s *RegionSyncer) StartSyncWithLeader(addr string) { region = core.NewRegionInfo(r, regionLeader, core.SetSource(core.Sync)) } - origin, _, err := bc.PreCheckPutRegion(region) + tracer := core.NewNoopHeartbeatProcessTracer() + origin, _, err := bc.PreCheckPutRegion(region, tracer) if err != nil { log.Debug("region is stale", zap.Stringer("origin", origin.GetMeta()), errs.ZapError(err)) continue diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index c69e487c3db5..354e12020e3b 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -990,21 +990,24 @@ func (c *RaftCluster) processReportBuckets(buckets *metapb.Buckets) error { var regionGuide = core.GenerateRegionGuideFunc(true) // processRegionHeartbeat updates the region information. 
-func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { - origin, _, err := c.core.PreCheckPutRegion(region) +func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo, tracer core.RegionHeartbeatProcessTracer) error { + origin, _, err := c.core.PreCheckPutRegion(region, tracer) + tracer.OnPreCheckFinished() if err != nil { return err } + region.Inherit(origin, c.GetStoreConfig().IsEnableRegionBucket()) if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { cluster.HandleStatsAsync(c, region) } - + tracer.OnAsyncHotStatsFinished() hasRegionStats := c.regionStats != nil // Save to storage if meta is updated, except for flashback. // Save to cache if meta or leader is updated, or contains any down/pending peer. saveKV, saveCache, needSync := regionGuide(region, origin) + tracer.OnRegionGuideFinished() if !saveKV && !saveCache { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. @@ -1016,11 +1019,10 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { } return nil } - failpoint.Inject("concurrentRegionHeartbeat", func() { time.Sleep(500 * time.Millisecond) }) - + tracer.OnSaveCacheBegin() var overlaps []*core.RegionInfo if saveCache { failpoint.Inject("decEpoch", func() { @@ -1030,7 +1032,8 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { // check its validation again here. // // However, it can't solve the race condition of concurrent heartbeats from the same region. - if overlaps, err = c.core.AtomicCheckAndPutRegion(region); err != nil { + if overlaps, err = c.core.AtomicCheckAndPutRegion(region, tracer); err != nil { + tracer.OnSaveCacheFinished() return err } if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { @@ -1039,11 +1042,12 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { regionUpdateCacheEventCounter.Inc() } + tracer.OnSaveCacheFinished() // TODO: Due to the accuracy requirements of the API "/regions/check/xxx", // region stats needs to be collected in API mode. // We need to think of a better way to reduce this part of the cost in the future. cluster.Collect(c, region, c.GetRegionStores(region), hasRegionStats) - + tracer.OnCollectRegionStatsFinished() if c.storage != nil { // If there are concurrent heartbeats from the same region, the last write will win even if // writes to storage in the critical area. So don't use mutex to protect it. 
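// Illustrative sketch, not part of this patch: the order in which a caller is expected to
// drive a core.RegionHeartbeatProcessTracer across one heartbeat, condensed from the
// instrumentation in this change. traceOneHeartbeat is a hypothetical helper and assumes
// imports of "github.com/tikv/pd/pkg/core" and "go.uber.org/zap".
func traceOneHeartbeat(enableBreakdown bool) []zap.Field {
	tracer := core.NewNoopHeartbeatProcessTracer()
	if enableBreakdown {
		tracer = core.NewHeartbeatProcessTracer()
	}
	tracer.Begin()
	// ... pre-check, async hot-stats and region-guide work happen here ...
	tracer.OnPreCheckFinished()
	tracer.OnAsyncHotStatsFinished()
	tracer.OnRegionGuideFinished()
	tracer.OnSaveCacheBegin()
	// ... overlap check, validation, set-region and sub-tree update happen here ...
	tracer.OnCheckOverlapsFinished()
	tracer.OnValidateRegionFinished()
	tracer.OnSetRegionFinished()
	tracer.OnUpdateSubTreeFinished()
	tracer.OnSaveCacheFinished()
	// ... region stats collection happens here ...
	tracer.OnCollectRegionStatsFinished()
	tracer.OnAllStageFinished()
	return tracer.LogFields() // per-stage durations, suitable for logging
}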
@@ -1074,7 +1078,6 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { default: } } - return nil } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index 11c2c8c78364..dc0f79667614 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -50,6 +50,7 @@ import ( "github.com/tikv/pd/pkg/schedule/operator" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/schedule/schedulers" + "github.com/tikv/pd/pkg/slice" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/statistics/utils" "github.com/tikv/pd/pkg/storage" @@ -630,7 +631,7 @@ func TestRegionHeartbeatHotStat(t *testing.T) { region := core.NewRegionInfo(regionMeta, leader, core.WithInterval(&pdpb.TimeInterval{StartTimestamp: 0, EndTimestamp: utils.RegionHeartBeatReportInterval}), core.SetWrittenBytes(30000*10), core.SetWrittenKeys(300000*10)) - err = cluster.processRegionHeartbeat(region) + err = cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer()) re.NoError(err) // wait HotStat to update items time.Sleep(time.Second) @@ -643,7 +644,7 @@ func TestRegionHeartbeatHotStat(t *testing.T) { StoreId: 4, } region = region.Clone(core.WithRemoveStorePeer(2), core.WithAddPeer(newPeer)) - err = cluster.processRegionHeartbeat(region) + err = cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer()) re.NoError(err) // wait HotStat to update items time.Sleep(time.Second) @@ -680,8 +681,8 @@ func TestBucketHeartbeat(t *testing.T) { re.NoError(cluster.putStoreLocked(store)) } - re.NoError(cluster.processRegionHeartbeat(regions[0])) - re.NoError(cluster.processRegionHeartbeat(regions[1])) + re.NoError(cluster.processRegionHeartbeat(regions[0], core.NewNoopHeartbeatProcessTracer())) + re.NoError(cluster.processRegionHeartbeat(regions[1], core.NewNoopHeartbeatProcessTracer())) re.Nil(cluster.GetRegion(uint64(1)).GetBuckets()) re.NoError(cluster.processReportBuckets(buckets)) re.Equal(buckets, cluster.GetRegion(uint64(1)).GetBuckets()) @@ -700,13 +701,13 @@ func TestBucketHeartbeat(t *testing.T) { // case5: region update should inherit buckets. newRegion := regions[1].Clone(core.WithIncConfVer(), core.SetBuckets(nil)) opt.SetRegionBucketEnabled(true) - re.NoError(cluster.processRegionHeartbeat(newRegion)) + re.NoError(cluster.processRegionHeartbeat(newRegion, core.NewNoopHeartbeatProcessTracer())) re.Len(cluster.GetRegion(uint64(1)).GetBuckets().GetKeys(), 2) // case6: disable region bucket in opt.SetRegionBucketEnabled(false) newRegion2 := regions[1].Clone(core.WithIncConfVer(), core.SetBuckets(nil)) - re.NoError(cluster.processRegionHeartbeat(newRegion2)) + re.NoError(cluster.processRegionHeartbeat(newRegion2, core.NewNoopHeartbeatProcessTracer())) re.Nil(cluster.GetRegion(uint64(1)).GetBuckets()) re.Empty(cluster.GetRegion(uint64(1)).GetBuckets().GetKeys()) } @@ -732,25 +733,25 @@ func TestRegionHeartbeat(t *testing.T) { for i, region := range regions { // region does not exist. - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is the same, not updated. 
- re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) origin := region // region is updated. region = origin.Clone(core.WithIncVersion()) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is stale (Version). stale := origin.Clone(core.WithIncConfVer()) - re.Error(cluster.processRegionHeartbeat(stale)) + re.Error(cluster.processRegionHeartbeat(stale, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) @@ -760,13 +761,13 @@ func TestRegionHeartbeat(t *testing.T) { core.WithIncConfVer(), ) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is stale (ConfVer). stale = origin.Clone(core.WithIncConfVer()) - re.Error(cluster.processRegionHeartbeat(stale)) + re.Error(cluster.processRegionHeartbeat(stale, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) @@ -778,38 +779,38 @@ func TestRegionHeartbeat(t *testing.T) { }, })) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Add a pending peer. region = region.Clone(core.WithPendingPeers([]*metapb.Peer{region.GetPeers()[rand.Intn(len(region.GetPeers()))]})) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Clear down peers. region = region.Clone(core.WithDownPeers(nil)) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Clear pending peers. region = region.Clone(core.WithPendingPeers(nil)) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Remove peers. origin = region region = origin.Clone(core.SetPeers(region.GetPeers()[:1])) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Add peers. 
region = origin regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) @@ -819,47 +820,47 @@ func TestRegionHeartbeat(t *testing.T) { core.WithIncConfVer(), ) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Change leader. region = region.Clone(core.WithLeader(region.GetPeers()[1])) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Change ApproximateSize. region = region.Clone(core.SetApproximateSize(144)) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Change ApproximateKeys. region = region.Clone(core.SetApproximateKeys(144000)) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Change bytes written. region = region.Clone(core.SetWrittenBytes(24000)) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Change bytes read. region = region.Clone(core.SetReadBytes(1080000)) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) // Flashback region = region.Clone(core.WithFlashback(true, 1)) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) region = region.Clone(core.WithFlashback(false, 0)) regions[i] = region - re.NoError(cluster.processRegionHeartbeat(region)) + re.NoError(cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer())) checkRegions(re, cluster.core, regions[:i+1]) } @@ -915,7 +916,8 @@ func TestRegionHeartbeat(t *testing.T) { core.WithNewRegionID(10000), core.WithDecVersion(), ) - re.Error(cluster.processRegionHeartbeat(overlapRegion)) + tracer := core.NewHeartbeatProcessTracer() + re.Error(cluster.processRegionHeartbeat(overlapRegion, tracer)) region := &metapb.Region{} ok, err := storage.LoadRegion(regions[n-1].GetID(), region) re.True(ok) @@ -939,7 +941,14 @@ func TestRegionHeartbeat(t *testing.T) { core.WithStartKey(regions[n-2].GetStartKey()), core.WithNewRegionID(regions[n-1].GetID()+1), ) - re.NoError(cluster.processRegionHeartbeat(overlapRegion)) + tracer = core.NewHeartbeatProcessTracer() + tracer.Begin() + re.NoError(cluster.processRegionHeartbeat(overlapRegion, tracer)) + tracer.OnAllStageFinished() + re.Condition(func() bool { + fileds := tracer.LogFields() + return slice.AllOf(fileds, func(i int) bool { return fileds[i].Integer > 0 }) + }, "should have stats") region = &metapb.Region{} ok, err = storage.LoadRegion(regions[n-1].GetID(), 
region) re.False(ok) @@ -968,7 +977,7 @@ func TestRegionFlowChanged(t *testing.T) { regions := []*core.RegionInfo{core.NewTestRegionInfo(1, 1, []byte{}, []byte{})} processRegions := func(regions []*core.RegionInfo) { for _, r := range regions { - cluster.processRegionHeartbeat(r) + cluster.processRegionHeartbeat(r, core.NewNoopHeartbeatProcessTracer()) } } regions = core.SplitRegions(regions) @@ -1004,7 +1013,7 @@ func TestRegionSizeChanged(t *testing.T) { core.SetApproximateKeys(curMaxMergeKeys-1), core.SetSource(core.Heartbeat), ) - cluster.processRegionHeartbeat(region) + cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer()) regionID := region.GetID() re.True(cluster.regionStats.IsRegionStatsType(regionID, statistics.UndersizedRegion)) // Test ApproximateSize and ApproximateKeys change. @@ -1014,16 +1023,16 @@ func TestRegionSizeChanged(t *testing.T) { core.SetApproximateKeys(curMaxMergeKeys+1), core.SetSource(core.Heartbeat), ) - cluster.processRegionHeartbeat(region) + cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer()) re.False(cluster.regionStats.IsRegionStatsType(regionID, statistics.UndersizedRegion)) // Test MaxMergeRegionSize and MaxMergeRegionKeys change. cluster.opt.SetMaxMergeRegionSize(uint64(curMaxMergeSize + 2)) cluster.opt.SetMaxMergeRegionKeys(uint64(curMaxMergeKeys + 2)) - cluster.processRegionHeartbeat(region) + cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer()) re.True(cluster.regionStats.IsRegionStatsType(regionID, statistics.UndersizedRegion)) cluster.opt.SetMaxMergeRegionSize(uint64(curMaxMergeSize)) cluster.opt.SetMaxMergeRegionKeys(uint64(curMaxMergeKeys)) - cluster.processRegionHeartbeat(region) + cluster.processRegionHeartbeat(region, core.NewNoopHeartbeatProcessTracer()) re.False(cluster.regionStats.IsRegionStatsType(regionID, statistics.UndersizedRegion)) } @@ -1086,11 +1095,11 @@ func TestConcurrentRegionHeartbeat(t *testing.T) { re.NoError(failpoint.Enable("github.com/tikv/pd/server/cluster/concurrentRegionHeartbeat", "return(true)")) go func() { defer wg.Done() - cluster.processRegionHeartbeat(source) + cluster.processRegionHeartbeat(source, core.NewNoopHeartbeatProcessTracer()) }() time.Sleep(100 * time.Millisecond) re.NoError(failpoint.Disable("github.com/tikv/pd/server/cluster/concurrentRegionHeartbeat")) - re.NoError(cluster.processRegionHeartbeat(target)) + re.NoError(cluster.processRegionHeartbeat(target, core.NewNoopHeartbeatProcessTracer())) wg.Wait() checkRegion(re, cluster.GetRegionByKey([]byte{}), target) } @@ -1152,7 +1161,7 @@ func TestRegionLabelIsolationLevel(t *testing.T) { func heartbeatRegions(re *require.Assertions, cluster *RaftCluster, regions []*core.RegionInfo) { // Heartbeat and check region one by one. for _, r := range regions { - re.NoError(cluster.processRegionHeartbeat(r)) + re.NoError(cluster.processRegionHeartbeat(r, core.NewNoopHeartbeatProcessTracer())) checkRegion(re, cluster.GetRegion(r.GetID()), r) checkRegion(re, cluster.GetRegionByKey(r.GetStartKey()), r) @@ -1189,7 +1198,7 @@ func TestHeartbeatSplit(t *testing.T) { // 1: [nil, nil) region1 := core.NewRegionInfo(&metapb.Region{Id: 1, RegionEpoch: &metapb.RegionEpoch{Version: 1, ConfVer: 1}}, nil) - re.NoError(cluster.processRegionHeartbeat(region1)) + re.NoError(cluster.processRegionHeartbeat(region1, core.NewNoopHeartbeatProcessTracer())) checkRegion(re, cluster.GetRegionByKey([]byte("foo")), region1) // split 1 to 2: [nil, m) 1: [m, nil), sync 2 first. 
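// Illustrative sketch, not part of this patch: computing the average duration of each
// heartbeat stage from the pd_core_region_heartbeat_breakdown_handle_duration_seconds_{sum,count}
// counters added in pkg/core/metrics.go, via the default Prometheus gatherer. The
// averageStageDurations helper is hypothetical and assumes imports of "strings" and
// "github.com/prometheus/client_golang/prometheus".
func averageStageDurations() (map[string]float64, error) {
	families, err := prometheus.DefaultGatherer.Gather()
	if err != nil {
		return nil, err
	}
	sums := make(map[string]float64)
	counts := make(map[string]float64)
	for _, mf := range families {
		name := mf.GetName()
		if name != "pd_core_region_heartbeat_breakdown_handle_duration_seconds_sum" &&
			name != "pd_core_region_heartbeat_breakdown_handle_duration_seconds_count" {
			continue
		}
		for _, m := range mf.GetMetric() {
			stage := ""
			for _, l := range m.GetLabel() {
				if l.GetName() == "name" {
					stage = l.GetValue()
				}
			}
			if strings.HasSuffix(name, "_sum") {
				sums[stage] = m.GetCounter().GetValue()
			} else {
				counts[stage] = m.GetCounter().GetValue()
			}
		}
	}
	avg := make(map[string]float64, len(sums))
	for stage, s := range sums {
		if c := counts[stage]; c > 0 {
			avg[stage] = s / c
		}
	}
	return avg, nil
}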
@@ -1198,12 +1207,12 @@ func TestHeartbeatSplit(t *testing.T) { core.WithIncVersion(), ) region2 := core.NewRegionInfo(&metapb.Region{Id: 2, EndKey: []byte("m"), RegionEpoch: &metapb.RegionEpoch{Version: 1, ConfVer: 1}}, nil) - re.NoError(cluster.processRegionHeartbeat(region2)) + re.NoError(cluster.processRegionHeartbeat(region2, core.NewNoopHeartbeatProcessTracer())) checkRegion(re, cluster.GetRegionByKey([]byte("a")), region2) // [m, nil) is missing before r1's heartbeat. re.Nil(cluster.GetRegionByKey([]byte("z"))) - re.NoError(cluster.processRegionHeartbeat(region1)) + re.NoError(cluster.processRegionHeartbeat(region1, core.NewNoopHeartbeatProcessTracer())) checkRegion(re, cluster.GetRegionByKey([]byte("z")), region1) // split 1 to 3: [m, q) 1: [q, nil), sync 1 first. @@ -1212,12 +1221,12 @@ func TestHeartbeatSplit(t *testing.T) { core.WithIncVersion(), ) region3 := core.NewRegionInfo(&metapb.Region{Id: 3, StartKey: []byte("m"), EndKey: []byte("q"), RegionEpoch: &metapb.RegionEpoch{Version: 1, ConfVer: 1}}, nil) - re.NoError(cluster.processRegionHeartbeat(region1)) + re.NoError(cluster.processRegionHeartbeat(region1, core.NewNoopHeartbeatProcessTracer())) checkRegion(re, cluster.GetRegionByKey([]byte("z")), region1) checkRegion(re, cluster.GetRegionByKey([]byte("a")), region2) // [m, q) is missing before r3's heartbeat. re.Nil(cluster.GetRegionByKey([]byte("n"))) - re.NoError(cluster.processRegionHeartbeat(region3)) + re.NoError(cluster.processRegionHeartbeat(region3, core.NewNoopHeartbeatProcessTracer())) checkRegion(re, cluster.GetRegionByKey([]byte("n")), region3) } @@ -1513,11 +1522,11 @@ func TestUpdateStorePendingPeerCount(t *testing.T) { }, } origin := core.NewRegionInfo(&metapb.Region{Id: 1, Peers: peers[:3]}, peers[0], core.WithPendingPeers(peers[1:3])) - re.NoError(tc.processRegionHeartbeat(origin)) + re.NoError(tc.processRegionHeartbeat(origin, core.NewNoopHeartbeatProcessTracer())) time.Sleep(50 * time.Millisecond) checkPendingPeerCount([]int{0, 1, 1, 0}, tc.RaftCluster, re) newRegion := core.NewRegionInfo(&metapb.Region{Id: 1, Peers: peers[1:]}, peers[1], core.WithPendingPeers(peers[3:4])) - re.NoError(tc.processRegionHeartbeat(newRegion)) + re.NoError(tc.processRegionHeartbeat(newRegion, core.NewNoopHeartbeatProcessTracer())) time.Sleep(50 * time.Millisecond) checkPendingPeerCount([]int{0, 0, 0, 1}, tc.RaftCluster, re) } @@ -2950,12 +2959,12 @@ func TestShouldRun(t *testing.T) { for _, testCase := range testCases { r := tc.GetRegion(testCase.regionID) nr := r.Clone(core.WithLeader(r.GetPeers()[0]), core.SetSource(core.Heartbeat)) - re.NoError(tc.processRegionHeartbeat(nr)) + re.NoError(tc.processRegionHeartbeat(nr, core.NewNoopHeartbeatProcessTracer())) re.Equal(testCase.ShouldRun, co.ShouldRun()) } nr := &metapb.Region{Id: 6, Peers: []*metapb.Peer{}} newRegion := core.NewRegionInfo(nr, nil, core.SetSource(core.Heartbeat)) - re.Error(tc.processRegionHeartbeat(newRegion)) + re.Error(tc.processRegionHeartbeat(newRegion, core.NewNoopHeartbeatProcessTracer())) re.Equal(7, tc.core.GetClusterNotFromStorageRegionsCnt()) } @@ -2993,12 +3002,12 @@ func TestShouldRunWithNonLeaderRegions(t *testing.T) { for _, testCase := range testCases { r := tc.GetRegion(testCase.regionID) nr := r.Clone(core.WithLeader(r.GetPeers()[0]), core.SetSource(core.Heartbeat)) - re.NoError(tc.processRegionHeartbeat(nr)) + re.NoError(tc.processRegionHeartbeat(nr, core.NewNoopHeartbeatProcessTracer())) re.Equal(testCase.ShouldRun, co.ShouldRun()) } nr := &metapb.Region{Id: 9, Peers: []*metapb.Peer{}} 
newRegion := core.NewRegionInfo(nr, nil, core.SetSource(core.Heartbeat)) - re.Error(tc.processRegionHeartbeat(newRegion)) + re.Error(tc.processRegionHeartbeat(newRegion, core.NewNoopHeartbeatProcessTracer())) re.Equal(9, tc.core.GetClusterNotFromStorageRegionsCnt()) // Now, after server is prepared, there exist some regions with no leader. diff --git a/server/cluster/cluster_worker.go b/server/cluster/cluster_worker.go index 74a445ad78e9..5ae8fdc0396f 100644 --- a/server/cluster/cluster_worker.go +++ b/server/cluster/cluster_worker.go @@ -34,9 +34,16 @@ import ( // HandleRegionHeartbeat processes RegionInfo reports from client. func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error { - if err := c.processRegionHeartbeat(region); err != nil { + tracer := core.NewNoopHeartbeatProcessTracer() + if c.GetScheduleConfig().EnableHeartbeatBreakdownMetrics { + tracer = core.NewHeartbeatProcessTracer() + } + tracer.Begin() + if err := c.processRegionHeartbeat(region, tracer); err != nil { + tracer.OnAllStageFinished() return err } + tracer.OnAllStageFinished() if c.IsServiceIndependent(mcsutils.SchedulingServiceName) { return nil diff --git a/server/cluster/scheduling_controller.go b/server/cluster/scheduling_controller.go index a36e7159cfd4..322ccc94d0e2 100644 --- a/server/cluster/scheduling_controller.go +++ b/server/cluster/scheduling_controller.go @@ -194,6 +194,8 @@ func (sc *schedulingController) collectSchedulingMetrics() { sc.labelStats.Collect() // collect hot cache metrics sc.hotStat.CollectMetrics() + // collect the lock metrics + sc.RegionsInfo.CollectWaitLockMetrics() } func (sc *schedulingController) removeStoreStatistics(storeID uint64) { diff --git a/server/grpc_service.go b/server/grpc_service.go index 095e45775dc2..b6cdce4c8b88 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -1293,7 +1293,6 @@ func (s *GrpcServer) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error continue } start := time.Now() - err = rc.HandleRegionHeartbeat(region) if err != nil { regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "err").Inc() @@ -1301,7 +1300,6 @@ func (s *GrpcServer) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error s.hbStreams.SendErr(pdpb.ErrorType_UNKNOWN, msg, request.GetLeader()) continue } - regionHeartbeatHandleDuration.WithLabelValues(storeAddress, storeLabel).Observe(time.Since(start).Seconds()) regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "ok").Inc() From 7d22c4f3950a49e7da702426c4fec6c98af6928f Mon Sep 17 00:00:00 2001 From: Yongbo Jiang Date: Wed, 13 Mar 2024 15:03:40 +0800 Subject: [PATCH 3/5] client: avoid panic when leader gRPC conn is nil (#7911) close tikv/pd#7910 Signed-off-by: Cabinfever_B --- client/client.go | 4 +-- client/pd_service_discovery.go | 3 +- tests/integrations/client/client_test.go | 39 ++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/client/client.go b/client/client.go index e2ceb41cfd27..b2c5cc425bb0 100644 --- a/client/client.go +++ b/client/client.go @@ -745,7 +745,7 @@ func (c *client) GetAllMembers(ctx context.Context) ([]*pdpb.Member, error) { // follower pd client and the context which holds forward information. 
func (c *client) getClientAndContext(ctx context.Context) (pdpb.PDClient, context.Context) { serviceClient := c.pdSvcDiscovery.GetServiceClient() - if serviceClient == nil { + if serviceClient == nil || serviceClient.GetClientConn() == nil { return nil, ctx } return pdpb.NewPDClient(serviceClient.GetClientConn()), serviceClient.BuildGRPCTargetContext(ctx, true) @@ -762,7 +762,7 @@ func (c *client) getRegionAPIClientAndContext(ctx context.Context, allowFollower } } serviceClient = c.pdSvcDiscovery.GetServiceClient() - if serviceClient == nil { + if serviceClient == nil || serviceClient.GetClientConn() == nil { return nil, ctx } return serviceClient, serviceClient.BuildGRPCTargetContext(ctx, !allowFollower) diff --git a/client/pd_service_discovery.go b/client/pd_service_discovery.go index bf627d76ac2e..ebcd981cc032 100644 --- a/client/pd_service_discovery.go +++ b/client/pd_service_discovery.go @@ -126,7 +126,8 @@ type ServiceDiscovery interface { type ServiceClient interface { // GetURL returns the client url of the PD/etcd server. GetURL() string - // GetClientConn returns the gRPC connection of the service client + // GetClientConn returns the gRPC connection of the service client. + // It returns nil if the connection is not available. GetClientConn() *grpc.ClientConn // BuildGRPCTargetContext builds a context object with a gRPC context. // ctx: the original context object. diff --git a/tests/integrations/client/client_test.go b/tests/integrations/client/client_test.go index 0daa270b2fab..5b33a9c5bcab 100644 --- a/tests/integrations/client/client_test.go +++ b/tests/integrations/client/client_test.go @@ -787,6 +787,45 @@ func (suite *followerForwardAndHandleTestSuite) TestGetTsoAndRegionByFollowerFor }) } +func (suite *followerForwardAndHandleTestSuite) TestGetRegionFromLeaderWhenNetworkErr() { + re := suite.Require() + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + + cluster := suite.cluster + re.NotEmpty(cluster.WaitLeader()) + leader := cluster.GetLeaderServer() + + follower := cluster.GetServer(cluster.GetFollower()) + re.NoError(failpoint.Enable("github.com/tikv/pd/client/grpcutil/unreachableNetwork2", fmt.Sprintf("return(\"%s\")", follower.GetAddr()))) + + cli := setupCli(re, ctx, suite.endpoints) + defer cli.Close() + + cluster.GetLeaderServer().GetServer().GetMember().ResignEtcdLeader(ctx, leader.GetServer().Name(), follower.GetServer().Name()) + re.NotEmpty(cluster.WaitLeader()) + + // here is just for trigger the leader change. 
+ cli.GetRegion(context.Background(), []byte("a")) + + testutil.Eventually(re, func() bool { + return cli.GetLeaderURL() == follower.GetAddr() + }) + r, err := cli.GetRegion(context.Background(), []byte("a")) + re.Error(err) + re.Nil(r) + + re.NoError(failpoint.Disable("github.com/tikv/pd/client/grpcutil/unreachableNetwork2")) + cli.GetServiceDiscovery().CheckMemberChanged() + testutil.Eventually(re, func() bool { + r, err = cli.GetRegion(context.Background(), []byte("a")) + if err == nil && r != nil { + return true + } + return false + }) +} + func (suite *followerForwardAndHandleTestSuite) TestGetRegionFromFollower() { re := suite.Require() ctx, cancel := context.WithCancel(suite.ctx) From c2c5d849cef518d2a77c1062a19e9edab9e677b0 Mon Sep 17 00:00:00 2001 From: Yongbo Jiang Date: Thu, 14 Mar 2024 12:08:09 +0800 Subject: [PATCH 4/5] client: fix wrong usage of `url.Parse` (#7916) close tikv/pd#7900 Signed-off-by: Cabinfever_B --- client/pd_service_discovery.go | 23 +++++++++++---------- client/pd_service_discovery_test.go | 32 ++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/client/pd_service_discovery.go b/client/pd_service_discovery.go index ebcd981cc032..defb797b7cac 100644 --- a/client/pd_service_discovery.go +++ b/client/pd_service_discovery.go @@ -1131,20 +1131,21 @@ func addrsToURLs(addrs []string, tlsCfg *tls.Config) []string { return urls } -func modifyURLScheme(uStr string, tlsCfg *tls.Config) string { - u, err := url.Parse(uStr) - if err != nil { - if tlsCfg != nil { - return httpsSchemePrefix + uStr +func modifyURLScheme(url string, tlsCfg *tls.Config) string { + if tlsCfg == nil { + if strings.HasPrefix(url, httpsSchemePrefix) { + url = httpSchemePrefix + strings.TrimPrefix(url, httpsSchemePrefix) + } else if !strings.HasPrefix(url, httpSchemePrefix) { + url = httpSchemePrefix + url } - return httpSchemePrefix + uStr - } - if tlsCfg != nil { - u.Scheme = httpsScheme } else { - u.Scheme = httpScheme + if strings.HasPrefix(url, httpSchemePrefix) { + url = httpsSchemePrefix + strings.TrimPrefix(url, httpSchemePrefix) + } else if !strings.HasPrefix(url, httpsSchemePrefix) { + url = httpsSchemePrefix + url + } } - return u.String() + return url } // pickMatchedURL picks the matched URL based on the TLS config. 
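The rewritten modifyURLScheme above replaces url.Parse with plain prefix checks. As a minimal standalone sketch (not the PD client code; normalizeScheme is an illustrative helper), the Go program below shows the behavior that motivates the change: url.Parse accepts a schemeless "host:port" such as "tc-pd:2379" by treating "tc-pd" as the scheme and "2379" as opaque data, so overwriting u.Scheme silently drops the host, while an IP endpoint like "127.0.0.1:2379" fails to parse altogether and takes a different branch. Prefix-based handling treats both forms uniformly.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// normalizeScheme mirrors the prefix-based approach in the hunk above:
// strip any existing http:// or https:// prefix, then force the scheme
// that matches the TLS setting. Illustrative helper, not the PD implementation.
func normalizeScheme(raw string, useTLS bool) string {
	raw = strings.TrimPrefix(strings.TrimPrefix(raw, "http://"), "https://")
	if useTLS {
		return "https://" + raw
	}
	return "http://" + raw
}

func main() {
	// "tc-pd:2379" parses without error, but "tc-pd" becomes the scheme
	// and "2379" the opaque part, so rewriting u.Scheme loses the host.
	u, err := url.Parse("tc-pd:2379")
	fmt.Println(u.Scheme, u.Opaque, err) // tc-pd 2379 <nil>
	u.Scheme = "https"
	fmt.Println(u.String()) // https:2379 -- the host name is gone

	// An IP-based endpoint does not parse at all.
	_, err = url.Parse("127.0.0.1:2379")
	fmt.Println(err) // ...: first path segment in URL cannot contain colon

	// Prefix-based normalization handles both forms the same way.
	fmt.Println(normalizeScheme("tc-pd:2379", true))              // https://tc-pd:2379
	fmt.Println(normalizeScheme("https://127.0.0.1:2379", false)) // http://127.0.0.1:2379
}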
diff --git a/client/pd_service_discovery_test.go b/client/pd_service_discovery_test.go index 2373fc4c3049..f4cde0e1911c 100644 --- a/client/pd_service_discovery_test.go +++ b/client/pd_service_discovery_test.go @@ -324,14 +324,44 @@ func TestServiceClientScheme(t *testing.T) { func TestSchemeFunction(t *testing.T) { re := require.New(t) tlsCfg := &tls.Config{} + + endpoints1 := []string{ + "http://tc-pd:2379", + "tc-pd:2379", + "https://tc-pd:2379", + } + endpoints2 := []string{ + "127.0.0.1:2379", + "http://127.0.0.1:2379", + "https://127.0.0.1:2379", + } + urls := addrsToURLs(endpoints1, tlsCfg) + for _, u := range urls { + re.Equal("https://tc-pd:2379", u) + } + urls = addrsToURLs(endpoints2, tlsCfg) + for _, u := range urls { + re.Equal("https://127.0.0.1:2379", u) + } + urls = addrsToURLs(endpoints1, nil) + for _, u := range urls { + re.Equal("http://tc-pd:2379", u) + } + urls = addrsToURLs(endpoints2, nil) + for _, u := range urls { + re.Equal("http://127.0.0.1:2379", u) + } + re.Equal("https://127.0.0.1:2379", modifyURLScheme("https://127.0.0.1:2379", tlsCfg)) re.Equal("https://127.0.0.1:2379", modifyURLScheme("http://127.0.0.1:2379", tlsCfg)) re.Equal("https://127.0.0.1:2379", modifyURLScheme("127.0.0.1:2379", tlsCfg)) + re.Equal("https://tc-pd:2379", modifyURLScheme("tc-pd:2379", tlsCfg)) re.Equal("http://127.0.0.1:2379", modifyURLScheme("https://127.0.0.1:2379", nil)) re.Equal("http://127.0.0.1:2379", modifyURLScheme("http://127.0.0.1:2379", nil)) re.Equal("http://127.0.0.1:2379", modifyURLScheme("127.0.0.1:2379", nil)) + re.Equal("http://tc-pd:2379", modifyURLScheme("tc-pd:2379", nil)) - urls := []string{ + urls = []string{ "http://127.0.0.1:2379", "https://127.0.0.1:2379", } From b9ea01ac7671d0f495a826c27965f2d967a78e2d Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 14 Mar 2024 15:45:40 +0800 Subject: [PATCH 5/5] metrics: convert delta() usage to rate(), except for directional negative (#7669) close tikv/pd#4489 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- metrics/grafana/pd.json | 82 ++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index fdab784bf941..b91384828979 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -1151,7 +1151,7 @@ "fillGradient": 0, "gridPos": { "h": 6, - "w": 4, + "w": 8, "x": 16, "y": 13 }, @@ -2671,7 +2671,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"create\"}[1m])) by (type)", + "expr": "sum(rate(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"create\"}[1m])*60) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", @@ -2764,7 +2764,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"check\"}[1m])) by (type)", + "expr": "sum(rate(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"check\"}[1m])*60) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", @@ -2857,7 +2857,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"finish\"}[1m])) by (type)", + "expr": 
"sum(rate(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"finish\"}[1m])*60) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", @@ -2949,7 +2949,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"timeout\"}[1m])) by (type)", + "expr": "sum(rate(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"timeout\"}[1m])*60) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", @@ -3042,7 +3042,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"cancel\"}[1m])) by (type)", + "expr": "sum(rate(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"cancel\"}[1m])*60) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", @@ -3050,7 +3050,7 @@ "step": 4 }, { - "expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"replace\"}[1m])) by (type)", + "expr": "sum(rate(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"replace\"}[1m])*60) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", @@ -3143,7 +3143,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (event)", + "expr": "sum(rate(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])*60) by (event)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{event}}", @@ -3435,7 +3435,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_operator_limit{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type,name)", + "expr": "sum(rate(pd_schedule_operator_limit{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])*60) by (type,name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{name}}", @@ -3443,7 +3443,7 @@ }, { "exemplar": true, - "expr": "sum(delta(pd_schedule_operator_exceeded_store_limit{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (desc)", + "expr": "sum(rate(pd_schedule_operator_exceeded_store_limit{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])*60) by (desc)", "hide": true, "interval": "", "legendFormat": "{{desc}}-exceed-store-limit", @@ -7782,7 +7782,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type, source, target)", + "expr": "sum(rate(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])*60) by (type, source, target)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{source}}-{{target}}-{{type}}", @@ -7811,7 +7811,7 @@ }, "yaxes": [ { - "format": "ops", + "format": "opm", "label": null, "logBase": 1, "max": null, @@ -8240,7 +8240,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", action=\"filter-source\", type!=\"store-state-tombstone-filter\"}[1m])) by (source, type, scope)", + 
"expr": "sum(rate(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", action=\"filter-source\", type!=\"store-state-tombstone-filter\"}[1m])*60) by (source, type, scope)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{scope}}-store-{{source}}-{{type}}", @@ -8269,7 +8269,7 @@ }, "yaxes": [ { - "format": "ops", + "format": "opm", "label": null, "logBase": 1, "max": null, @@ -8336,7 +8336,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", action=\"filter-target\", type!=\"store-state-tombstone-filter\"}[1m])) by (target, type, scope)", + "expr": "sum(rate(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", action=\"filter-target\", type!=\"store-state-tombstone-filter\"}[1m])*60) by (target, type, scope)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{scope}}-store-{{target}}-{{type}}", @@ -8345,7 +8345,7 @@ "step": 4 }, { - "expr": "sum(delta(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", action=\"filter-target\",type=\"distinct-filter\"}[1m])) by (source, target, type, scope)", + "expr": "sum(rate(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", action=\"filter-target\",type=\"distinct-filter\"}[1m])*60) by (source, target, type, scope)", "format": "time_series", "hide": true, "intervalFactor": 2, @@ -8353,7 +8353,7 @@ "refId": "B" }, { - "expr": "sum(delta(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", action=\"filter-target\",type=\"rule-fit-filter\"}[1m])) by (source, target, type, scope)", + "expr": "sum(rate(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", action=\"filter-target\",type=\"rule-fit-filter\"}[1m])*60) by (source, target, type, scope)", "format": "time_series", "hide": true, "intervalFactor": 2, @@ -8361,7 +8361,7 @@ "refId": "C" }, { - "expr": "sum(delta(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", action=\"filter-target\",type=\"rule-fit-leader-filter\"}[1m])) by (source, target, type, scope)", + "expr": "sum(rate(pd_schedule_filter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", action=\"filter-target\",type=\"rule-fit-leader-filter\"}[1m])*60) by (source, target, type, scope)", "format": "time_series", "hide": true, "intervalFactor": 2, @@ -8389,7 +8389,7 @@ }, "yaxes": [ { - "format": "ops", + "format": "opm", "label": null, "logBase": 1, "max": null, @@ -8467,21 +8467,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_scatter_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"skip\"}[1m])) by (event)", + "expr": "sum(rate(pd_schedule_scatter_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"skip\"}[1m])*60) by (event)", "format": "time_series", "intervalFactor": 2, "legendFormat": "skip-{{event}}", "refId": "A" }, { - "expr": "delta(pd_schedule_scatter_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"fail\"}[1m])", + "expr": "rate(pd_schedule_scatter_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"fail\"}[1m]*60)", "format": "time_series", "intervalFactor": 2, "legendFormat": "fail", "refId": "B" }, { - "expr": "delta(pd_schedule_scatter_operators_count{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\", type=\"success\"}[1m])", + "expr": "rate(pd_schedule_scatter_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"success\"}[1m]*60)", "format": "time_series", "intervalFactor": 2, "legendFormat": "success", @@ -8570,14 +8570,14 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_schedule_scatter_distribution{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", engine=\"tikv\", is_leader=\"false\"}[1m])) by (store)", + "expr": "sum(rate(pd_schedule_scatter_distribution{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", engine=\"tikv\", is_leader=\"false\"}[1m])*60) by (store)", "format": "time_series", "intervalFactor": 1, "legendFormat": "peer-{{store}}", "refId": "A" }, { - "expr": "sum(delta(pd_schedule_scatter_distribution{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", engine=\"tikv\", is_leader=\"true\"}[1m])) by (store)", + "expr": "sum(rate(pd_schedule_scatter_distribution{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", engine=\"tikv\", is_leader=\"true\"}[1m])*60) by (store)", "format": "time_series", "intervalFactor": 1, "legendFormat": "leader-{{store}}", @@ -9754,7 +9754,7 @@ "query": { "datasourceId": 1, "model": { - "expr": "delta(etcd_disk_wal_fsync_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"pd\"}[1m])", + "expr": "rate(etcd_disk_wal_fsync_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"pd\"}[1m])*60", "intervalFactor": 2, "legendFormat": "{{instance}} etch disk wal fsync rate", "refId": "A", @@ -9825,7 +9825,7 @@ "steppedLine": false, "targets": [ { - "expr": "delta(etcd_disk_wal_fsync_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"pd\"}[1m])", + "expr": "rate(etcd_disk_wal_fsync_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"pd\"}[1m])*60", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -10024,7 +10024,7 @@ "steppedLine": false, "targets": [ { - "expr": "delta(etcd_disk_backend_commit_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])", + "expr": "rate(etcd_disk_backend_commit_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])*60", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -10988,7 +10988,7 @@ }, { "exemplar": true, - "expr": "sum(delta(pd_client_request_handle_tso_batch_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) / sum(delta(pd_client_request_handle_tso_batch_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m]))", + "expr": "sum(rate(pd_client_request_handle_tso_batch_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) / sum(rate(pd_client_request_handle_tso_batch_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m]))", "hide": false, "interval": "", "intervalFactor": 1, @@ -11411,7 +11411,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_scheduler_region_heartbeat{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"report\", status=\"ok\"}[1m])) by (address, store)", + "expr": "sum(rate(pd_scheduler_region_heartbeat{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"report\", status=\"ok\"}[1m])*60) by (address, store)", "format": 
"time_series", "hide": false, "intervalFactor": 2, @@ -11601,7 +11601,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_scheduler_region_heartbeat{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"report\", status=\"err\"}[1m])) by (address, store)", + "expr": "sum(rate(pd_scheduler_region_heartbeat{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"report\", status=\"err\"}[1m])*60) by (address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", @@ -11691,7 +11691,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_scheduler_region_heartbeat{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"report\", status=\"bind\"}[1m])) by (address, store)", + "expr": "sum(rate(pd_scheduler_region_heartbeat{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"report\", status=\"bind\"}[1m])*60) by (address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", @@ -11882,7 +11882,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_hbstream_region_message{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"push\", status=\"ok\"}[1m])) by (address, store)", + "expr": "sum(rate(pd_hbstream_region_message{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"push\", status=\"ok\"}[1m])*60) by (address, store)", "format": "time_series", "hide": false, "intervalFactor": 2, @@ -11979,7 +11979,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(delta(pd_hbstream_region_message{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"push\", status=\"err\"}[1m])) by (address, store)", + "expr": "sum(rate(pd_hbstream_region_message{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"push\", status=\"err\"}[1m])*60) by (address, store)", "format": "time_series", "hide": false, "intervalFactor": 2, @@ -12283,7 +12283,7 @@ "pluginVersion": "7.1.5", "targets": [ { - "expr": "sum(delta(pd_scheduler_read_byte_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", + "expr": "sum(rate(pd_scheduler_read_byte_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", "format": "heatmap", "hide": false, "interval": "", @@ -12341,7 +12341,7 @@ "pluginVersion": "7.1.5", "targets": [ { - "expr": "sum(delta(pd_scheduler_write_byte_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", + "expr": "sum(rate(pd_scheduler_write_byte_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", "format": "heatmap", "hide": false, "interval": "", @@ -12401,7 +12401,7 @@ "pluginVersion": "7.1.5", "targets": [ { - "expr": "sum(delta(pd_scheduler_read_key_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", + "expr": "sum(rate(pd_scheduler_read_key_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", "format": "heatmap", "hide": false, "interval": "", @@ -12461,7 +12461,7 @@ "pluginVersion": "7.1.5", "targets": [ { - "expr": "sum(delta(pd_scheduler_write_key_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", + "expr": 
"sum(rate(pd_scheduler_write_key_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", "format": "heatmap", "hide": false, "interval": "", @@ -12523,7 +12523,7 @@ "repeatDirection": "h", "targets": [ { - "expr": "sum(delta(pd_scheduler_store_heartbeat_interval_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", + "expr": "sum(rate(pd_scheduler_store_heartbeat_interval_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", "format": "heatmap", "hide": false, "interval": "", @@ -12585,7 +12585,7 @@ "repeatDirection": "h", "targets": [ { - "expr": "sum(delta(pd_scheduler_region_heartbeat_interval_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", + "expr": "sum(rate(pd_scheduler_region_heartbeat_interval_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", "format": "heatmap", "hide": false, "interval": "", @@ -12634,7 +12634,7 @@ "repeatDirection": "h", "targets": [ { - "expr": "sum(delta(pd_server_bucket_report_interval_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (le)", + "expr": "sum(rate(pd_server_bucket_report_interval_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (le)", "format": "heatmap", "hide": false, "interval": "", @@ -12683,7 +12683,7 @@ "repeatDirection": "h", "targets": [ { - "expr": "sum(delta(pd_scheduler_buckets_hot_degree_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (le)", + "expr": "sum(rate(pd_scheduler_buckets_hot_degree_hist_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\"}[1m])) by (le)", "format": "heatmap", "hide": false, "interval": "",