From 508437cdd6aa9133626783a066341a2ccb938fbd Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 13 Jul 2023 16:59:39 +0800 Subject: [PATCH 1/9] Get region info via core cluster inside RegionStatistics Signed-off-by: JmPotato --- pkg/statistics/region_collection.go | 84 ++++++++++++++---------- pkg/statistics/region_collection_test.go | 4 +- server/cluster/cluster.go | 2 +- server/cluster/cluster_test.go | 36 ++++++++-- 4 files changed, 84 insertions(+), 42 deletions(-) diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index 93959576282..71e62d052a8 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -42,6 +42,28 @@ const ( WitnessLeader ) +var ( + regionStatisticTypes = []RegionStatisticType{ + MissPeer, + ExtraPeer, + DownPeer, + PendingPeer, + LearnerPeer, + EmptyRegion, + OversizedRegion, + UndersizedRegion, + WitnessLeader, + } + offlineRegionStatisticTypes = []RegionStatisticType{ + MissPeer, + ExtraPeer, + DownPeer, + PendingPeer, + OfflinePeer, + LearnerPeer, + } +) + const nonIsolation = "none" var ( @@ -64,9 +86,9 @@ var ( offlineOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") ) -// RegionInfo is used to record the status of region. -type RegionInfo struct { - *core.RegionInfo +// RegionInfoWithTS is used to record the extra timestamp status of a region. +type RegionInfoWithTS struct { + id uint64 startMissVoterPeerTS int64 startDownPeerTS int64 } @@ -74,9 +96,11 @@ type RegionInfo struct { // RegionStatistics is used to record the status of regions. type RegionStatistics struct { sync.RWMutex + // core is used to get the region information. + core *core.BasicCluster conf sc.CheckerConfig - stats map[RegionStatisticType]map[uint64]*RegionInfo - offlineStats map[RegionStatisticType]map[uint64]*core.RegionInfo + stats map[RegionStatisticType]map[uint64]*RegionInfoWithTS + offlineStats map[RegionStatisticType]map[uint64]struct{} index map[uint64]RegionStatisticType offlineIndex map[uint64]RegionStatisticType ruleManager *placement.RuleManager @@ -84,32 +108,28 @@ type RegionStatistics struct { } // NewRegionStatistics creates a new RegionStatistics. -func NewRegionStatistics(conf sc.CheckerConfig, ruleManager *placement.RuleManager, storeConfigManager *config.StoreConfigManager) *RegionStatistics { +func NewRegionStatistics( + core *core.BasicCluster, + conf sc.CheckerConfig, + ruleManager *placement.RuleManager, + storeConfigManager *config.StoreConfigManager, +) *RegionStatistics { r := &RegionStatistics{ + core: core, conf: conf, ruleManager: ruleManager, storeConfigManager: storeConfigManager, - stats: make(map[RegionStatisticType]map[uint64]*RegionInfo), - offlineStats: make(map[RegionStatisticType]map[uint64]*core.RegionInfo), + stats: make(map[RegionStatisticType]map[uint64]*RegionInfoWithTS), + offlineStats: make(map[RegionStatisticType]map[uint64]struct{}), index: make(map[uint64]RegionStatisticType), offlineIndex: make(map[uint64]RegionStatisticType), } - r.stats[MissPeer] = make(map[uint64]*RegionInfo) - r.stats[ExtraPeer] = make(map[uint64]*RegionInfo) - r.stats[DownPeer] = make(map[uint64]*RegionInfo) - r.stats[PendingPeer] = make(map[uint64]*RegionInfo) - r.stats[LearnerPeer] = make(map[uint64]*RegionInfo) - r.stats[EmptyRegion] = make(map[uint64]*RegionInfo) - r.stats[OversizedRegion] = make(map[uint64]*RegionInfo) - r.stats[UndersizedRegion] = make(map[uint64]*RegionInfo) - r.stats[WitnessLeader] = make(map[uint64]*RegionInfo) - - r.offlineStats[MissPeer] = make(map[uint64]*core.RegionInfo) - r.offlineStats[ExtraPeer] = make(map[uint64]*core.RegionInfo) - r.offlineStats[DownPeer] = make(map[uint64]*core.RegionInfo) - r.offlineStats[PendingPeer] = make(map[uint64]*core.RegionInfo) - r.offlineStats[LearnerPeer] = make(map[uint64]*core.RegionInfo) - r.offlineStats[OfflinePeer] = make(map[uint64]*core.RegionInfo) + for _, typ := range regionStatisticTypes { + r.stats[typ] = make(map[uint64]*RegionInfoWithTS) + } + for _, typ := range offlineRegionStatisticTypes { + r.offlineStats[typ] = make(map[uint64]struct{}) + } return r } @@ -118,8 +138,8 @@ func (r *RegionStatistics) GetRegionStatsByType(typ RegionStatisticType) []*core r.RLock() defer r.RUnlock() res := make([]*core.RegionInfo, 0, len(r.stats[typ])) - for _, r := range r.stats[typ] { - res = append(res, r.RegionInfo.Clone()) + for regionID := range r.stats[typ] { + res = append(res, r.core.GetRegion(regionID).Clone()) } return res } @@ -137,8 +157,8 @@ func (r *RegionStatistics) GetOfflineRegionStatsByType(typ RegionStatisticType) r.RLock() defer r.RUnlock() res := make([]*core.RegionInfo, 0, len(r.stats[typ])) - for _, r := range r.offlineStats[typ] { - res = append(res, r.Clone()) + for regionID := range r.offlineStats[typ] { + res = append(res, r.core.GetRegion(regionID).Clone()) } return res } @@ -236,14 +256,12 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store for typ, c := range conditions { if c { if isRemoving && typ < EmptyRegion { - r.offlineStats[typ][regionID] = region + r.offlineStats[typ][regionID] = struct{}{} offlinePeerTypeIndex |= typ } info := r.stats[typ][regionID] if info == nil { - info = &RegionInfo{ - RegionInfo: region, - } + info = &RegionInfoWithTS{id: regionID} } if typ == DownPeer { if info.startDownPeerTS != 0 { @@ -265,7 +283,7 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store } if isRemoving { - r.offlineStats[OfflinePeer][regionID] = region + r.offlineStats[OfflinePeer][regionID] = struct{}{} offlinePeerTypeIndex |= OfflinePeer } diff --git a/pkg/statistics/region_collection_test.go b/pkg/statistics/region_collection_test.go index 1e071900708..c8958767e66 100644 --- a/pkg/statistics/region_collection_test.go +++ b/pkg/statistics/region_collection_test.go @@ -65,7 +65,7 @@ func TestRegionStatistics(t *testing.T) { r2 := &metapb.Region{Id: 2, Peers: peers[0:2], StartKey: []byte("cc"), EndKey: []byte("dd")} region1 := core.NewRegionInfo(r1, peers[0]) region2 := core.NewRegionInfo(r2, peers[0]) - regionStats := NewRegionStatistics(opt, manager, nil) + regionStats := NewRegionStatistics(nil, opt, manager, nil) regionStats.Observe(region1, stores) re.Len(regionStats.stats[ExtraPeer], 1) re.Len(regionStats.stats[LearnerPeer], 1) @@ -166,7 +166,7 @@ func TestRegionStatisticsWithPlacementRule(t *testing.T) { region3 := core.NewRegionInfo(r3, peers[0]) region4 := core.NewRegionInfo(r4, peers[0]) region5 := core.NewRegionInfo(r5, peers[4]) - regionStats := NewRegionStatistics(opt, manager, nil) + regionStats := NewRegionStatistics(nil, opt, manager, nil) // r2 didn't match the rules regionStats.Observe(region2, stores) re.Len(regionStats.stats[MissPeer], 1) diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 84dfd3cecfe..c905cef97dd 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -320,7 +320,7 @@ func (c *RaftCluster) Start(s Server) error { } c.storeConfigManager = config.NewStoreConfigManager(c.httpClient) c.coordinator = schedule.NewCoordinator(c.ctx, cluster, s.GetHBStreams()) - c.regionStats = statistics.NewRegionStatistics(c.opt, c.ruleManager, c.storeConfigManager) + c.regionStats = statistics.NewRegionStatistics(c.core, c.opt, c.ruleManager, c.storeConfigManager) c.limiter = NewStoreLimiter(s.GetPersistOptions()) c.externalTS, err = c.storage.LoadExternalTS() if err != nil { diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index c2ff966f228..f735fa30fe0 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -978,7 +978,11 @@ func TestRegionSizeChanged(t *testing.T) { re.NoError(err) cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster()) cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) - cluster.regionStats = statistics.NewRegionStatistics(cluster.GetOpts(), cluster.ruleManager, cluster.storeConfigManager) + cluster.regionStats = statistics.NewRegionStatistics( + cluster.GetBasicCluster(), + cluster.GetOpts(), + cluster.ruleManager, + cluster.storeConfigManager) region := newTestRegions(1, 3, 3)[0] cluster.opt.GetMaxMergeRegionKeys() curMaxMergeSize := int64(cluster.opt.GetMaxMergeRegionSize()) @@ -1260,7 +1264,11 @@ func TestOfflineAndMerge(t *testing.T) { panic(err) } } - cluster.regionStats = statistics.NewRegionStatistics(cluster.GetOpts(), cluster.ruleManager, cluster.storeConfigManager) + cluster.regionStats = statistics.NewRegionStatistics( + cluster.GetBasicCluster(), + cluster.GetOpts(), + cluster.ruleManager, + cluster.storeConfigManager) cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) // Put 4 stores. @@ -1514,7 +1522,11 @@ func TestCalculateStoreSize1(t *testing.T) { opt.SetReplicationConfig(cfg) cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster()) cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) - cluster.regionStats = statistics.NewRegionStatistics(cluster.GetOpts(), cluster.ruleManager, cluster.storeConfigManager) + cluster.regionStats = statistics.NewRegionStatistics( + cluster.GetBasicCluster(), + cluster.GetOpts(), + cluster.ruleManager, + cluster.storeConfigManager) // Put 10 stores. for i, store := range newTestStores(10, "6.0.0") { @@ -1597,7 +1609,11 @@ func TestCalculateStoreSize2(t *testing.T) { opt.SetMaxReplicas(3) cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster()) cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) - cluster.regionStats = statistics.NewRegionStatistics(cluster.GetOpts(), cluster.ruleManager, cluster.storeConfigManager) + cluster.regionStats = statistics.NewRegionStatistics( + cluster.GetBasicCluster(), + cluster.GetOpts(), + cluster.ruleManager, + cluster.storeConfigManager) // Put 10 stores. for i, store := range newTestStores(10, "6.0.0") { @@ -2351,7 +2367,11 @@ func TestCollectMetricsConcurrent(t *testing.T) { re := require.New(t) tc, co, cleanup := prepare(nil, func(tc *testCluster) { - tc.regionStats = statistics.NewRegionStatistics(tc.GetOpts(), nil, tc.storeConfigManager) + tc.regionStats = statistics.NewRegionStatistics( + tc.GetBasicCluster(), + tc.GetOpts(), + nil, + tc.storeConfigManager) }, func(co *schedule.Coordinator) { co.Run() }, re) defer cleanup() @@ -2383,7 +2403,11 @@ func TestCollectMetrics(t *testing.T) { re := require.New(t) tc, co, cleanup := prepare(nil, func(tc *testCluster) { - tc.regionStats = statistics.NewRegionStatistics(tc.GetOpts(), nil, tc.storeConfigManager) + tc.regionStats = statistics.NewRegionStatistics( + tc.GetBasicCluster(), + tc.GetOpts(), + nil, + tc.storeConfigManager) }, func(co *schedule.Coordinator) { co.Run() }, re) defer cleanup() count := 10 From e722bbb97ff49a2daa92a08117cea258c805a9a8 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 13 Jul 2023 17:10:47 +0800 Subject: [PATCH 2/9] Fix the typo Signed-off-by: JmPotato --- pkg/statistics/region_collection.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index 71e62d052a8..c5797afd00b 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -68,15 +68,15 @@ const nonIsolation = "none" var ( // WithLabelValues is a heavy operation, define variable to avoid call it every time. - regionMissPeerRegionCounter = regionStatusGauge.WithLabelValues("miss-peer-region-count") - regionExtraPeerRegionCounter = regionStatusGauge.WithLabelValues("extra-peer-region-count") - regionDownPeerRegionCounter = regionStatusGauge.WithLabelValues("down-peer-region-count") - regionPendingPeerRegionCounter = regionStatusGauge.WithLabelValues("pending-peer-region-count") - regionLearnerPeerRegionCounter = regionStatusGauge.WithLabelValues("learner-peer-region-count") - regionEmptyRegionCounter = regionStatusGauge.WithLabelValues("empty-region-count") - regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count") - regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count") - regionWitnesssLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count") + regionMissPeerRegionCounter = regionStatusGauge.WithLabelValues("miss-peer-region-count") + regionExtraPeerRegionCounter = regionStatusGauge.WithLabelValues("extra-peer-region-count") + regionDownPeerRegionCounter = regionStatusGauge.WithLabelValues("down-peer-region-count") + regionPendingPeerRegionCounter = regionStatusGauge.WithLabelValues("pending-peer-region-count") + regionLearnerPeerRegionCounter = regionStatusGauge.WithLabelValues("learner-peer-region-count") + regionEmptyRegionCounter = regionStatusGauge.WithLabelValues("empty-region-count") + regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count") + regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count") + regionWitnessLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count") offlineMissPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("miss-peer-region-count") offlineExtraPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("extra-peer-region-count") @@ -324,7 +324,7 @@ func (r *RegionStatistics) Collect() { regionEmptyRegionCounter.Set(float64(len(r.stats[EmptyRegion]))) regionOversizedRegionCounter.Set(float64(len(r.stats[OversizedRegion]))) regionUndersizedRegionCounter.Set(float64(len(r.stats[UndersizedRegion]))) - regionWitnesssLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader]))) + regionWitnessLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader]))) offlineMissPeerRegionCounter.Set(float64(len(r.offlineStats[MissPeer]))) offlineExtraPeerRegionCounter.Set(float64(len(r.offlineStats[ExtraPeer]))) @@ -344,7 +344,7 @@ func (r *RegionStatistics) Reset() { regionEmptyRegionCounter.Set(0) regionOversizedRegionCounter.Set(0) regionUndersizedRegionCounter.Set(0) - regionWitnesssLeaderRegionCounter.Set(0) + regionWitnessLeaderRegionCounter.Set(0) offlineMissPeerRegionCounter.Set(0) offlineExtraPeerRegionCounter.Set(0) From 464ac9e0d7d7915bc1cc961e7a116cceece396c7 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 13 Jul 2023 19:33:02 +0800 Subject: [PATCH 3/9] Keep the offline region recorded inside RegionStatistics Signed-off-by: JmPotato --- pkg/statistics/region_collection.go | 144 ++++++++--------------- pkg/statistics/region_collection_test.go | 24 +--- server/api/region.go | 2 +- server/cluster/cluster.go | 6 +- server/cluster/cluster_test.go | 4 +- server/handler.go | 4 +- 6 files changed, 59 insertions(+), 125 deletions(-) diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index c5797afd00b..de423a459cf 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -42,27 +42,17 @@ const ( WitnessLeader ) -var ( - regionStatisticTypes = []RegionStatisticType{ - MissPeer, - ExtraPeer, - DownPeer, - PendingPeer, - LearnerPeer, - EmptyRegion, - OversizedRegion, - UndersizedRegion, - WitnessLeader, - } - offlineRegionStatisticTypes = []RegionStatisticType{ - MissPeer, - ExtraPeer, - DownPeer, - PendingPeer, - OfflinePeer, - LearnerPeer, - } -) +var regionStatisticTypes = []RegionStatisticType{ + MissPeer, + ExtraPeer, + DownPeer, + PendingPeer, + LearnerPeer, + EmptyRegion, + OversizedRegion, + UndersizedRegion, + WitnessLeader, +} const nonIsolation = "none" @@ -77,13 +67,7 @@ var ( regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count") regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count") regionWitnessLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count") - - offlineMissPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("miss-peer-region-count") - offlineExtraPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("extra-peer-region-count") - offlineDownPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("down-peer-region-count") - offlinePendingPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("pending-peer-region-count") - offlineLearnerPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("learner-peer-region-count") - offlineOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") + offlineOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") ) // RegionInfoWithTS is used to record the extra timestamp status of a region. @@ -96,50 +80,49 @@ type RegionInfoWithTS struct { // RegionStatistics is used to record the status of regions. type RegionStatistics struct { sync.RWMutex - // core is used to get the region information. - core *core.BasicCluster - conf sc.CheckerConfig - stats map[RegionStatisticType]map[uint64]*RegionInfoWithTS - offlineStats map[RegionStatisticType]map[uint64]struct{} + // bc is used to get the region information. + bc *core.BasicCluster + conf sc.CheckerConfig + stats map[RegionStatisticType]map[uint64]*RegionInfoWithTS + // Since we may easily have a large number of offline regions in the scale-in scenario, + // to prevent the lock contention with the heartbeat processing, we use a separate map + // to record the full offline region information here. + offlineStats map[uint64]*core.RegionInfo index map[uint64]RegionStatisticType - offlineIndex map[uint64]RegionStatisticType ruleManager *placement.RuleManager storeConfigManager *config.StoreConfigManager } // NewRegionStatistics creates a new RegionStatistics. func NewRegionStatistics( - core *core.BasicCluster, + bc *core.BasicCluster, conf sc.CheckerConfig, ruleManager *placement.RuleManager, storeConfigManager *config.StoreConfigManager, ) *RegionStatistics { r := &RegionStatistics{ - core: core, + bc: bc, conf: conf, ruleManager: ruleManager, storeConfigManager: storeConfigManager, stats: make(map[RegionStatisticType]map[uint64]*RegionInfoWithTS), - offlineStats: make(map[RegionStatisticType]map[uint64]struct{}), + offlineStats: make(map[uint64]*core.RegionInfo), index: make(map[uint64]RegionStatisticType), - offlineIndex: make(map[uint64]RegionStatisticType), } for _, typ := range regionStatisticTypes { r.stats[typ] = make(map[uint64]*RegionInfoWithTS) } - for _, typ := range offlineRegionStatisticTypes { - r.offlineStats[typ] = make(map[uint64]struct{}) - } return r } -// GetRegionStatsByType gets the status of the region by types. The regions here need to be cloned, otherwise, it may cause data race problems. +// GetRegionStatsByType gets the status of the region by types. +// The regions here need to be cloned, otherwise, it may cause data race problems. func (r *RegionStatistics) GetRegionStatsByType(typ RegionStatisticType) []*core.RegionInfo { r.RLock() defer r.RUnlock() res := make([]*core.RegionInfo, 0, len(r.stats[typ])) for regionID := range r.stats[typ] { - res = append(res, r.core.GetRegion(regionID).Clone()) + res = append(res, r.bc.GetRegion(regionID).Clone()) } return res } @@ -152,13 +135,14 @@ func (r *RegionStatistics) IsRegionStatsType(regionID uint64, typ RegionStatisti return exist } -// GetOfflineRegionStatsByType gets the status of the offline region by types. The regions here need to be cloned, otherwise, it may cause data race problems. -func (r *RegionStatistics) GetOfflineRegionStatsByType(typ RegionStatisticType) []*core.RegionInfo { +// GetOfflineRegionStats gets the status of the offline region. +// The regions here need to be cloned, otherwise, it may cause data race problems. +func (r *RegionStatistics) GetOfflineRegionStats() []*core.RegionInfo { r.RLock() defer r.RUnlock() - res := make([]*core.RegionInfo, 0, len(r.stats[typ])) - for regionID := range r.offlineStats[typ] { - res = append(res, r.core.GetRegion(regionID).Clone()) + res := make([]*core.RegionInfo, 0, len(r.offlineStats)) + for _, r := range r.offlineStats { + res = append(res, r.Clone()) } return res } @@ -171,14 +155,6 @@ func (r *RegionStatistics) deleteEntry(deleteIndex RegionStatisticType, regionID } } -func (r *RegionStatistics) deleteOfflineEntry(deleteIndex RegionStatisticType, regionID uint64) { - for typ := RegionStatisticType(1); typ <= deleteIndex; typ <<= 1 { - if deleteIndex&typ != 0 { - delete(r.offlineStats[typ], regionID) - } - } -} - // RegionStatsNeedUpdate checks whether the region's status need to be updated // due to some special state types. func (r *RegionStatistics) RegionStatsNeedUpdate(region *core.RegionInfo) bool { @@ -195,15 +171,13 @@ func (r *RegionStatistics) RegionStatsNeedUpdate(region *core.RegionInfo) bool { func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.StoreInfo) { r.Lock() defer r.Unlock() - // Region state. - regionID := region.GetID() var ( - peerTypeIndex RegionStatisticType - offlinePeerTypeIndex RegionStatisticType - deleteIndex RegionStatisticType + desiredReplicas = r.conf.GetMaxReplicas() + desiredVoters = desiredReplicas + peerTypeIndex RegionStatisticType + deleteIndex RegionStatisticType ) - desiredReplicas := r.conf.GetMaxReplicas() - desiredVoters := desiredReplicas + // Check if the region meets count requirements of its rules. if r.conf.IsPlacementRulesEnabled() { if !r.ruleManager.IsInitialized() { log.Warn("ruleManager haven't been initialized") @@ -219,9 +193,8 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store } } } - + // Check if the region is in a removing state. var isRemoving bool - for _, store := range stores { if store.IsRemoving() { peer := region.GetStorePeer(store.GetID()) @@ -231,7 +204,6 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store } } } - // Better to make sure once any of these conditions changes, it will trigger the heartbeat `save_cache`. // Otherwise, the state may be out-of-date for a long time, which needs another way to apply the change ASAP. // For example, see `RegionStatsNeedUpdate` above to know how `OversizedRegion` and `UndersizedRegion` are updated. @@ -252,13 +224,10 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store ), WitnessLeader: region.GetLeader().GetIsWitness(), } - + // Check if the region meets any of the conditions and update the corresponding info. + regionID := region.GetID() for typ, c := range conditions { if c { - if isRemoving && typ < EmptyRegion { - r.offlineStats[typ][regionID] = struct{}{} - offlinePeerTypeIndex |= typ - } info := r.stats[typ][regionID] if info == nil { info = &RegionInfoWithTS{id: regionID} @@ -281,18 +250,13 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store peerTypeIndex |= typ } } - + // Update the offline region info. if isRemoving { - r.offlineStats[OfflinePeer][regionID] = struct{}{} - offlinePeerTypeIndex |= OfflinePeer - } - - if oldIndex, ok := r.offlineIndex[regionID]; ok { - deleteIndex = oldIndex &^ offlinePeerTypeIndex + r.offlineStats[regionID] = region + } else { + delete(r.offlineStats, regionID) } - r.deleteOfflineEntry(deleteIndex, regionID) - r.offlineIndex[regionID] = offlinePeerTypeIndex - + // Remove the info if any of the conditions doesn't meet any more. if oldIndex, ok := r.index[regionID]; ok { deleteIndex = oldIndex &^ peerTypeIndex } @@ -307,9 +271,7 @@ func (r *RegionStatistics) ClearDefunctRegion(regionID uint64) { if oldIndex, ok := r.index[regionID]; ok { r.deleteEntry(oldIndex, regionID) } - if oldIndex, ok := r.offlineIndex[regionID]; ok { - r.deleteOfflineEntry(oldIndex, regionID) - } + delete(r.offlineStats, regionID) } // Collect collects the metrics of the regions' status. @@ -325,13 +287,7 @@ func (r *RegionStatistics) Collect() { regionOversizedRegionCounter.Set(float64(len(r.stats[OversizedRegion]))) regionUndersizedRegionCounter.Set(float64(len(r.stats[UndersizedRegion]))) regionWitnessLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader]))) - - offlineMissPeerRegionCounter.Set(float64(len(r.offlineStats[MissPeer]))) - offlineExtraPeerRegionCounter.Set(float64(len(r.offlineStats[ExtraPeer]))) - offlineDownPeerRegionCounter.Set(float64(len(r.offlineStats[DownPeer]))) - offlinePendingPeerRegionCounter.Set(float64(len(r.offlineStats[PendingPeer]))) - offlineLearnerPeerRegionCounter.Set(float64(len(r.offlineStats[LearnerPeer]))) - offlineOfflinePeerRegionCounter.Set(float64(len(r.offlineStats[OfflinePeer]))) + offlineOfflinePeerRegionCounter.Set(float64(len(r.offlineStats))) } // Reset resets the metrics of the regions' status. @@ -345,12 +301,6 @@ func (r *RegionStatistics) Reset() { regionOversizedRegionCounter.Set(0) regionUndersizedRegionCounter.Set(0) regionWitnessLeaderRegionCounter.Set(0) - - offlineMissPeerRegionCounter.Set(0) - offlineExtraPeerRegionCounter.Set(0) - offlineDownPeerRegionCounter.Set(0) - offlinePendingPeerRegionCounter.Set(0) - offlineLearnerPeerRegionCounter.Set(0) offlineOfflinePeerRegionCounter.Set(0) } diff --git a/pkg/statistics/region_collection_test.go b/pkg/statistics/region_collection_test.go index c8958767e66..50e58d8ea04 100644 --- a/pkg/statistics/region_collection_test.go +++ b/pkg/statistics/region_collection_test.go @@ -71,8 +71,7 @@ func TestRegionStatistics(t *testing.T) { re.Len(regionStats.stats[LearnerPeer], 1) re.Len(regionStats.stats[EmptyRegion], 1) re.Len(regionStats.stats[UndersizedRegion], 1) - re.Len(regionStats.offlineStats[ExtraPeer], 1) - re.Len(regionStats.offlineStats[LearnerPeer], 1) + re.Len(regionStats.offlineStats, 1) region1 = region1.Clone( core.WithDownPeers(downPeers), @@ -88,12 +87,7 @@ func TestRegionStatistics(t *testing.T) { re.Empty(regionStats.stats[EmptyRegion]) re.Len(regionStats.stats[OversizedRegion], 1) re.Empty(regionStats.stats[UndersizedRegion]) - re.Len(regionStats.offlineStats[ExtraPeer], 1) - re.Empty(regionStats.offlineStats[MissPeer]) - re.Len(regionStats.offlineStats[DownPeer], 1) - re.Len(regionStats.offlineStats[PendingPeer], 1) - re.Len(regionStats.offlineStats[LearnerPeer], 1) - re.Len(regionStats.offlineStats[OfflinePeer], 1) + re.Len(regionStats.offlineStats, 1) region2 = region2.Clone(core.WithDownPeers(downPeers[0:1])) regionStats.Observe(region2, stores[0:2]) @@ -104,12 +98,7 @@ func TestRegionStatistics(t *testing.T) { re.Len(regionStats.stats[LearnerPeer], 1) re.Len(regionStats.stats[OversizedRegion], 1) re.Len(regionStats.stats[UndersizedRegion], 1) - re.Len(regionStats.offlineStats[ExtraPeer], 1) - re.Empty(regionStats.offlineStats[MissPeer]) - re.Len(regionStats.offlineStats[DownPeer], 1) - re.Len(regionStats.offlineStats[PendingPeer], 1) - re.Len(regionStats.offlineStats[LearnerPeer], 1) - re.Len(regionStats.offlineStats[OfflinePeer], 1) + re.Len(regionStats.offlineStats, 1) region1 = region1.Clone(core.WithRemoveStorePeer(7)) regionStats.Observe(region1, stores[0:3]) @@ -118,12 +107,7 @@ func TestRegionStatistics(t *testing.T) { re.Len(regionStats.stats[DownPeer], 2) re.Len(regionStats.stats[PendingPeer], 1) re.Empty(regionStats.stats[LearnerPeer]) - re.Empty(regionStats.offlineStats[ExtraPeer]) - re.Empty(regionStats.offlineStats[MissPeer]) - re.Empty(regionStats.offlineStats[DownPeer]) - re.Empty(regionStats.offlineStats[PendingPeer]) - re.Empty(regionStats.offlineStats[LearnerPeer]) - re.Empty(regionStats.offlineStats[OfflinePeer]) + re.Empty(regionStats.offlineStats) store3 = stores[3].Clone(core.UpStore()) stores[3] = store3 diff --git a/server/api/region.go b/server/api/region.go index bfc36d2c38c..92fc61dbc45 100644 --- a/server/api/region.go +++ b/server/api/region.go @@ -542,7 +542,7 @@ func (h *regionsHandler) GetLearnerPeerRegions(w http.ResponseWriter, r *http.Re // @Router /regions/check/offline-peer [get] func (h *regionsHandler) GetOfflinePeerRegions(w http.ResponseWriter, r *http.Request) { handler := h.svr.GetHandler() - regions, err := handler.GetOfflinePeer(statistics.OfflinePeer) + regions, err := handler.GetOfflinePeer() if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index c905cef97dd..cc461b06104 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -2129,12 +2129,12 @@ func (c *RaftCluster) GetRegionStatsByType(typ statistics.RegionStatisticType) [ return c.regionStats.GetRegionStatsByType(typ) } -// GetOfflineRegionStatsByType gets the status of the offline region by types. -func (c *RaftCluster) GetOfflineRegionStatsByType(typ statistics.RegionStatisticType) []*core.RegionInfo { +// GetOfflineRegionStats gets the status of the offline region. +func (c *RaftCluster) GetOfflineRegionStats() []*core.RegionInfo { if c.regionStats == nil { return nil } - return c.regionStats.GetOfflineRegionStatsByType(typ) + return c.regionStats.GetOfflineRegionStats() } // UpdateRegionsLabelLevelStats updates the status of the region label level by types. diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index f735fa30fe0..16f4cbe5ca4 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -1308,13 +1308,13 @@ func TestOfflineAndMerge(t *testing.T) { regions = core.SplitRegions(regions) } heartbeatRegions(re, cluster, regions) - re.Len(cluster.GetOfflineRegionStatsByType(statistics.OfflinePeer), len(regions)) + re.Len(cluster.GetOfflineRegionStats(), len(regions)) // Merge. for i := 0; i < n; i++ { regions = core.MergeRegions(regions) heartbeatRegions(re, cluster, regions) - re.Len(cluster.GetOfflineRegionStatsByType(statistics.OfflinePeer), len(regions)) + re.Len(cluster.GetOfflineRegionStats(), len(regions)) } } diff --git a/server/handler.go b/server/handler.go index 3f56a3b0631..a01d9c3d1c2 100644 --- a/server/handler.go +++ b/server/handler.go @@ -928,12 +928,12 @@ func (h *Handler) GetSchedulerConfigHandler() (http.Handler, error) { } // GetOfflinePeer gets the region with offline peer. -func (h *Handler) GetOfflinePeer(typ statistics.RegionStatisticType) ([]*core.RegionInfo, error) { +func (h *Handler) GetOfflinePeer() ([]*core.RegionInfo, error) { c := h.s.GetRaftCluster() if c == nil { return nil, errs.ErrNotBootstrapped.FastGenByArgs() } - return c.GetOfflineRegionStatsByType(typ), nil + return c.GetOfflineRegionStats(), nil } // ResetTS resets the ts with specified tso. From 102b07f01aa9b49cb414ebd7ba1cd3c476c6d626 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 13 Jul 2023 19:37:37 +0800 Subject: [PATCH 4/9] Refine the naming Signed-off-by: JmPotato --- server/api/region.go | 2 +- server/handler.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/server/api/region.go b/server/api/region.go index 92fc61dbc45..a0dfb928d09 100644 --- a/server/api/region.go +++ b/server/api/region.go @@ -542,7 +542,7 @@ func (h *regionsHandler) GetLearnerPeerRegions(w http.ResponseWriter, r *http.Re // @Router /regions/check/offline-peer [get] func (h *regionsHandler) GetOfflinePeerRegions(w http.ResponseWriter, r *http.Request) { handler := h.svr.GetHandler() - regions, err := handler.GetOfflinePeer() + regions, err := handler.GetOfflinePeerRegions() if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return diff --git a/server/handler.go b/server/handler.go index a01d9c3d1c2..10651da8baf 100644 --- a/server/handler.go +++ b/server/handler.go @@ -927,8 +927,8 @@ func (h *Handler) GetSchedulerConfigHandler() (http.Handler, error) { return mux, nil } -// GetOfflinePeer gets the region with offline peer. -func (h *Handler) GetOfflinePeer() ([]*core.RegionInfo, error) { +// GetOfflinePeerRegions gets the regions with offline peers. +func (h *Handler) GetOfflinePeerRegions() ([]*core.RegionInfo, error) { c := h.s.GetRaftCluster() if c == nil { return nil, errs.ErrNotBootstrapped.FastGenByArgs() From c86e28ea83ff787432cf14ba38a044e9106a2f89 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 13 Jul 2023 20:59:52 +0800 Subject: [PATCH 5/9] Add offline check test Signed-off-by: JmPotato --- pkg/statistics/region_collection.go | 6 +++--- server/api/region_test.go | 14 +++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index de423a459cf..ed3445b5975 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -67,7 +67,7 @@ var ( regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count") regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count") regionWitnessLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count") - offlineOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") + regionOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") ) // RegionInfoWithTS is used to record the extra timestamp status of a region. @@ -287,7 +287,7 @@ func (r *RegionStatistics) Collect() { regionOversizedRegionCounter.Set(float64(len(r.stats[OversizedRegion]))) regionUndersizedRegionCounter.Set(float64(len(r.stats[UndersizedRegion]))) regionWitnessLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader]))) - offlineOfflinePeerRegionCounter.Set(float64(len(r.offlineStats))) + regionOfflinePeerRegionCounter.Set(float64(len(r.offlineStats))) } // Reset resets the metrics of the regions' status. @@ -301,7 +301,7 @@ func (r *RegionStatistics) Reset() { regionOversizedRegionCounter.Set(0) regionUndersizedRegionCounter.Set(0) regionWitnessLeaderRegionCounter.Set(0) - offlineOfflinePeerRegionCounter.Set(0) + regionOfflinePeerRegionCounter.Set(0) } // LabelStatistics is the statistics of the level of labels. diff --git a/server/api/region_test.go b/server/api/region_test.go index 18a5abe78d6..3f20f5ca29f 100644 --- a/server/api/region_test.go +++ b/server/api/region_test.go @@ -151,7 +151,10 @@ func (suite *regionTestSuite) TestRegionCheck() { core.SetApproximateKeys(10), core.SetApproximateSize(10)) downPeer := &metapb.Peer{Id: 13, StoreId: 2} - r = r.Clone(core.WithAddPeer(downPeer), core.WithDownPeers([]*pdpb.PeerStats{{Peer: downPeer, DownSeconds: 3600}}), core.WithPendingPeers([]*metapb.Peer{downPeer})) + r = r.Clone( + core.WithAddPeer(downPeer), + core.WithDownPeers([]*pdpb.PeerStats{{Peer: downPeer, DownSeconds: 3600}}), + core.WithPendingPeers([]*metapb.Peer{downPeer})) re := suite.Require() mustRegionHeartbeat(re, suite.svr, r) url := fmt.Sprintf("%s/region/id/%d", suite.urlPrefix, r.GetID()) @@ -201,6 +204,15 @@ func (suite *regionTestSuite) TestRegionCheck() { suite.NoError(tu.ReadGetJSON(re, testDialClient, url, &r7)) histKeys := []*histItem{{Start: 1000, End: 1999, Count: 1}} suite.Equal(histKeys, r7) + + mustPutStore(re, suite.svr, 2, metapb.StoreState_Offline, metapb.NodeState_Removing, []*metapb.StoreLabel{}) + mustRegionHeartbeat(re, suite.svr, r) + url = fmt.Sprintf("%s/regions/check/%s", suite.urlPrefix, "offline-peer") + r8 := &RegionsInfo{} + suite.NoError(tu.ReadGetJSON(re, testDialClient, url, r8)) + r4.Adjust() + suite.Equal(1, r8.Count) + suite.Equal(r.GetID(), r8.Regions[0].ID) } func (suite *regionTestSuite) TestRegions() { From 31042da8cf4241daac89af733c679f71200741de Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 17 Jul 2023 11:24:39 +0800 Subject: [PATCH 6/9] Use RegionInfoProvider interface Signed-off-by: JmPotato --- pkg/statistics/region_collection.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index ed3445b5975..e4610116b46 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -25,6 +25,12 @@ import ( "github.com/tikv/pd/server/config" ) +// RegionInfoProvider is an interface to provide the region information. +type RegionInfoProvider interface { + // GetRegion returns the region information according to the given region ID. + GetRegion(regionID uint64) *core.RegionInfo +} + // RegionStatisticType represents the type of the region's status. type RegionStatisticType uint32 @@ -80,8 +86,7 @@ type RegionInfoWithTS struct { // RegionStatistics is used to record the status of regions. type RegionStatistics struct { sync.RWMutex - // bc is used to get the region information. - bc *core.BasicCluster + rip RegionInfoProvider conf sc.CheckerConfig stats map[RegionStatisticType]map[uint64]*RegionInfoWithTS // Since we may easily have a large number of offline regions in the scale-in scenario, @@ -95,13 +100,13 @@ type RegionStatistics struct { // NewRegionStatistics creates a new RegionStatistics. func NewRegionStatistics( - bc *core.BasicCluster, + rip RegionInfoProvider, conf sc.CheckerConfig, ruleManager *placement.RuleManager, storeConfigManager *config.StoreConfigManager, ) *RegionStatistics { r := &RegionStatistics{ - bc: bc, + rip: rip, conf: conf, ruleManager: ruleManager, storeConfigManager: storeConfigManager, @@ -122,7 +127,7 @@ func (r *RegionStatistics) GetRegionStatsByType(typ RegionStatisticType) []*core defer r.RUnlock() res := make([]*core.RegionInfo, 0, len(r.stats[typ])) for regionID := range r.stats[typ] { - res = append(res, r.bc.GetRegion(regionID).Clone()) + res = append(res, r.rip.GetRegion(regionID).Clone()) } return res } From cee258a9a53023e208b8fdf7bf7cffbd6e615ace Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 17 Jul 2023 13:00:15 +0800 Subject: [PATCH 7/9] Unify the statistic type Signed-off-by: JmPotato --- pkg/statistics/region_collection.go | 61 ++++++++---------------- pkg/statistics/region_collection_test.go | 8 ++-- server/api/region.go | 2 +- server/cluster/cluster.go | 8 ---- server/cluster/cluster_test.go | 4 +- server/handler.go | 9 ---- 6 files changed, 27 insertions(+), 65 deletions(-) diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index e4610116b46..a5ceff3912b 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -53,6 +53,7 @@ var regionStatisticTypes = []RegionStatisticType{ ExtraPeer, DownPeer, PendingPeer, + OfflinePeer, LearnerPeer, EmptyRegion, OversizedRegion, @@ -73,7 +74,9 @@ var ( regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count") regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count") regionWitnessLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count") - regionOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") + // In order to maintain historical compatibility, we did not replace it with a unified `regionStatusGauge` metrics, + // but kept it so that we don't have to modify the Prometheus query on the Grafana dashboard. + regionOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") ) // RegionInfoWithTS is used to record the extra timestamp status of a region. @@ -86,13 +89,9 @@ type RegionInfoWithTS struct { // RegionStatistics is used to record the status of regions. type RegionStatistics struct { sync.RWMutex - rip RegionInfoProvider - conf sc.CheckerConfig - stats map[RegionStatisticType]map[uint64]*RegionInfoWithTS - // Since we may easily have a large number of offline regions in the scale-in scenario, - // to prevent the lock contention with the heartbeat processing, we use a separate map - // to record the full offline region information here. - offlineStats map[uint64]*core.RegionInfo + rip RegionInfoProvider + conf sc.CheckerConfig + stats map[RegionStatisticType]map[uint64]*RegionInfoWithTS index map[uint64]RegionStatisticType ruleManager *placement.RuleManager storeConfigManager *config.StoreConfigManager @@ -111,7 +110,6 @@ func NewRegionStatistics( ruleManager: ruleManager, storeConfigManager: storeConfigManager, stats: make(map[RegionStatisticType]map[uint64]*RegionInfoWithTS), - offlineStats: make(map[uint64]*core.RegionInfo), index: make(map[uint64]RegionStatisticType), } for _, typ := range regionStatisticTypes { @@ -140,18 +138,6 @@ func (r *RegionStatistics) IsRegionStatsType(regionID uint64, typ RegionStatisti return exist } -// GetOfflineRegionStats gets the status of the offline region. -// The regions here need to be cloned, otherwise, it may cause data race problems. -func (r *RegionStatistics) GetOfflineRegionStats() []*core.RegionInfo { - r.RLock() - defer r.RUnlock() - res := make([]*core.RegionInfo, 0, len(r.offlineStats)) - for _, r := range r.offlineStats { - res = append(res, r.Clone()) - } - return res -} - func (r *RegionStatistics) deleteEntry(deleteIndex RegionStatisticType, regionID uint64) { for typ := RegionStatisticType(1); typ <= deleteIndex; typ <<= 1 { if deleteIndex&typ != 0 { @@ -198,17 +184,6 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store } } } - // Check if the region is in a removing state. - var isRemoving bool - for _, store := range stores { - if store.IsRemoving() { - peer := region.GetStorePeer(store.GetID()) - if peer != nil { - isRemoving = true - break - } - } - } // Better to make sure once any of these conditions changes, it will trigger the heartbeat `save_cache`. // Otherwise, the state may be out-of-date for a long time, which needs another way to apply the change ASAP. // For example, see `RegionStatsNeedUpdate` above to know how `OversizedRegion` and `UndersizedRegion` are updated. @@ -217,6 +192,17 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store ExtraPeer: len(region.GetPeers()) > desiredReplicas, DownPeer: len(region.GetDownPeers()) > 0, PendingPeer: len(region.GetPendingPeers()) > 0, + OfflinePeer: func() bool { + for _, store := range stores { + if store.IsRemoving() { + peer := region.GetStorePeer(store.GetID()) + if peer != nil { + return true + } + } + } + return false + }(), LearnerPeer: len(region.GetLearners()) > 0, EmptyRegion: region.GetApproximateSize() <= core.EmptyRegionApproximateSize, OversizedRegion: region.IsOversized( @@ -255,13 +241,7 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store peerTypeIndex |= typ } } - // Update the offline region info. - if isRemoving { - r.offlineStats[regionID] = region - } else { - delete(r.offlineStats, regionID) - } - // Remove the info if any of the conditions doesn't meet any more. + // Remove the info if any of the conditions are not met any more. if oldIndex, ok := r.index[regionID]; ok { deleteIndex = oldIndex &^ peerTypeIndex } @@ -276,7 +256,6 @@ func (r *RegionStatistics) ClearDefunctRegion(regionID uint64) { if oldIndex, ok := r.index[regionID]; ok { r.deleteEntry(oldIndex, regionID) } - delete(r.offlineStats, regionID) } // Collect collects the metrics of the regions' status. @@ -292,7 +271,7 @@ func (r *RegionStatistics) Collect() { regionOversizedRegionCounter.Set(float64(len(r.stats[OversizedRegion]))) regionUndersizedRegionCounter.Set(float64(len(r.stats[UndersizedRegion]))) regionWitnessLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader]))) - regionOfflinePeerRegionCounter.Set(float64(len(r.offlineStats))) + regionOfflinePeerRegionCounter.Set(float64(len(r.stats[OfflinePeer]))) } // Reset resets the metrics of the regions' status. diff --git a/pkg/statistics/region_collection_test.go b/pkg/statistics/region_collection_test.go index 50e58d8ea04..f767c30fcd4 100644 --- a/pkg/statistics/region_collection_test.go +++ b/pkg/statistics/region_collection_test.go @@ -71,7 +71,7 @@ func TestRegionStatistics(t *testing.T) { re.Len(regionStats.stats[LearnerPeer], 1) re.Len(regionStats.stats[EmptyRegion], 1) re.Len(regionStats.stats[UndersizedRegion], 1) - re.Len(regionStats.offlineStats, 1) + re.Len(regionStats.stats[OfflinePeer], 1) region1 = region1.Clone( core.WithDownPeers(downPeers), @@ -87,7 +87,7 @@ func TestRegionStatistics(t *testing.T) { re.Empty(regionStats.stats[EmptyRegion]) re.Len(regionStats.stats[OversizedRegion], 1) re.Empty(regionStats.stats[UndersizedRegion]) - re.Len(regionStats.offlineStats, 1) + re.Len(regionStats.stats[OfflinePeer], 1) region2 = region2.Clone(core.WithDownPeers(downPeers[0:1])) regionStats.Observe(region2, stores[0:2]) @@ -98,7 +98,7 @@ func TestRegionStatistics(t *testing.T) { re.Len(regionStats.stats[LearnerPeer], 1) re.Len(regionStats.stats[OversizedRegion], 1) re.Len(regionStats.stats[UndersizedRegion], 1) - re.Len(regionStats.offlineStats, 1) + re.Len(regionStats.stats[OfflinePeer], 1) region1 = region1.Clone(core.WithRemoveStorePeer(7)) regionStats.Observe(region1, stores[0:3]) @@ -107,7 +107,7 @@ func TestRegionStatistics(t *testing.T) { re.Len(regionStats.stats[DownPeer], 2) re.Len(regionStats.stats[PendingPeer], 1) re.Empty(regionStats.stats[LearnerPeer]) - re.Empty(regionStats.offlineStats) + re.Empty(regionStats.stats[OfflinePeer]) store3 = stores[3].Clone(core.UpStore()) stores[3] = store3 diff --git a/server/api/region.go b/server/api/region.go index a0dfb928d09..fc2aa4920e8 100644 --- a/server/api/region.go +++ b/server/api/region.go @@ -542,7 +542,7 @@ func (h *regionsHandler) GetLearnerPeerRegions(w http.ResponseWriter, r *http.Re // @Router /regions/check/offline-peer [get] func (h *regionsHandler) GetOfflinePeerRegions(w http.ResponseWriter, r *http.Request) { handler := h.svr.GetHandler() - regions, err := handler.GetOfflinePeerRegions() + regions, err := handler.GetRegionsByType(statistics.OfflinePeer) if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index cc461b06104..7915d7ee097 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -2129,14 +2129,6 @@ func (c *RaftCluster) GetRegionStatsByType(typ statistics.RegionStatisticType) [ return c.regionStats.GetRegionStatsByType(typ) } -// GetOfflineRegionStats gets the status of the offline region. -func (c *RaftCluster) GetOfflineRegionStats() []*core.RegionInfo { - if c.regionStats == nil { - return nil - } - return c.regionStats.GetOfflineRegionStats() -} - // UpdateRegionsLabelLevelStats updates the status of the region label level by types. func (c *RaftCluster) UpdateRegionsLabelLevelStats(regions []*core.RegionInfo) { for _, region := range regions { diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index 16f4cbe5ca4..0d394536b58 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -1308,13 +1308,13 @@ func TestOfflineAndMerge(t *testing.T) { regions = core.SplitRegions(regions) } heartbeatRegions(re, cluster, regions) - re.Len(cluster.GetOfflineRegionStats(), len(regions)) + re.Len(cluster.GetRegionStatsByType(statistics.OfflinePeer), len(regions)) // Merge. for i := 0; i < n; i++ { regions = core.MergeRegions(regions) heartbeatRegions(re, cluster, regions) - re.Len(cluster.GetOfflineRegionStats(), len(regions)) + re.Len(cluster.GetRegionStatsByType(statistics.OfflinePeer), len(regions)) } } diff --git a/server/handler.go b/server/handler.go index 10651da8baf..46199c0851b 100644 --- a/server/handler.go +++ b/server/handler.go @@ -927,15 +927,6 @@ func (h *Handler) GetSchedulerConfigHandler() (http.Handler, error) { return mux, nil } -// GetOfflinePeerRegions gets the regions with offline peers. -func (h *Handler) GetOfflinePeerRegions() ([]*core.RegionInfo, error) { - c := h.s.GetRaftCluster() - if c == nil { - return nil, errs.ErrNotBootstrapped.FastGenByArgs() - } - return c.GetOfflineRegionStats(), nil -} - // ResetTS resets the ts with specified tso. func (h *Handler) ResetTS(ts uint64, ignoreSmaller, skipUpperBoundCheck bool, _ uint32) error { log.Info("reset-ts", From 510a9c836974776575cd207b4ad2a2e339e7aab7 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 17 Jul 2023 14:02:25 +0800 Subject: [PATCH 8/9] Refine the code Signed-off-by: JmPotato --- server/api/region.go | 99 +++++++++++--------------------------------- 1 file changed, 25 insertions(+), 74 deletions(-) diff --git a/server/api/region.go b/server/api/region.go index fc2aa4920e8..db790e10127 100644 --- a/server/api/region.go +++ b/server/api/region.go @@ -455,9 +455,16 @@ func (h *regionsHandler) GetKeyspaceRegions(w http.ResponseWriter, r *http.Reque // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/miss-peer [get] -func (h *regionsHandler) GetMissPeerRegions(w http.ResponseWriter, r *http.Request) { +func (h *regionsHandler) GetMissPeerRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.MissPeer) +} + +func (h *regionsHandler) getRegionsByType( + w http.ResponseWriter, + typ statistics.RegionStatisticType, +) { handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.MissPeer) + regions, err := handler.GetRegionsByType(typ) if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return @@ -472,15 +479,8 @@ func (h *regionsHandler) GetMissPeerRegions(w http.ResponseWriter, r *http.Reque // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/extra-peer [get] -func (h *regionsHandler) GetExtraPeerRegions(w http.ResponseWriter, r *http.Request) { - handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.ExtraPeer) - if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) - return - } - regionsInfo := convertToAPIRegions(regions) - h.rd.JSON(w, http.StatusOK, regionsInfo) +func (h *regionsHandler) GetExtraPeerRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.ExtraPeer) } // @Tags region @@ -489,15 +489,8 @@ func (h *regionsHandler) GetExtraPeerRegions(w http.ResponseWriter, r *http.Requ // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/pending-peer [get] -func (h *regionsHandler) GetPendingPeerRegions(w http.ResponseWriter, r *http.Request) { - handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.PendingPeer) - if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) - return - } - regionsInfo := convertToAPIRegions(regions) - h.rd.JSON(w, http.StatusOK, regionsInfo) +func (h *regionsHandler) GetPendingPeerRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.PendingPeer) } // @Tags region @@ -506,15 +499,8 @@ func (h *regionsHandler) GetPendingPeerRegions(w http.ResponseWriter, r *http.Re // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/down-peer [get] -func (h *regionsHandler) GetDownPeerRegions(w http.ResponseWriter, r *http.Request) { - handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.DownPeer) - if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) - return - } - regionsInfo := convertToAPIRegions(regions) - h.rd.JSON(w, http.StatusOK, regionsInfo) +func (h *regionsHandler) GetDownPeerRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.DownPeer) } // @Tags region @@ -523,15 +509,8 @@ func (h *regionsHandler) GetDownPeerRegions(w http.ResponseWriter, r *http.Reque // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/learner-peer [get] -func (h *regionsHandler) GetLearnerPeerRegions(w http.ResponseWriter, r *http.Request) { - handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.LearnerPeer) - if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) - return - } - regionsInfo := convertToAPIRegions(regions) - h.rd.JSON(w, http.StatusOK, regionsInfo) +func (h *regionsHandler) GetLearnerPeerRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.LearnerPeer) } // @Tags region @@ -540,15 +519,8 @@ func (h *regionsHandler) GetLearnerPeerRegions(w http.ResponseWriter, r *http.Re // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/offline-peer [get] -func (h *regionsHandler) GetOfflinePeerRegions(w http.ResponseWriter, r *http.Request) { - handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.OfflinePeer) - if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) - return - } - regionsInfo := convertToAPIRegions(regions) - h.rd.JSON(w, http.StatusOK, regionsInfo) +func (h *regionsHandler) GetOfflinePeerRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.OfflinePeer) } // @Tags region @@ -557,15 +529,8 @@ func (h *regionsHandler) GetOfflinePeerRegions(w http.ResponseWriter, r *http.Re // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/oversized-region [get] -func (h *regionsHandler) GetOverSizedRegions(w http.ResponseWriter, r *http.Request) { - handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.OversizedRegion) - if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) - return - } - regionsInfo := convertToAPIRegions(regions) - h.rd.JSON(w, http.StatusOK, regionsInfo) +func (h *regionsHandler) GetOverSizedRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.OversizedRegion) } // @Tags region @@ -574,15 +539,8 @@ func (h *regionsHandler) GetOverSizedRegions(w http.ResponseWriter, r *http.Requ // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/undersized-region [get] -func (h *regionsHandler) GetUndersizedRegions(w http.ResponseWriter, r *http.Request) { - handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.UndersizedRegion) - if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) - return - } - regionsInfo := convertToAPIRegions(regions) - h.rd.JSON(w, http.StatusOK, regionsInfo) +func (h *regionsHandler) GetUndersizedRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.UndersizedRegion) } // @Tags region @@ -591,15 +549,8 @@ func (h *regionsHandler) GetUndersizedRegions(w http.ResponseWriter, r *http.Req // @Success 200 {object} RegionsInfo // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /regions/check/empty-region [get] -func (h *regionsHandler) GetEmptyRegions(w http.ResponseWriter, r *http.Request) { - handler := h.svr.GetHandler() - regions, err := handler.GetRegionsByType(statistics.EmptyRegion) - if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) - return - } - regionsInfo := convertToAPIRegions(regions) - h.rd.JSON(w, http.StatusOK, regionsInfo) +func (h *regionsHandler) GetEmptyRegions(w http.ResponseWriter, _ *http.Request) { + h.getRegionsByType(w, statistics.EmptyRegion) } type histItem struct { From b729f303a08befe84da041fb0be19de63987f3df Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 17 Jul 2023 14:26:04 +0800 Subject: [PATCH 9/9] Remove offlineRegionStatusGauge Signed-off-by: JmPotato --- metrics/grafana/pd.json | 7 ------- pkg/statistics/metrics.go | 9 --------- pkg/statistics/region_collection.go | 8 +++----- 3 files changed, 3 insertions(+), 21 deletions(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index f07f4a30584..6ca2829167c 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -796,13 +796,6 @@ "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "B" - }, - { - "expr": "pd_regions_offline_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"offline-peer-region-count\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "C" } ], "thresholds": [ diff --git a/pkg/statistics/metrics.go b/pkg/statistics/metrics.go index bd4c897e258..a5ea07f4f55 100644 --- a/pkg/statistics/metrics.go +++ b/pkg/statistics/metrics.go @@ -41,14 +41,6 @@ var ( Help: "Status of the regions.", }, []string{"type"}) - offlineRegionStatusGauge = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: "pd", - Subsystem: "regions", - Name: "offline_status", - Help: "Status of the offline regions.", - }, []string{"type"}) - clusterStatusGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "pd", @@ -190,7 +182,6 @@ func init() { prometheus.MustRegister(hotCacheStatusGauge) prometheus.MustRegister(storeStatusGauge) prometheus.MustRegister(regionStatusGauge) - prometheus.MustRegister(offlineRegionStatusGauge) prometheus.MustRegister(clusterStatusGauge) prometheus.MustRegister(placementStatusGauge) prometheus.MustRegister(configStatusGauge) diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index a5ceff3912b..17ef8e91602 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -69,14 +69,12 @@ var ( regionExtraPeerRegionCounter = regionStatusGauge.WithLabelValues("extra-peer-region-count") regionDownPeerRegionCounter = regionStatusGauge.WithLabelValues("down-peer-region-count") regionPendingPeerRegionCounter = regionStatusGauge.WithLabelValues("pending-peer-region-count") + regionOfflinePeerRegionCounter = regionStatusGauge.WithLabelValues("offline-peer-region-count") regionLearnerPeerRegionCounter = regionStatusGauge.WithLabelValues("learner-peer-region-count") regionEmptyRegionCounter = regionStatusGauge.WithLabelValues("empty-region-count") regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count") regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count") regionWitnessLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count") - // In order to maintain historical compatibility, we did not replace it with a unified `regionStatusGauge` metrics, - // but kept it so that we don't have to modify the Prometheus query on the Grafana dashboard. - regionOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") ) // RegionInfoWithTS is used to record the extra timestamp status of a region. @@ -266,12 +264,12 @@ func (r *RegionStatistics) Collect() { regionExtraPeerRegionCounter.Set(float64(len(r.stats[ExtraPeer]))) regionDownPeerRegionCounter.Set(float64(len(r.stats[DownPeer]))) regionPendingPeerRegionCounter.Set(float64(len(r.stats[PendingPeer]))) + regionOfflinePeerRegionCounter.Set(float64(len(r.stats[OfflinePeer]))) regionLearnerPeerRegionCounter.Set(float64(len(r.stats[LearnerPeer]))) regionEmptyRegionCounter.Set(float64(len(r.stats[EmptyRegion]))) regionOversizedRegionCounter.Set(float64(len(r.stats[OversizedRegion]))) regionUndersizedRegionCounter.Set(float64(len(r.stats[UndersizedRegion]))) regionWitnessLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader]))) - regionOfflinePeerRegionCounter.Set(float64(len(r.stats[OfflinePeer]))) } // Reset resets the metrics of the regions' status. @@ -280,12 +278,12 @@ func (r *RegionStatistics) Reset() { regionExtraPeerRegionCounter.Set(0) regionDownPeerRegionCounter.Set(0) regionPendingPeerRegionCounter.Set(0) + regionOfflinePeerRegionCounter.Set(0) regionLearnerPeerRegionCounter.Set(0) regionEmptyRegionCounter.Set(0) regionOversizedRegionCounter.Set(0) regionUndersizedRegionCounter.Set(0) regionWitnessLeaderRegionCounter.Set(0) - regionOfflinePeerRegionCounter.Set(0) } // LabelStatistics is the statistics of the level of labels.