Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

statistics: get region info via core cluster inside RegionStatistics #6804

Merged
merged 9 commits into from
Jul 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -796,13 +796,6 @@
"intervalFactor": 2,
"legendFormat": "{{type}}",
"refId": "B"
},
{
"expr": "pd_regions_offline_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"offline-peer-region-count\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{type}}",
"refId": "C"
}
],
"thresholds": [
Expand Down
9 changes: 0 additions & 9 deletions pkg/statistics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,6 @@ var (
Help: "Status of the regions.",
}, []string{"type"})

offlineRegionStatusGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "pd",
Subsystem: "regions",
Name: "offline_status",
Help: "Status of the offline regions.",
}, []string{"type"})

clusterStatusGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "pd",
Expand Down Expand Up @@ -190,7 +182,6 @@ func init() {
prometheus.MustRegister(hotCacheStatusGauge)
prometheus.MustRegister(storeStatusGauge)
prometheus.MustRegister(regionStatusGauge)
prometheus.MustRegister(offlineRegionStatusGauge)
prometheus.MustRegister(clusterStatusGauge)
prometheus.MustRegister(placementStatusGauge)
prometheus.MustRegister(configStatusGauge)
Expand Down
196 changes: 73 additions & 123 deletions pkg/statistics/region_collection.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ import (
"github.com/tikv/pd/server/config"
)

// RegionInfoProvider is an interface to provide the region information.
// RegionStatistics holds one as its rip field and calls it from
// GetRegionStatsByType to turn a recorded region ID into a region object.
type RegionInfoProvider interface {
// GetRegion returns the region information according to the given region ID.
// NOTE(review): GetRegionStatsByType calls Clone on the returned value without
// a nil check — confirm what implementations return for an unknown region ID.
GetRegion(regionID uint64) *core.RegionInfo
}

// RegionStatisticType represents the type of the region's status.
type RegionStatisticType uint32

Expand All @@ -42,84 +48,82 @@ const (
WitnessLeader
)

// regionStatisticTypes lists the RegionStatisticType values for which
// NewRegionStatistics allocates a per-type map of regions.
var regionStatisticTypes = []RegionStatisticType{
MissPeer,
ExtraPeer,
DownPeer,
PendingPeer,
OfflinePeer,
LearnerPeer,
EmptyRegion,
OversizedRegion,
UndersizedRegion,
WitnessLeader,
}

const nonIsolation = "none"

var (
// WithLabelValues is a heavy operation, define variable to avoid call it every time.
regionMissPeerRegionCounter = regionStatusGauge.WithLabelValues("miss-peer-region-count")
regionExtraPeerRegionCounter = regionStatusGauge.WithLabelValues("extra-peer-region-count")
regionDownPeerRegionCounter = regionStatusGauge.WithLabelValues("down-peer-region-count")
regionPendingPeerRegionCounter = regionStatusGauge.WithLabelValues("pending-peer-region-count")
regionLearnerPeerRegionCounter = regionStatusGauge.WithLabelValues("learner-peer-region-count")
regionEmptyRegionCounter = regionStatusGauge.WithLabelValues("empty-region-count")
regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count")
regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count")
regionWitnesssLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count")

offlineMissPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("miss-peer-region-count")
offlineExtraPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("extra-peer-region-count")
offlineDownPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("down-peer-region-count")
offlinePendingPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("pending-peer-region-count")
offlineLearnerPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("learner-peer-region-count")
offlineOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count")
regionMissPeerRegionCounter = regionStatusGauge.WithLabelValues("miss-peer-region-count")
regionExtraPeerRegionCounter = regionStatusGauge.WithLabelValues("extra-peer-region-count")
regionDownPeerRegionCounter = regionStatusGauge.WithLabelValues("down-peer-region-count")
regionPendingPeerRegionCounter = regionStatusGauge.WithLabelValues("pending-peer-region-count")
regionOfflinePeerRegionCounter = regionStatusGauge.WithLabelValues("offline-peer-region-count")
regionLearnerPeerRegionCounter = regionStatusGauge.WithLabelValues("learner-peer-region-count")
regionEmptyRegionCounter = regionStatusGauge.WithLabelValues("empty-region-count")
regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count")
regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count")
regionWitnessLeaderRegionCounter = regionStatusGauge.WithLabelValues("witness-leader-region-count")
)

// RegionInfo is used to record the status of region.
type RegionInfo struct {
*core.RegionInfo
// RegionInfoWithTS is used to record the extra timestamp status of a region.
type RegionInfoWithTS struct {
id uint64
startMissVoterPeerTS int64
startDownPeerTS int64
}

// RegionStatistics is used to record the status of regions.
type RegionStatistics struct {
sync.RWMutex
rip RegionInfoProvider
conf sc.CheckerConfig
stats map[RegionStatisticType]map[uint64]*RegionInfo
offlineStats map[RegionStatisticType]map[uint64]*core.RegionInfo
stats map[RegionStatisticType]map[uint64]*RegionInfoWithTS
index map[uint64]RegionStatisticType
offlineIndex map[uint64]RegionStatisticType
ruleManager *placement.RuleManager
storeConfigManager *config.StoreConfigManager
}

// NewRegionStatistics creates a new RegionStatistics.
func NewRegionStatistics(conf sc.CheckerConfig, ruleManager *placement.RuleManager, storeConfigManager *config.StoreConfigManager) *RegionStatistics {
func NewRegionStatistics(
rip RegionInfoProvider,
conf sc.CheckerConfig,
ruleManager *placement.RuleManager,
storeConfigManager *config.StoreConfigManager,
) *RegionStatistics {
r := &RegionStatistics{
rip: rip,
conf: conf,
ruleManager: ruleManager,
storeConfigManager: storeConfigManager,
stats: make(map[RegionStatisticType]map[uint64]*RegionInfo),
offlineStats: make(map[RegionStatisticType]map[uint64]*core.RegionInfo),
stats: make(map[RegionStatisticType]map[uint64]*RegionInfoWithTS),
index: make(map[uint64]RegionStatisticType),
offlineIndex: make(map[uint64]RegionStatisticType),
}
r.stats[MissPeer] = make(map[uint64]*RegionInfo)
r.stats[ExtraPeer] = make(map[uint64]*RegionInfo)
r.stats[DownPeer] = make(map[uint64]*RegionInfo)
r.stats[PendingPeer] = make(map[uint64]*RegionInfo)
r.stats[LearnerPeer] = make(map[uint64]*RegionInfo)
r.stats[EmptyRegion] = make(map[uint64]*RegionInfo)
r.stats[OversizedRegion] = make(map[uint64]*RegionInfo)
r.stats[UndersizedRegion] = make(map[uint64]*RegionInfo)
r.stats[WitnessLeader] = make(map[uint64]*RegionInfo)

r.offlineStats[MissPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[ExtraPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[DownPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[PendingPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[LearnerPeer] = make(map[uint64]*core.RegionInfo)
r.offlineStats[OfflinePeer] = make(map[uint64]*core.RegionInfo)
for _, typ := range regionStatisticTypes {
r.stats[typ] = make(map[uint64]*RegionInfoWithTS)
}
return r
}

// GetRegionStatsByType gets the status of the region by types. The regions here need to be cloned, otherwise, it may cause data race problems.
// GetRegionStatsByType gets the status of the region by types.
// The regions here need to be cloned, otherwise, it may cause data race problems.
func (r *RegionStatistics) GetRegionStatsByType(typ RegionStatisticType) []*core.RegionInfo {
r.RLock()
defer r.RUnlock()
res := make([]*core.RegionInfo, 0, len(r.stats[typ]))
for _, r := range r.stats[typ] {
res = append(res, r.RegionInfo.Clone())
for regionID := range r.stats[typ] {
res = append(res, r.rip.GetRegion(regionID).Clone())
}
return res
}
Expand All @@ -132,17 +136,6 @@ func (r *RegionStatistics) IsRegionStatsType(regionID uint64, typ RegionStatisti
return exist
}

// GetOfflineRegionStatsByType gets the status of the offline region by types.
// The regions here need to be cloned, otherwise, it may cause data race problems.
//
// It takes the read lock and returns a newly allocated slice holding a clone of
// every region recorded in offlineStats for typ. The slice has length zero when
// nothing is recorded for typ.
func (r *RegionStatistics) GetOfflineRegionStatsByType(typ RegionStatisticType) []*core.RegionInfo {
	r.RLock()
	defer r.RUnlock()
	// Size the result from offlineStats, the map that is actually iterated;
	// sizing it from r.stats reserved a capacity unrelated to the number of
	// offline regions of this type.
	offlineRegions := r.offlineStats[typ]
	res := make([]*core.RegionInfo, 0, len(offlineRegions))
	// A distinct loop variable keeps the receiver r from being shadowed.
	for _, region := range offlineRegions {
		res = append(res, region.Clone())
	}
	return res
}

func (r *RegionStatistics) deleteEntry(deleteIndex RegionStatisticType, regionID uint64) {
for typ := RegionStatisticType(1); typ <= deleteIndex; typ <<= 1 {
if deleteIndex&typ != 0 {
Expand All @@ -151,14 +144,6 @@ func (r *RegionStatistics) deleteEntry(deleteIndex RegionStatisticType, regionID
}
}

// deleteOfflineEntry removes regionID from the offlineStats map of every
// type whose bit is set in deleteIndex.
func (r *RegionStatistics) deleteOfflineEntry(deleteIndex RegionStatisticType, regionID uint64) {
	// Walk the single-bit flags from the lowest one upwards and stop as soon
	// as the flag grows past deleteIndex.
	for flag := RegionStatisticType(1); flag <= deleteIndex; flag <<= 1 {
		if deleteIndex&flag == 0 {
			continue
		}
		delete(r.offlineStats[flag], regionID)
	}
}

// RegionStatsNeedUpdate checks whether the region's status need to be updated
// due to some special state types.
func (r *RegionStatistics) RegionStatsNeedUpdate(region *core.RegionInfo) bool {
Expand All @@ -175,15 +160,13 @@ func (r *RegionStatistics) RegionStatsNeedUpdate(region *core.RegionInfo) bool {
func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.StoreInfo) {
r.Lock()
defer r.Unlock()
// Region state.
regionID := region.GetID()
var (
peerTypeIndex RegionStatisticType
offlinePeerTypeIndex RegionStatisticType
deleteIndex RegionStatisticType
desiredReplicas = r.conf.GetMaxReplicas()
desiredVoters = desiredReplicas
peerTypeIndex RegionStatisticType
deleteIndex RegionStatisticType
)
desiredReplicas := r.conf.GetMaxReplicas()
desiredVoters := desiredReplicas
// Check if the region meets count requirements of its rules.
if r.conf.IsPlacementRulesEnabled() {
if !r.ruleManager.IsInitialized() {
log.Warn("ruleManager haven't been initialized")
Expand All @@ -199,19 +182,6 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store
}
}
}

var isRemoving bool

for _, store := range stores {
if store.IsRemoving() {
peer := region.GetStorePeer(store.GetID())
if peer != nil {
isRemoving = true
break
}
}
}

// Better to make sure once any of these conditions changes, it will trigger the heartbeat `save_cache`.
// Otherwise, the state may be out-of-date for a long time, which needs another way to apply the change ASAP.
// For example, see `RegionStatsNeedUpdate` above to know how `OversizedRegion` and `UndersizedRegion` are updated.
Expand All @@ -220,6 +190,17 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store
ExtraPeer: len(region.GetPeers()) > desiredReplicas,
DownPeer: len(region.GetDownPeers()) > 0,
PendingPeer: len(region.GetPendingPeers()) > 0,
OfflinePeer: func() bool {
for _, store := range stores {
if store.IsRemoving() {
peer := region.GetStorePeer(store.GetID())
if peer != nil {
return true
}
}
}
return false
}(),
LearnerPeer: len(region.GetLearners()) > 0,
EmptyRegion: region.GetApproximateSize() <= core.EmptyRegionApproximateSize,
OversizedRegion: region.IsOversized(
Expand All @@ -232,18 +213,13 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store
),
WitnessLeader: region.GetLeader().GetIsWitness(),
}

// Check if the region meets any of the conditions and update the corresponding info.
regionID := region.GetID()
for typ, c := range conditions {
if c {
if isRemoving && typ < EmptyRegion {
r.offlineStats[typ][regionID] = region
offlinePeerTypeIndex |= typ
}
info := r.stats[typ][regionID]
if info == nil {
info = &RegionInfo{
RegionInfo: region,
}
info = &RegionInfoWithTS{id: regionID}
}
if typ == DownPeer {
if info.startDownPeerTS != 0 {
Expand All @@ -263,18 +239,7 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store
peerTypeIndex |= typ
}
}

if isRemoving {
r.offlineStats[OfflinePeer][regionID] = region
offlinePeerTypeIndex |= OfflinePeer
}

if oldIndex, ok := r.offlineIndex[regionID]; ok {
deleteIndex = oldIndex &^ offlinePeerTypeIndex
}
r.deleteOfflineEntry(deleteIndex, regionID)
r.offlineIndex[regionID] = offlinePeerTypeIndex

// Remove the info if any of the conditions are not met any more.
if oldIndex, ok := r.index[regionID]; ok {
deleteIndex = oldIndex &^ peerTypeIndex
}
Expand All @@ -289,9 +254,6 @@ func (r *RegionStatistics) ClearDefunctRegion(regionID uint64) {
if oldIndex, ok := r.index[regionID]; ok {
r.deleteEntry(oldIndex, regionID)
}
if oldIndex, ok := r.offlineIndex[regionID]; ok {
r.deleteOfflineEntry(oldIndex, regionID)
}
}

// Collect collects the metrics of the regions' status.
Expand All @@ -302,18 +264,12 @@ func (r *RegionStatistics) Collect() {
regionExtraPeerRegionCounter.Set(float64(len(r.stats[ExtraPeer])))
regionDownPeerRegionCounter.Set(float64(len(r.stats[DownPeer])))
regionPendingPeerRegionCounter.Set(float64(len(r.stats[PendingPeer])))
regionOfflinePeerRegionCounter.Set(float64(len(r.stats[OfflinePeer])))
regionLearnerPeerRegionCounter.Set(float64(len(r.stats[LearnerPeer])))
regionEmptyRegionCounter.Set(float64(len(r.stats[EmptyRegion])))
regionOversizedRegionCounter.Set(float64(len(r.stats[OversizedRegion])))
regionUndersizedRegionCounter.Set(float64(len(r.stats[UndersizedRegion])))
regionWitnesssLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader])))

offlineMissPeerRegionCounter.Set(float64(len(r.offlineStats[MissPeer])))
offlineExtraPeerRegionCounter.Set(float64(len(r.offlineStats[ExtraPeer])))
offlineDownPeerRegionCounter.Set(float64(len(r.offlineStats[DownPeer])))
offlinePendingPeerRegionCounter.Set(float64(len(r.offlineStats[PendingPeer])))
offlineLearnerPeerRegionCounter.Set(float64(len(r.offlineStats[LearnerPeer])))
offlineOfflinePeerRegionCounter.Set(float64(len(r.offlineStats[OfflinePeer])))
regionWitnessLeaderRegionCounter.Set(float64(len(r.stats[WitnessLeader])))
}

// Reset resets the metrics of the regions' status.
Expand All @@ -322,18 +278,12 @@ func (r *RegionStatistics) Reset() {
regionExtraPeerRegionCounter.Set(0)
regionDownPeerRegionCounter.Set(0)
regionPendingPeerRegionCounter.Set(0)
regionOfflinePeerRegionCounter.Set(0)
regionLearnerPeerRegionCounter.Set(0)
regionEmptyRegionCounter.Set(0)
regionOversizedRegionCounter.Set(0)
regionUndersizedRegionCounter.Set(0)
regionWitnesssLeaderRegionCounter.Set(0)

offlineMissPeerRegionCounter.Set(0)
offlineExtraPeerRegionCounter.Set(0)
offlineDownPeerRegionCounter.Set(0)
offlinePendingPeerRegionCounter.Set(0)
offlineLearnerPeerRegionCounter.Set(0)
offlineOfflinePeerRegionCounter.Set(0)
regionWitnessLeaderRegionCounter.Set(0)
}

// LabelStatistics is the statistics of the level of labels.
Expand Down
Loading