diff --git a/pkg/core/metrics.go b/pkg/core/metrics.go
index 4928df8d0979..6f75c70c100b 100644
--- a/pkg/core/metrics.go
+++ b/pkg/core/metrics.go
@@ -40,8 +40,30 @@ var (
 			Help:      "Bucketed histogram of processing count of handle the heartbeat stage.",
 		}, []string{"name"})
 
-	waitLockDurationSum       = HeartbeatBreakdownHandleDurationSum.WithLabelValues("WaitLock")
-	waitLockCount             = HeartbeatBreakdownHandleCount.WithLabelValues("WaitLock")
+	// AcquireRegionsLockWaitDurationSum accumulates, per lock type, the time spent waiting to acquire a regions lock.
+	AcquireRegionsLockWaitDurationSum = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "pd",
+			Subsystem: "core",
+			Name:      "acquire_regions_lock_wait_duration_seconds_sum",
+			Help:      "Total time (s) spent waiting to acquire the regions lock.",
+		}, []string{"type"})
+	// AcquireRegionsLockWaitCount accumulates, per lock type, the number of waits for a regions lock.
+	AcquireRegionsLockWaitCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "pd",
+			Subsystem: "core",
+			Name:      "acquire_regions_lock_wait_duration_seconds_count",
+			Help:      "Total number of waits for acquiring the regions lock.",
+		}, []string{"type"})
+
+	// lock statistics
+	waitRegionsLockDurationSum    = AcquireRegionsLockWaitDurationSum.WithLabelValues("WaitRegionsLock")
+	waitRegionsLockCount          = AcquireRegionsLockWaitCount.WithLabelValues("WaitRegionsLock")
+	waitSubRegionsLockDurationSum = AcquireRegionsLockWaitDurationSum.WithLabelValues("WaitSubRegionsLock")
+	waitSubRegionsLockCount       = AcquireRegionsLockWaitCount.WithLabelValues("WaitSubRegionsLock")
+
+	// heartbeat breakdown statistics
 	preCheckDurationSum       = HeartbeatBreakdownHandleDurationSum.WithLabelValues("PreCheck")
 	preCheckCount             = HeartbeatBreakdownHandleCount.WithLabelValues("PreCheck")
 	asyncHotStatsDurationSum  = HeartbeatBreakdownHandleDurationSum.WithLabelValues("AsyncHotStatsDuration")
@@ -63,6 +85,8 @@ var (
 func init() {
 	prometheus.MustRegister(HeartbeatBreakdownHandleDurationSum)
 	prometheus.MustRegister(HeartbeatBreakdownHandleCount)
+	prometheus.MustRegister(AcquireRegionsLockWaitDurationSum)
+	prometheus.MustRegister(AcquireRegionsLockWaitCount)
 }
 
 type saveCacheStats struct {
diff --git a/pkg/core/region.go b/pkg/core/region.go
index c4c8d53abd9e..6c058fd424f4 100644
--- a/pkg/core/region.go
+++ b/pkg/core/region.go
@@ -1699,23 +1699,34 @@ const magicCount = 15 * time.Second
 
 // CollectWaitLockMetrics collects the metrics of waiting time for lock
 func (r *RegionsInfo) CollectWaitLockMetrics() {
-	sTotalTime := atomic.LoadInt64(&r.t.totalWaitTime)
-	stTotalTime := atomic.LoadInt64(&r.st.totalWaitTime)
-	sLockCount := atomic.LoadInt64(&r.t.lockCount)
-	stLockCount := atomic.LoadInt64(&r.st.lockCount)
-	lastTotalWaitTime := atomic.LoadInt64(&r.t.lastTotalWaitTime) + atomic.LoadInt64(&r.st.lastTotalWaitTime)
-	lastLockCount := atomic.LoadInt64(&r.t.lastLockCount) + atomic.LoadInt64(&r.st.lastLockCount)
-	totalLockCount := sLockCount + stLockCount
-	totalWaitTime := sTotalTime + stTotalTime
-	atomic.StoreInt64(&r.t.lastTotalWaitTime, sTotalTime)
-	atomic.StoreInt64(&r.t.lastLockCount, sLockCount)
-	atomic.StoreInt64(&r.st.lastTotalWaitTime, stTotalTime)
-	atomic.StoreInt64(&r.st.lastLockCount, stLockCount)
-	if lastTotalWaitTime == 0 || lastLockCount == 0 || totalLockCount-lastLockCount < 0 || totalLockCount-lastLockCount > int64(magicCount) {
+	regionsLockTotalWaitTime := atomic.LoadInt64(&r.t.totalWaitTime)
+	regionsLockCount := atomic.LoadInt64(&r.t.lockCount)
+	lastRegionsLockTotalWaitTime := atomic.LoadInt64(&r.t.lastTotalWaitTime)
+	lastRegionsLockCount := atomic.LoadInt64(&r.t.lastLockCount)
+
+	subRegionsLockTotalWaitTime := atomic.LoadInt64(&r.st.totalWaitTime)
+	subRegionsLockCount := atomic.LoadInt64(&r.st.lockCount)
+	lastSubRegionsLockTotalWaitTime := atomic.LoadInt64(&r.st.lastTotalWaitTime)
+	lastSubRegionsLockCount := atomic.LoadInt64(&r.st.lastLockCount)
+
+	// advance the baseline before the guard, or the initial zero state would never be left
+	atomic.StoreInt64(&r.t.lastTotalWaitTime, regionsLockTotalWaitTime)
+	atomic.StoreInt64(&r.t.lastLockCount, regionsLockCount)
+	atomic.StoreInt64(&r.st.lastTotalWaitTime, subRegionsLockTotalWaitTime)
+	atomic.StoreInt64(&r.st.lastLockCount, subRegionsLockCount)
+
+	// skip invalid situation like initial status
+	if lastRegionsLockTotalWaitTime == 0 || lastRegionsLockCount == 0 || lastSubRegionsLockTotalWaitTime == 0 || lastSubRegionsLockCount == 0 ||
+		regionsLockTotalWaitTime-lastRegionsLockTotalWaitTime < 0 || regionsLockCount-lastRegionsLockCount > int64(magicCount) ||
+		subRegionsLockTotalWaitTime-lastSubRegionsLockTotalWaitTime < 0 || subRegionsLockCount-lastSubRegionsLockCount > int64(magicCount) {
 		return
 	}
-	waitLockDurationSum.Add(time.Duration(totalWaitTime - lastTotalWaitTime).Seconds())
-	waitLockCount.Add(float64(totalLockCount - lastLockCount))
+
+	// wait times are tracked as time.Duration ticks, while the metrics are named *_seconds_sum
+	waitRegionsLockDurationSum.Add(time.Duration(regionsLockTotalWaitTime - lastRegionsLockTotalWaitTime).Seconds())
+	waitRegionsLockCount.Add(float64(regionsLockCount - lastRegionsLockCount))
+	waitSubRegionsLockDurationSum.Add(time.Duration(subRegionsLockTotalWaitTime - lastSubRegionsLockTotalWaitTime).Seconds())
+	waitSubRegionsLockCount.Add(float64(subRegionsLockCount - lastSubRegionsLockCount))
 }
 
 // GetAdjacentRegions returns region's info that is adjacent with specific region