metric: migrate all histograms to use prometheus-backed version
In a previous change, a new prometheus-backed histogram library was
introduced to help standardize histogram buckets across the codebase.
This change migrates all existing histograms to use the new library.

related: cockroachdb#85990

Release justification: low risk, high benefit
Release note (ops change): This change introduces a new histogram
implementation that will reduce the total number of buckets and
standardize them across all uses. This should improve the usability
of histograms when exported to a UI (e.g. Grafana).
aadityasondhi committed Aug 23, 2022
1 parent 5658e48 commit 8477d00
Showing 37 changed files with 480 additions and 540 deletions.
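
The diffs below repeat one pattern: callsites that previously built HDR-style histograms with per-callsite tuning (metric.NewHistogram(meta, window, maxVal, sigFigs) or metric.NewLatency(meta, window)) now call metric.NewHistogramV2(meta, window, buckets), where buckets is one of a small set of shared bucket families such as metric.IOLatencyBuckets or metric.DataSizeBuckets. The sketch below is not CockroachDB's metric package; it is a minimal, self-contained illustration of the prometheus-backed idea using the upstream client_golang library, with a made-up bucket family standing in for metric.IOLatencyBuckets.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

// demoLatencyBuckets plays the role of a shared bucket family (e.g.
// metric.IOLatencyBuckets): a fixed list of upper bounds, in nanoseconds,
// reused by every latency histogram instead of each callsite picking its
// own max value and precision. The values here are illustrative only.
var demoLatencyBuckets = prometheus.ExponentialBuckets(100e3, 4, 9) // 100µs .. ~6.5s

func main() {
	hist := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "changefeed_checkpoint_hist_nanos_demo",
		Help:    "Time spent checkpointing changefeed progress (demo).",
		Buckets: demoLatencyBuckets,
	})

	reg := prometheus.NewRegistry()
	reg.MustRegister(hist)

	// Record a few latencies, in nanoseconds; each falls into one of the
	// shared buckets.
	for _, ns := range []float64{400e3, 2e6, 250e6} {
		hist.Observe(ns)
	}

	// Every histogram built this way exports the same small, fixed set of
	// buckets, which is what keeps the series cheap and easy to graph.
	var m dto.Metric
	_ = hist.Write(&m)
	fmt.Println("observations:", m.GetHistogram().GetSampleCount())
	fmt.Println("buckets exported:", len(m.GetHistogram().GetBucket()))
}

Because every callsite draws from the same few families, a dashboard panel built for one latency histogram can be reused for any other without adjusting bucket boundaries.
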
2 changes: 1 addition & 1 deletion pkg/ccl/changefeedccl/changefeed_processors.go
@@ -840,7 +840,7 @@ func (j *jobState) checkpointCompleted(ctx context.Context, checkpointDuration t

j.metrics.CheckpointHistNanos.RecordValue(checkpointDuration.Nanoseconds())
j.lastProgressUpdate = j.ts.Now()
j.checkpointDuration = time.Duration(j.metrics.CheckpointHistNanos.Snapshot().Mean())
j.checkpointDuration = time.Duration(j.metrics.CheckpointHistNanos.Mean())
j.progressUpdatesSkipped = false
}
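
One small behavioral change rides along here: the V2 histogram exposes its mean directly instead of via a point-in-time snapshot. For a prometheus-style histogram that is a natural operation, because the metric already carries a running sum and count. A hedged sketch, reusing the client_golang imports from the example above (meanOf is a hypothetical helper, not an API of either library):

// meanOf computes the mean of everything recorded by h from the histogram's
// running sum and count. An HDR-style histogram has to materialize a bucket
// snapshot to estimate this; a prometheus-backed histogram carries sum and
// count as first-class fields.
func meanOf(h prometheus.Histogram) float64 {
	var m dto.Metric
	if err := h.Write(&m); err != nil {
		return 0
	}
	hd := m.GetHistogram()
	if hd.GetSampleCount() == 0 {
		return 0
	}
	return hd.GetSampleSum() / float64(hd.GetSampleCount())
}
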

46 changes: 20 additions & 26 deletions pkg/ccl/changefeedccl/metrics.go
@@ -417,20 +417,15 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics {
a := &AggMetrics{
ErrorRetries: b.Counter(metaChangefeedErrorRetries),
EmittedMessages: b.Counter(metaChangefeedEmittedMessages),
MessageSize: b.Histogram(metaMessageSize,
histogramWindow, 10<<20 /* 10MB max message size */, 1),
EmittedBytes: b.Counter(metaChangefeedEmittedBytes),
FlushedBytes: b.Counter(metaChangefeedFlushedBytes),
Flushes: b.Counter(metaChangefeedFlushes),

BatchHistNanos: b.Histogram(metaChangefeedBatchHistNanos,
histogramWindow, changefeedBatchHistMaxLatency.Nanoseconds(), 1),
FlushHistNanos: b.Histogram(metaChangefeedFlushHistNanos,
histogramWindow, changefeedFlushHistMaxLatency.Nanoseconds(), 2),
CommitLatency: b.Histogram(metaCommitLatency,
histogramWindow, commitLatencyMaxValue.Nanoseconds(), 1),
AdmitLatency: b.Histogram(metaAdmitLatency, histogramWindow,
admitLatencyMaxValue.Nanoseconds(), 1),
MessageSize: b.Histogram(metaMessageSize, histogramWindow, metric.DataSizeBuckets),
EmittedBytes: b.Counter(metaChangefeedEmittedBytes),
FlushedBytes: b.Counter(metaChangefeedFlushedBytes),
Flushes: b.Counter(metaChangefeedFlushes),

BatchHistNanos: b.Histogram(metaChangefeedBatchHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets),
FlushHistNanos: b.Histogram(metaChangefeedFlushHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets),
CommitLatency: b.Histogram(metaCommitLatency, histogramWindow, metric.BatchProcessLatencyBuckets),
AdmitLatency: b.Histogram(metaAdmitLatency, histogramWindow, metric.BatchProcessLatencyBuckets),
BackfillCount: b.Gauge(metaChangefeedBackfillCount),
BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges),
RunningCount: b.Gauge(metaChangefeedRunning),
@@ -505,7 +500,7 @@ type Metrics struct {
Failures *metric.Counter
ResolvedMessages *metric.Counter
QueueTimeNanos *metric.Counter
CheckpointHistNanos *metric.Histogram
CheckpointHistNanos *metric.HistogramV2
FrontierUpdates *metric.Counter
ThrottleMetrics cdcutils.Metrics
ReplanCount *metric.Counter
@@ -529,17 +524,16 @@ func (m *Metrics) getSLIMetrics(scope string) (*sliMetrics, error) {
// MakeMetrics makes the metrics for changefeed monitoring.
func MakeMetrics(histogramWindow time.Duration) metric.Struct {
m := &Metrics{
AggMetrics: newAggregateMetrics(histogramWindow),
KVFeedMetrics: kvevent.MakeMetrics(histogramWindow),
SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow),
ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages),
Failures: metric.NewCounter(metaChangefeedFailures),
QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
CheckpointHistNanos: metric.NewHistogram(metaChangefeedCheckpointHistNanos, histogramWindow,
changefeedCheckpointHistMaxLatency.Nanoseconds(), 2),
FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
ReplanCount: metric.NewCounter(metaChangefeedReplanCount),
AggMetrics: newAggregateMetrics(histogramWindow),
KVFeedMetrics: kvevent.MakeMetrics(histogramWindow),
SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow),
ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages),
Failures: metric.NewCounter(metaChangefeedFailures),
QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
CheckpointHistNanos: metric.NewHistogramV2(metaChangefeedCheckpointHistNanos, histogramWindow, metric.IOLatencyBuckets),
FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
ReplanCount: metric.NewCounter(metaChangefeedReplanCount),
}

m.mu.resolved = make(map[int]hlc.Timestamp)
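
Most of the churn in this file is the disappearance of per-histogram tuning (a max value plus a significant-figures count) in favor of a handful of named bucket families: metric.IOLatencyBuckets, metric.BatchProcessLatencyBuckets, metric.DataSizeBuckets, and so on. The real boundaries live in the new library; the snippet below only illustrates how such families can be defined once and shared, and its values are invented rather than CockroachDB's:

package metricdemo

import "github.com/prometheus/client_golang/prometheus"

// Illustrative bucket families. Each slice is defined once and reused by
// every histogram of that kind, so dashboards see one consistent layout.
var (
	// Request-level latencies, recorded in nanoseconds: ~100µs to ~6.5s.
	IOLatencyBuckets = prometheus.ExponentialBuckets(100e3, 4, 9)

	// Slower, batch-oriented work, in nanoseconds: ~500ms to a few minutes.
	BatchProcessLatencyBuckets = prometheus.ExponentialBuckets(500e6, 2, 10)

	// Payload sizes in bytes: 1 KiB up to 1 GiB.
	DataSizeBuckets = prometheus.ExponentialBuckets(1024, 4, 11)
)

Defining the families as exported package variables rather than constructor arguments means adding a new histogram cannot silently introduce yet another bucket layout.
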
2 changes: 1 addition & 1 deletion pkg/ccl/sqlproxyccl/connector.go
@@ -76,7 +76,7 @@ type connector struct {

// DialTenantLatency tracks how long it takes to retrieve the address for
// a tenant and set up a tcp connection to the address.
DialTenantLatency *metric.Histogram
DialTenantLatency *metric.HistogramV2

// DialTenantRetries counts how often dialing a tenant is retried.
DialTenantRetries *metric.Counter
10 changes: 7 additions & 3 deletions pkg/ccl/sqlproxyccl/connector_test.go
@@ -375,7 +375,9 @@ func TestConnector_dialTenantCluster(t *testing.T) {
defer cancel()

c := &connector{
DialTenantLatency: metric.NewLatency(metaDialTenantLatency, time.Millisecond),
DialTenantLatency: metric.NewHistogramV2(
metaDialTenantLatency, time.Millisecond, metric.IOLatencyBuckets,
),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
c.testingKnobs.lookupAddr = func(ctx context.Context) (string, error) {
@@ -403,8 +405,10 @@

var reportFailureFnCount int
c := &connector{
TenantID: roachpb.MakeTenantID(42),
DialTenantLatency: metric.NewLatency(metaDialTenantLatency, time.Millisecond),
TenantID: roachpb.MakeTenantID(42),
DialTenantLatency: metric.NewHistogramV2(
metaDialTenantLatency, time.Millisecond, metric.IOLatencyBuckets,
),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
c.DirectoryCache = &testTenantDirectoryCache{
22 changes: 12 additions & 10 deletions pkg/ccl/sqlproxyccl/metrics.go
@@ -24,19 +24,19 @@ type metrics struct {
RoutingErrCount *metric.Counter
RefusedConnCount *metric.Counter
SuccessfulConnCount *metric.Counter
ConnectionLatency *metric.Histogram
ConnectionLatency *metric.HistogramV2
AuthFailedCount *metric.Counter
ExpiredClientConnCount *metric.Counter

DialTenantLatency *metric.Histogram
DialTenantLatency *metric.HistogramV2
DialTenantRetries *metric.Counter

ConnMigrationSuccessCount *metric.Counter
ConnMigrationErrorFatalCount *metric.Counter
ConnMigrationErrorRecoverableCount *metric.Counter
ConnMigrationAttemptedCount *metric.Counter
ConnMigrationAttemptedLatency *metric.Histogram
ConnMigrationTransferResponseMessageSize *metric.Histogram
ConnMigrationAttemptedLatency *metric.HistogramV2
ConnMigrationTransferResponseMessageSize *metric.HistogramV2

QueryCancelReceivedPGWire *metric.Counter
QueryCancelReceivedHTTP *metric.Counter
@@ -224,32 +224,34 @@ func makeProxyMetrics() metrics {
RoutingErrCount: metric.NewCounter(metaRoutingErrCount),
RefusedConnCount: metric.NewCounter(metaRefusedConnCount),
SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount),
ConnectionLatency: metric.NewLatency(
ConnectionLatency: metric.NewHistogramV2(
metaConnMigrationAttemptedCount,
base.DefaultHistogramWindowInterval(),
metric.IOLatencyBuckets,
),
AuthFailedCount: metric.NewCounter(metaAuthFailedCount),
ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount),
// Connector metrics.
DialTenantLatency: metric.NewLatency(
DialTenantLatency: metric.NewHistogramV2(
metaDialTenantLatency,
base.DefaultHistogramWindowInterval(),
metric.NetworkLatencyBuckets,
),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
// Connection migration metrics.
ConnMigrationSuccessCount: metric.NewCounter(metaConnMigrationSuccessCount),
ConnMigrationErrorFatalCount: metric.NewCounter(metaConnMigrationErrorFatalCount),
ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount),
ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount),
ConnMigrationAttemptedLatency: metric.NewLatency(
ConnMigrationAttemptedLatency: metric.NewHistogramV2(
metaConnMigrationAttemptedLatency,
base.DefaultHistogramWindowInterval(),
metric.NetworkLatencyBuckets,
),
ConnMigrationTransferResponseMessageSize: metric.NewHistogram(
ConnMigrationTransferResponseMessageSize: metric.NewHistogramV2(
metaConnMigrationTransferResponseMessageSize,
base.DefaultHistogramWindowInterval(),
maxExpectedTransferResponseMessageSize,
1,
metric.DataSizeBuckets,
),
QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire),
QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP),
18 changes: 9 additions & 9 deletions pkg/ccl/streamingccl/streamingest/metrics.go
@@ -113,9 +113,9 @@ type Metrics struct {
Flushes *metric.Counter
JobProgressUpdates *metric.Counter
ResolvedEvents *metric.Counter
FlushHistNanos *metric.Histogram
CommitLatency *metric.Histogram
AdmitLatency *metric.Histogram
FlushHistNanos *metric.HistogramV2
CommitLatency *metric.HistogramV2
AdmitLatency *metric.HistogramV2
RunningCount *metric.Gauge
EarliestDataCheckpointSpan *metric.Gauge
LatestDataCheckpointSpan *metric.Gauge
@@ -134,12 +134,12 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
Flushes: metric.NewCounter(metaStreamingFlushes),
ResolvedEvents: metric.NewCounter(metaStreamingResolvedEventsIngested),
JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates),
FlushHistNanos: metric.NewHistogram(metaStreamingFlushHistNanos,
histogramWindow, streamingFlushHistMaxLatency.Nanoseconds(), 1),
CommitLatency: metric.NewHistogram(metaStreamingCommitLatency,
histogramWindow, streamingCommitLatencyMaxValue.Nanoseconds(), 1),
AdmitLatency: metric.NewHistogram(metaStreamingAdmitLatency,
histogramWindow, streamingAdmitLatencyMaxValue.Nanoseconds(), 1),
FlushHistNanos: metric.NewHistogramV2(metaStreamingFlushHistNanos,
histogramWindow, metric.BatchProcessLatencyBuckets),
CommitLatency: metric.NewHistogramV2(metaStreamingCommitLatency,
histogramWindow, metric.BatchProcessLatencyBuckets),
AdmitLatency: metric.NewHistogramV2(metaStreamingAdmitLatency,
histogramWindow, metric.BatchProcessLatencyBuckets),
RunningCount: metric.NewGauge(metaStreamsRunning),
EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan),
LatestDataCheckpointSpan: metric.NewGauge(metaLatestDataCheckpointSpan),
4 changes: 2 additions & 2 deletions pkg/kv/bulk/bulk_metrics.go
@@ -19,7 +19,7 @@ import (
// Metrics contains pointers to the metrics for
// monitoring bulk operations.
type Metrics struct {
MaxBytesHist *metric.Histogram
MaxBytesHist *metric.HistogramV2
CurBytesCount *metric.Gauge
}

@@ -50,7 +50,7 @@ const log10int64times1000 = 19 * 1000
// MakeBulkMetrics instantiates the metrics holder for bulk operation monitoring.
func MakeBulkMetrics(histogramWindow time.Duration) Metrics {
return Metrics{
MaxBytesHist: metric.NewHistogram(metaMemMaxBytes, histogramWindow, log10int64times1000, 3),
MaxBytesHist: metric.NewHistogramV2(metaMemMaxBytes, histogramWindow, metric.MemoryUsageBuckets),
CurBytesCount: metric.NewGauge(metaMemCurBytes),
}
}
8 changes: 4 additions & 4 deletions pkg/kv/kvclient/kvcoord/txn_metrics.go
@@ -31,14 +31,14 @@ type TxnMetrics struct {
RefreshMemoryLimitExceeded *metric.Counter
RefreshAutoRetries *metric.Counter

Durations *metric.Histogram
Durations *metric.HistogramV2

TxnsWithCondensedIntents *metric.Counter
TxnsWithCondensedIntentsGauge *metric.Gauge
TxnsRejectedByLockSpanBudget *metric.Counter

// Restarts is the number of times we had to restart the transaction.
Restarts *metric.Histogram
Restarts *metric.HistogramV2

// Counts of restart types.
RestartsWriteTooOld telemetry.CounterWithMetric
@@ -274,11 +274,11 @@ func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics {
RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans),
RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded),
RefreshAutoRetries: metric.NewCounter(metaRefreshAutoRetries),
Durations: metric.NewLatency(metaDurationsHistograms, histogramWindow),
Durations: metric.NewHistogramV2(metaDurationsHistograms, histogramWindow, metric.IOLatencyBuckets),
TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans),
TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge),
TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget),
Restarts: metric.NewHistogram(metaRestartsHistogram, histogramWindow, 100, 3),
Restarts: metric.NewHistogramV2(metaRestartsHistogram, histogramWindow, metric.CountBuckets),
RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld),
RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti),
RestartsSerializable: telemetry.NewCounterWithMetric(metaRestartsSerializable),
20 changes: 12 additions & 8 deletions pkg/kv/kvprober/kvprober.go
@@ -129,10 +129,10 @@ var (
type Metrics struct {
ReadProbeAttempts *metric.Counter
ReadProbeFailures *metric.Counter
ReadProbeLatency *metric.Histogram
ReadProbeLatency *metric.HistogramV2
WriteProbeAttempts *metric.Counter
WriteProbeFailures *metric.Counter
WriteProbeLatency *metric.Histogram
WriteProbeLatency *metric.HistogramV2
ProbePlanAttempts *metric.Counter
ProbePlanFailures *metric.Counter
}
@@ -214,14 +214,18 @@ func NewProber(opts Opts) *Prober {
writePlanner: newMeta2Planner(opts.DB, opts.Settings, func() time.Duration { return writeInterval.Get(&opts.Settings.SV) }),

metrics: Metrics{
ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
ReadProbeLatency: metric.NewLatency(metaReadProbeLatency, opts.HistogramWindowInterval),
ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
ReadProbeLatency: metric.NewHistogramV2(
metaReadProbeLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets,
),
WriteProbeAttempts: metric.NewCounter(metaWriteProbeAttempts),
WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures),
WriteProbeLatency: metric.NewLatency(metaWriteProbeLatency, opts.HistogramWindowInterval),
ProbePlanAttempts: metric.NewCounter(metaProbePlanAttempts),
ProbePlanFailures: metric.NewCounter(metaProbePlanFailures),
WriteProbeLatency: metric.NewHistogramV2(
metaWriteProbeLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets,
),
ProbePlanAttempts: metric.NewCounter(metaProbePlanAttempts),
ProbePlanFailures: metric.NewCounter(metaProbePlanFailures),
},
tracer: opts.Tracer,
}
6 changes: 4 additions & 2 deletions pkg/kv/kvserver/liveness/liveness.go
@@ -144,7 +144,7 @@ type Metrics struct {
HeartbeatSuccesses *metric.Counter
HeartbeatFailures telemetry.CounterWithMetric
EpochIncrements telemetry.CounterWithMetric
HeartbeatLatency *metric.Histogram
HeartbeatLatency *metric.HistogramV2
}

// IsLiveCallback is invoked when a node's IsLive state changes to true.
@@ -309,7 +309,9 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness {
HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures),
EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements),
HeartbeatLatency: metric.NewLatency(metaHeartbeatLatency, opts.HistogramWindowInterval),
HeartbeatLatency: metric.NewHistogramV2(
metaHeartbeatLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets,
),
}
nl.mu.nodes = make(map[roachpb.NodeID]Record)
nl.heartbeatToken <- struct{}{}
(Diff truncated; the remaining changed files are not shown here.)
