From 8477d00ddd55ba818a4385aaf28be6376fd77365 Mon Sep 17 00:00:00 2001 From: Aaditya Sondhi Date: Tue, 23 Aug 2022 11:14:25 -0400 Subject: [PATCH] metric: migrate all histograms to use prometheus-backed version In a previous change, a new prometheus-backed histogram library was introduced to help standardize histogram buckets across the codebase. This change migrates all existing histograms to use the new library. related: https://github.com/cockroachdb/cockroach/pull/85990 Release justification: low risk, high benefit Release note (ops change): This change introduces a new histogram implementation that will reduce the total number of buckets and standardize them across all usages. This should help increase the usability of histograms when exported to a UI (e.g. Grafana). --- .../changefeedccl/changefeed_processors.go | 2 +- pkg/ccl/changefeedccl/metrics.go | 46 ++-- pkg/ccl/sqlproxyccl/connector.go | 2 +- pkg/ccl/sqlproxyccl/connector_test.go | 10 +- pkg/ccl/sqlproxyccl/metrics.go | 22 +- pkg/ccl/streamingccl/streamingest/metrics.go | 18 +- pkg/kv/bulk/bulk_metrics.go | 4 +- pkg/kv/kvclient/kvcoord/txn_metrics.go | 8 +- pkg/kv/kvprober/kvprober.go | 20 +- pkg/kv/kvserver/liveness/liveness.go | 6 +- pkg/kv/kvserver/metrics.go | 57 +++-- pkg/kv/kvserver/scheduler.go | 2 +- pkg/kv/kvserver/txnwait/metrics.go | 14 +- pkg/rpc/clock_offset.go | 6 +- pkg/server/node.go | 6 +- pkg/server/status/recorder.go | 20 -- pkg/server/status/recorder_test.go | 6 +- pkg/sql/conn_executor.go | 43 ++-- pkg/sql/execinfra/metrics.go | 12 +- pkg/sql/executor_statement_metrics.go | 18 +- pkg/sql/mem_metrics.go | 12 +- pkg/sql/pgwire/server.go | 14 +- .../sqlstats/persistedsqlstats/provider.go | 2 +- pkg/sql/sqlstats/sslocal/sql_stats.go | 2 +- pkg/sql/sqlstats/sslocal/sslocal_provider.go | 2 +- pkg/sql/ttl/ttljob/ttljob_metrics.go | 10 +- pkg/util/admission/work_queue.go | 7 +- pkg/util/metric/BUILD.bazel | 1 + pkg/util/metric/aggmetric/agg_metric.go | 4 +- pkg/util/metric/aggmetric/agg_metric_test.go | 5 +- pkg/util/metric/aggmetric/histogram.go | 27 +-- pkg/util/metric/histogram_buckets.go | 168 +++++++++++++ pkg/util/metric/histogram_buckets_test.go | 81 +++++++ pkg/util/metric/metric.go | 223 +----------------- pkg/util/metric/metric_test.go | 120 +--------- pkg/util/metric/registry_test.go | 10 +- pkg/util/mon/bytes_usage.go | 10 +- 37 files changed, 480 insertions(+), 540 deletions(-) create mode 100644 pkg/util/metric/histogram_buckets.go create mode 100644 pkg/util/metric/histogram_buckets_test.go diff --git a/pkg/ccl/changefeedccl/changefeed_processors.go b/pkg/ccl/changefeedccl/changefeed_processors.go index bc46b8c98e56..cda1e90c7275 100644 --- a/pkg/ccl/changefeedccl/changefeed_processors.go +++ b/pkg/ccl/changefeedccl/changefeed_processors.go @@ -840,7 +840,7 @@ func (j *jobState) checkpointCompleted(ctx context.Context, checkpointDuration t j.metrics.CheckpointHistNanos.RecordValue(checkpointDuration.Nanoseconds()) j.lastProgressUpdate = j.ts.Now() - j.checkpointDuration = time.Duration(j.metrics.CheckpointHistNanos.Snapshot().Mean()) + j.checkpointDuration = time.Duration(j.metrics.CheckpointHistNanos.Mean()) j.progressUpdatesSkipped = false } diff --git a/pkg/ccl/changefeedccl/metrics.go b/pkg/ccl/changefeedccl/metrics.go index 302babbde4ab..05d86ba243bb 100644 --- a/pkg/ccl/changefeedccl/metrics.go +++ b/pkg/ccl/changefeedccl/metrics.go @@ -417,20 +417,15 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics { a := &AggMetrics{ ErrorRetries: b.Counter(metaChangefeedErrorRetries),
EmittedMessages: b.Counter(metaChangefeedEmittedMessages), - MessageSize: b.Histogram(metaMessageSize, - histogramWindow, 10<<20 /* 10MB max message size */, 1), - EmittedBytes: b.Counter(metaChangefeedEmittedBytes), - FlushedBytes: b.Counter(metaChangefeedFlushedBytes), - Flushes: b.Counter(metaChangefeedFlushes), - - BatchHistNanos: b.Histogram(metaChangefeedBatchHistNanos, - histogramWindow, changefeedBatchHistMaxLatency.Nanoseconds(), 1), - FlushHistNanos: b.Histogram(metaChangefeedFlushHistNanos, - histogramWindow, changefeedFlushHistMaxLatency.Nanoseconds(), 2), - CommitLatency: b.Histogram(metaCommitLatency, - histogramWindow, commitLatencyMaxValue.Nanoseconds(), 1), - AdmitLatency: b.Histogram(metaAdmitLatency, histogramWindow, - admitLatencyMaxValue.Nanoseconds(), 1), + MessageSize: b.Histogram(metaMessageSize, histogramWindow, metric.DataSizeBuckets), + EmittedBytes: b.Counter(metaChangefeedEmittedBytes), + FlushedBytes: b.Counter(metaChangefeedFlushedBytes), + Flushes: b.Counter(metaChangefeedFlushes), + + BatchHistNanos: b.Histogram(metaChangefeedBatchHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets), + FlushHistNanos: b.Histogram(metaChangefeedFlushHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets), + CommitLatency: b.Histogram(metaCommitLatency, histogramWindow, metric.BatchProcessLatencyBuckets), + AdmitLatency: b.Histogram(metaAdmitLatency, histogramWindow, metric.BatchProcessLatencyBuckets), BackfillCount: b.Gauge(metaChangefeedBackfillCount), BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges), RunningCount: b.Gauge(metaChangefeedRunning), @@ -505,7 +500,7 @@ type Metrics struct { Failures *metric.Counter ResolvedMessages *metric.Counter QueueTimeNanos *metric.Counter - CheckpointHistNanos *metric.Histogram + CheckpointHistNanos *metric.HistogramV2 FrontierUpdates *metric.Counter ThrottleMetrics cdcutils.Metrics ReplanCount *metric.Counter @@ -529,17 +524,16 @@ func (m *Metrics) getSLIMetrics(scope string) (*sliMetrics, error) { // MakeMetrics makes the metrics for changefeed monitoring. 
func MakeMetrics(histogramWindow time.Duration) metric.Struct { m := &Metrics{ - AggMetrics: newAggregateMetrics(histogramWindow), - KVFeedMetrics: kvevent.MakeMetrics(histogramWindow), - SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow), - ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages), - Failures: metric.NewCounter(metaChangefeedFailures), - QueueTimeNanos: metric.NewCounter(metaEventQueueTime), - CheckpointHistNanos: metric.NewHistogram(metaChangefeedCheckpointHistNanos, histogramWindow, - changefeedCheckpointHistMaxLatency.Nanoseconds(), 2), - FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates), - ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow), - ReplanCount: metric.NewCounter(metaChangefeedReplanCount), + AggMetrics: newAggregateMetrics(histogramWindow), + KVFeedMetrics: kvevent.MakeMetrics(histogramWindow), + SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow), + ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages), + Failures: metric.NewCounter(metaChangefeedFailures), + QueueTimeNanos: metric.NewCounter(metaEventQueueTime), + CheckpointHistNanos: metric.NewHistogramV2(metaChangefeedCheckpointHistNanos, histogramWindow, metric.IOLatencyBuckets), + FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates), + ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow), + ReplanCount: metric.NewCounter(metaChangefeedReplanCount), } m.mu.resolved = make(map[int]hlc.Timestamp) diff --git a/pkg/ccl/sqlproxyccl/connector.go b/pkg/ccl/sqlproxyccl/connector.go index c6657f4c50bf..b5d6e7253476 100644 --- a/pkg/ccl/sqlproxyccl/connector.go +++ b/pkg/ccl/sqlproxyccl/connector.go @@ -76,7 +76,7 @@ type connector struct { // DialTenantLatency tracks how long it takes to retrieve the address for // a tenant and set up a tcp connection to the address. - DialTenantLatency *metric.Histogram + DialTenantLatency *metric.HistogramV2 // DialTenantRetries counts how often dialing a tenant is retried. 
DialTenantRetries *metric.Counter diff --git a/pkg/ccl/sqlproxyccl/connector_test.go b/pkg/ccl/sqlproxyccl/connector_test.go index 0d8eb941cdef..0ccc61c836a1 100644 --- a/pkg/ccl/sqlproxyccl/connector_test.go +++ b/pkg/ccl/sqlproxyccl/connector_test.go @@ -375,7 +375,9 @@ func TestConnector_dialTenantCluster(t *testing.T) { defer cancel() c := &connector{ - DialTenantLatency: metric.NewLatency(metaDialTenantLatency, time.Millisecond), + DialTenantLatency: metric.NewHistogramV2( + metaDialTenantLatency, time.Millisecond, metric.IOLatencyBuckets, + ), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } c.testingKnobs.lookupAddr = func(ctx context.Context) (string, error) { @@ -403,8 +405,10 @@ func TestConnector_dialTenantCluster(t *testing.T) { var reportFailureFnCount int c := &connector{ - TenantID: roachpb.MakeTenantID(42), - DialTenantLatency: metric.NewLatency(metaDialTenantLatency, time.Millisecond), + TenantID: roachpb.MakeTenantID(42), + DialTenantLatency: metric.NewHistogramV2( + metaDialTenantLatency, time.Millisecond, metric.IOLatencyBuckets, + ), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), } c.DirectoryCache = &testTenantDirectoryCache{ diff --git a/pkg/ccl/sqlproxyccl/metrics.go b/pkg/ccl/sqlproxyccl/metrics.go index cf7abe31f044..2446626e7c16 100644 --- a/pkg/ccl/sqlproxyccl/metrics.go +++ b/pkg/ccl/sqlproxyccl/metrics.go @@ -24,19 +24,19 @@ type metrics struct { RoutingErrCount *metric.Counter RefusedConnCount *metric.Counter SuccessfulConnCount *metric.Counter - ConnectionLatency *metric.Histogram + ConnectionLatency *metric.HistogramV2 AuthFailedCount *metric.Counter ExpiredClientConnCount *metric.Counter - DialTenantLatency *metric.Histogram + DialTenantLatency *metric.HistogramV2 DialTenantRetries *metric.Counter ConnMigrationSuccessCount *metric.Counter ConnMigrationErrorFatalCount *metric.Counter ConnMigrationErrorRecoverableCount *metric.Counter ConnMigrationAttemptedCount *metric.Counter - ConnMigrationAttemptedLatency *metric.Histogram - ConnMigrationTransferResponseMessageSize *metric.Histogram + ConnMigrationAttemptedLatency *metric.HistogramV2 + ConnMigrationTransferResponseMessageSize *metric.HistogramV2 QueryCancelReceivedPGWire *metric.Counter QueryCancelReceivedHTTP *metric.Counter @@ -224,16 +224,18 @@ func makeProxyMetrics() metrics { RoutingErrCount: metric.NewCounter(metaRoutingErrCount), RefusedConnCount: metric.NewCounter(metaRefusedConnCount), SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount), - ConnectionLatency: metric.NewLatency( + ConnectionLatency: metric.NewHistogramV2( metaConnMigrationAttemptedCount, base.DefaultHistogramWindowInterval(), + metric.IOLatencyBuckets, ), AuthFailedCount: metric.NewCounter(metaAuthFailedCount), ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount), // Connector metrics. - DialTenantLatency: metric.NewLatency( + DialTenantLatency: metric.NewHistogramV2( metaDialTenantLatency, base.DefaultHistogramWindowInterval(), + metric.NetworkLatencyBuckets, ), DialTenantRetries: metric.NewCounter(metaDialTenantRetries), // Connection migration metrics. 
@@ -241,15 +243,15 @@ func makeProxyMetrics() metrics { ConnMigrationErrorFatalCount: metric.NewCounter(metaConnMigrationErrorFatalCount), ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount), ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount), - ConnMigrationAttemptedLatency: metric.NewLatency( + ConnMigrationAttemptedLatency: metric.NewHistogramV2( metaConnMigrationAttemptedLatency, base.DefaultHistogramWindowInterval(), + metric.NetworkLatencyBuckets, ), - ConnMigrationTransferResponseMessageSize: metric.NewHistogram( + ConnMigrationTransferResponseMessageSize: metric.NewHistogramV2( metaConnMigrationTransferResponseMessageSize, base.DefaultHistogramWindowInterval(), - maxExpectedTransferResponseMessageSize, - 1, + metric.DataSizeBuckets, ), QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire), QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP), diff --git a/pkg/ccl/streamingccl/streamingest/metrics.go b/pkg/ccl/streamingccl/streamingest/metrics.go index 79b98a6667fb..e109c9a1898d 100644 --- a/pkg/ccl/streamingccl/streamingest/metrics.go +++ b/pkg/ccl/streamingccl/streamingest/metrics.go @@ -113,9 +113,9 @@ type Metrics struct { Flushes *metric.Counter JobProgressUpdates *metric.Counter ResolvedEvents *metric.Counter - FlushHistNanos *metric.Histogram - CommitLatency *metric.Histogram - AdmitLatency *metric.Histogram + FlushHistNanos *metric.HistogramV2 + CommitLatency *metric.HistogramV2 + AdmitLatency *metric.HistogramV2 RunningCount *metric.Gauge EarliestDataCheckpointSpan *metric.Gauge LatestDataCheckpointSpan *metric.Gauge @@ -134,12 +134,12 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct { Flushes: metric.NewCounter(metaStreamingFlushes), ResolvedEvents: metric.NewCounter(metaStreamingResolvedEventsIngested), JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates), - FlushHistNanos: metric.NewHistogram(metaStreamingFlushHistNanos, - histogramWindow, streamingFlushHistMaxLatency.Nanoseconds(), 1), - CommitLatency: metric.NewHistogram(metaStreamingCommitLatency, - histogramWindow, streamingCommitLatencyMaxValue.Nanoseconds(), 1), - AdmitLatency: metric.NewHistogram(metaStreamingAdmitLatency, - histogramWindow, streamingAdmitLatencyMaxValue.Nanoseconds(), 1), + FlushHistNanos: metric.NewHistogramV2(metaStreamingFlushHistNanos, + histogramWindow, metric.BatchProcessLatencyBuckets), + CommitLatency: metric.NewHistogramV2(metaStreamingCommitLatency, + histogramWindow, metric.BatchProcessLatencyBuckets), + AdmitLatency: metric.NewHistogramV2(metaStreamingAdmitLatency, + histogramWindow, metric.BatchProcessLatencyBuckets), RunningCount: metric.NewGauge(metaStreamsRunning), EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan), LatestDataCheckpointSpan: metric.NewGauge(metaLatestDataCheckpointSpan), diff --git a/pkg/kv/bulk/bulk_metrics.go b/pkg/kv/bulk/bulk_metrics.go index d85d5883e103..78646144d745 100644 --- a/pkg/kv/bulk/bulk_metrics.go +++ b/pkg/kv/bulk/bulk_metrics.go @@ -19,7 +19,7 @@ import ( // Metrics contains pointers to the metrics for // monitoring bulk operations. type Metrics struct { - MaxBytesHist *metric.Histogram + MaxBytesHist *metric.HistogramV2 CurBytesCount *metric.Gauge } @@ -50,7 +50,7 @@ const log10int64times1000 = 19 * 1000 // MakeBulkMetrics instantiates the metrics holder for bulk operation monitoring. 
func MakeBulkMetrics(histogramWindow time.Duration) Metrics { return Metrics{ - MaxBytesHist: metric.NewHistogram(metaMemMaxBytes, histogramWindow, log10int64times1000, 3), + MaxBytesHist: metric.NewHistogramV2(metaMemMaxBytes, histogramWindow, metric.MemoryUsageBuckets), CurBytesCount: metric.NewGauge(metaMemCurBytes), } } diff --git a/pkg/kv/kvclient/kvcoord/txn_metrics.go b/pkg/kv/kvclient/kvcoord/txn_metrics.go index 372361a64699..225604f242f2 100644 --- a/pkg/kv/kvclient/kvcoord/txn_metrics.go +++ b/pkg/kv/kvclient/kvcoord/txn_metrics.go @@ -31,14 +31,14 @@ type TxnMetrics struct { RefreshMemoryLimitExceeded *metric.Counter RefreshAutoRetries *metric.Counter - Durations *metric.Histogram + Durations *metric.HistogramV2 TxnsWithCondensedIntents *metric.Counter TxnsWithCondensedIntentsGauge *metric.Gauge TxnsRejectedByLockSpanBudget *metric.Counter // Restarts is the number of times we had to restart the transaction. - Restarts *metric.Histogram + Restarts *metric.HistogramV2 // Counts of restart types. RestartsWriteTooOld telemetry.CounterWithMetric @@ -274,11 +274,11 @@ func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics { RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans), RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded), RefreshAutoRetries: metric.NewCounter(metaRefreshAutoRetries), - Durations: metric.NewLatency(metaDurationsHistograms, histogramWindow), + Durations: metric.NewHistogramV2(metaDurationsHistograms, histogramWindow, metric.IOLatencyBuckets), TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans), TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge), TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget), - Restarts: metric.NewHistogram(metaRestartsHistogram, histogramWindow, 100, 3), + Restarts: metric.NewHistogramV2(metaRestartsHistogram, histogramWindow, metric.CountBuckets), RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld), RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti), RestartsSerializable: telemetry.NewCounterWithMetric(metaRestartsSerializable), diff --git a/pkg/kv/kvprober/kvprober.go b/pkg/kv/kvprober/kvprober.go index e2566a89bc30..726a20912d50 100644 --- a/pkg/kv/kvprober/kvprober.go +++ b/pkg/kv/kvprober/kvprober.go @@ -129,10 +129,10 @@ var ( type Metrics struct { ReadProbeAttempts *metric.Counter ReadProbeFailures *metric.Counter - ReadProbeLatency *metric.Histogram + ReadProbeLatency *metric.HistogramV2 WriteProbeAttempts *metric.Counter WriteProbeFailures *metric.Counter - WriteProbeLatency *metric.Histogram + WriteProbeLatency *metric.HistogramV2 ProbePlanAttempts *metric.Counter ProbePlanFailures *metric.Counter } @@ -214,14 +214,18 @@ func NewProber(opts Opts) *Prober { writePlanner: newMeta2Planner(opts.DB, opts.Settings, func() time.Duration { return writeInterval.Get(&opts.Settings.SV) }), metrics: Metrics{ - ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts), - ReadProbeFailures: metric.NewCounter(metaReadProbeFailures), - ReadProbeLatency: metric.NewLatency(metaReadProbeLatency, opts.HistogramWindowInterval), + ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts), + ReadProbeFailures: metric.NewCounter(metaReadProbeFailures), + ReadProbeLatency: metric.NewHistogramV2( + metaReadProbeLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets, + ), WriteProbeAttempts: 
metric.NewCounter(metaWriteProbeAttempts), WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures), - WriteProbeLatency: metric.NewLatency(metaWriteProbeLatency, opts.HistogramWindowInterval), - ProbePlanAttempts: metric.NewCounter(metaProbePlanAttempts), - ProbePlanFailures: metric.NewCounter(metaProbePlanFailures), + WriteProbeLatency: metric.NewHistogramV2( + metaWriteProbeLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets, + ), + ProbePlanAttempts: metric.NewCounter(metaProbePlanAttempts), + ProbePlanFailures: metric.NewCounter(metaProbePlanFailures), }, tracer: opts.Tracer, } diff --git a/pkg/kv/kvserver/liveness/liveness.go b/pkg/kv/kvserver/liveness/liveness.go index 215a45e6ad22..004d24e3770a 100644 --- a/pkg/kv/kvserver/liveness/liveness.go +++ b/pkg/kv/kvserver/liveness/liveness.go @@ -144,7 +144,7 @@ type Metrics struct { HeartbeatSuccesses *metric.Counter HeartbeatFailures telemetry.CounterWithMetric EpochIncrements telemetry.CounterWithMetric - HeartbeatLatency *metric.Histogram + HeartbeatLatency *metric.HistogramV2 } // IsLiveCallback is invoked when a node's IsLive state changes to true. @@ -309,7 +309,9 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness { HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses), HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures), EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements), - HeartbeatLatency: metric.NewLatency(metaHeartbeatLatency, opts.HistogramWindowInterval), + HeartbeatLatency: metric.NewHistogramV2( + metaHeartbeatLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets, + ), } nl.mu.nodes = make(map[roachpb.NodeID]Record) nl.heartbeatToken <- struct{}{} diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index acb3fcf5189e..41f701bb9ef3 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -1737,15 +1737,15 @@ type StoreMetrics struct { // Raft processing metrics. RaftTicks *metric.Counter - RaftQuotaPoolPercentUsed *metric.Histogram + RaftQuotaPoolPercentUsed *metric.HistogramV2 RaftWorkingDurationNanos *metric.Counter RaftTickingDurationNanos *metric.Counter RaftCommandsApplied *metric.Counter - RaftLogCommitLatency *metric.Histogram - RaftCommandCommitLatency *metric.Histogram - RaftHandleReadyLatency *metric.Histogram - RaftApplyCommittedLatency *metric.Histogram - RaftSchedulerLatency *metric.Histogram + RaftLogCommitLatency *metric.HistogramV2 + RaftCommandCommitLatency *metric.HistogramV2 + RaftHandleReadyLatency *metric.HistogramV2 + RaftApplyCommittedLatency *metric.HistogramV2 + RaftSchedulerLatency *metric.HistogramV2 RaftTimeoutCampaign *metric.Counter // Raft message metrics. @@ -1875,8 +1875,8 @@ type StoreMetrics struct { ReplicaCircuitBreakerCumTripped *metric.Counter // Replica batch evaluation metrics. - ReplicaReadBatchEvaluationLatency *metric.Histogram - ReplicaWriteBatchEvaluationLatency *metric.Histogram + ReplicaReadBatchEvaluationLatency *metric.HistogramV2 + ReplicaWriteBatchEvaluationLatency *metric.HistogramV2 } type tenantMetricsRef struct { @@ -2236,19 +2236,28 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { // Raft processing metrics. RaftTicks: metric.NewCounter(metaRaftTicks), - RaftQuotaPoolPercentUsed: metric.NewHistogram( - // NB: this results in 64 buckets (i.e. 64 timeseries in prometheus). 
- metaRaftQuotaPoolPercentUsed, histogramWindow, 100 /* maxVal */, 1, /* sigFigs */ + RaftQuotaPoolPercentUsed: metric.NewHistogramV2( + metaRaftQuotaPoolPercentUsed, histogramWindow, metric.PercentBuckets, ), - RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos), - RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos), - RaftCommandsApplied: metric.NewCounter(metaRaftCommandsApplied), - RaftLogCommitLatency: metric.NewLatency(metaRaftLogCommitLatency, histogramWindow), - RaftCommandCommitLatency: metric.NewLatency(metaRaftCommandCommitLatency, histogramWindow), - RaftHandleReadyLatency: metric.NewLatency(metaRaftHandleReadyLatency, histogramWindow), - RaftApplyCommittedLatency: metric.NewLatency(metaRaftApplyCommittedLatency, histogramWindow), - RaftSchedulerLatency: metric.NewLatency(metaRaftSchedulerLatency, histogramWindow), - RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), + RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos), + RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos), + RaftCommandsApplied: metric.NewCounter(metaRaftCommandsApplied), + RaftLogCommitLatency: metric.NewHistogramV2( + metaRaftLogCommitLatency, histogramWindow, metric.IOLatencyBuckets, + ), + RaftCommandCommitLatency: metric.NewHistogramV2( + metaRaftCommandCommitLatency, histogramWindow, metric.IOLatencyBuckets, + ), + RaftHandleReadyLatency: metric.NewHistogramV2( + metaRaftHandleReadyLatency, histogramWindow, metric.IOLatencyBuckets, + ), + RaftApplyCommittedLatency: metric.NewHistogramV2( + metaRaftApplyCommittedLatency, histogramWindow, metric.IOLatencyBuckets, + ), + RaftSchedulerLatency: metric.NewHistogramV2( + metaRaftSchedulerLatency, histogramWindow, metric.IOLatencyBuckets, + ), + RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), // Raft message metrics. RaftRcvdMessages: [...]*metric.Counter{ @@ -2386,8 +2395,12 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { ReplicaCircuitBreakerCumTripped: metric.NewCounter(metaReplicaCircuitBreakerCumTripped), // Replica batch evaluation. 
- ReplicaReadBatchEvaluationLatency: metric.NewLatency(metaReplicaReadBatchEvaluationLatency, histogramWindow), - ReplicaWriteBatchEvaluationLatency: metric.NewLatency(metaReplicaWriteBatchEvaluationLatency, histogramWindow), + ReplicaReadBatchEvaluationLatency: metric.NewHistogramV2( + metaReplicaReadBatchEvaluationLatency, histogramWindow, metric.IOLatencyBuckets, + ), + ReplicaWriteBatchEvaluationLatency: metric.NewHistogramV2( + metaReplicaWriteBatchEvaluationLatency, histogramWindow, metric.IOLatencyBuckets, + ), } { diff --git a/pkg/kv/kvserver/scheduler.go b/pkg/kv/kvserver/scheduler.go index dbfdddb843d6..2ed446471219 100644 --- a/pkg/kv/kvserver/scheduler.go +++ b/pkg/kv/kvserver/scheduler.go @@ -169,7 +169,7 @@ type raftScheduleState struct { type raftScheduler struct { ambientContext log.AmbientContext processor raftProcessor - latency *metric.Histogram + latency *metric.HistogramV2 numWorkers int mu struct { diff --git a/pkg/kv/kvserver/txnwait/metrics.go b/pkg/kv/kvserver/txnwait/metrics.go index 1d4c6f2b0653..26d903ffac0b 100644 --- a/pkg/kv/kvserver/txnwait/metrics.go +++ b/pkg/kv/kvserver/txnwait/metrics.go @@ -22,8 +22,8 @@ type Metrics struct { PusherWaiting *metric.Gauge QueryWaiting *metric.Gauge PusherSlow *metric.Gauge - PusherWaitTime *metric.Histogram - QueryWaitTime *metric.Histogram + PusherWaitTime *metric.HistogramV2 + QueryWaitTime *metric.HistogramV2 DeadlocksTotal *metric.Counter } @@ -66,7 +66,7 @@ func NewMetrics(histogramWindowInterval time.Duration) *Metrics { }, ), - PusherWaitTime: metric.NewHistogram( + PusherWaitTime: metric.NewHistogramV2( metric.Metadata{ Name: "txnwaitqueue.pusher.wait_time", Help: "Histogram of durations spent in queue by pushers", @@ -74,11 +74,10 @@ func NewMetrics(histogramWindowInterval time.Duration) *Metrics { Unit: metric.Unit_NANOSECONDS, }, histogramWindowInterval, - time.Hour.Nanoseconds(), - 1, + metric.LongRunningProcessLatencyBuckets, ), - QueryWaitTime: metric.NewHistogram( + QueryWaitTime: metric.NewHistogramV2( metric.Metadata{ Name: "txnwaitqueue.query.wait_time", Help: "Histogram of durations spent in queue by queries", @@ -86,8 +85,7 @@ func NewMetrics(histogramWindowInterval time.Duration) *Metrics { Unit: metric.Unit_NANOSECONDS, }, histogramWindowInterval, - time.Hour.Nanoseconds(), - 1, + metric.LongRunningProcessLatencyBuckets, ), DeadlocksTotal: metric.NewCounter( diff --git a/pkg/rpc/clock_offset.go b/pkg/rpc/clock_offset.go index 90220dd288d3..28c91db8a080 100644 --- a/pkg/rpc/clock_offset.go +++ b/pkg/rpc/clock_offset.go @@ -28,7 +28,7 @@ import ( type RemoteClockMetrics struct { ClockOffsetMeanNanos *metric.Gauge ClockOffsetStdDevNanos *metric.Gauge - LatencyHistogramNanos *metric.Histogram + LatencyHistogramNanos *metric.HistogramV2 } // avgLatencyMeasurementAge determines how to exponentially weight the @@ -122,7 +122,9 @@ func newRemoteClockMonitor( r.metrics = RemoteClockMetrics{ ClockOffsetMeanNanos: metric.NewGauge(metaClockOffsetMeanNanos), ClockOffsetStdDevNanos: metric.NewGauge(metaClockOffsetStdDevNanos), - LatencyHistogramNanos: metric.NewLatency(metaLatencyHistogramNanos, histogramWindowInterval), + LatencyHistogramNanos: metric.NewHistogramV2( + metaLatencyHistogramNanos, histogramWindowInterval, metric.IOLatencyBuckets, + ), } return &r } diff --git a/pkg/server/node.go b/pkg/server/node.go index efb1e83c4acc..8f529bf2fb9e 100644 --- a/pkg/server/node.go +++ b/pkg/server/node.go @@ -164,7 +164,7 @@ var ( ) type nodeMetrics struct { - Latency *metric.Histogram + Latency *metric.HistogramV2 
Success *metric.Counter Err *metric.Counter DiskStalls *metric.Counter @@ -175,7 +175,9 @@ type nodeMetrics struct { func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) nodeMetrics { nm := nodeMetrics{ - Latency: metric.NewLatency(metaExecLatency, histogramWindow), + Latency: metric.NewHistogramV2( + metaExecLatency, histogramWindow, metric.IOLatencyBuckets, + ), Success: metric.NewCounter(metaExecSuccess), Err: metric.NewCounter(metaExecError), DiskStalls: metric.NewCounter(metaDiskStalls), diff --git a/pkg/server/status/recorder.go b/pkg/server/status/recorder.go index 3d6fe5ad18ce..9e301161c702 100644 --- a/pkg/server/status/recorder.go +++ b/pkg/server/status/recorder.go @@ -528,26 +528,6 @@ type registryRecorder struct { func extractValue(name string, mtr interface{}, fn func(string, float64)) error { switch mtr := mtr.(type) { - case *metric.Histogram: - // TODO(mrtracy): Where should this comment go for better - // visibility? - // - // Proper support of Histograms for time series is difficult and - // likely not worth the trouble. Instead, we aggregate a windowed - // histogram at fixed quantiles. If the scraping window and the - // histogram's eviction duration are similar, this should give - // good results; if the two durations are very different, we either - // report stale results or report only the more recent data. - // - // Additionally, we can only aggregate max/min of the quantiles; - // roll-ups don't know that and so they will return mathematically - // nonsensical values, but that seems acceptable for the time - // being. - curr, _ := mtr.Windowed() - for _, pt := range recordHistogramQuantiles { - fn(name+pt.suffix, float64(curr.ValueAtQuantile(pt.quantile))) - } - fn(name+"-count", float64(curr.TotalCount())) case *metric.HistogramV2: n := float64(mtr.TotalCountWindowed()) fn(name+"-count", n) diff --git a/pkg/server/status/recorder_test.go b/pkg/server/status/recorder_test.go index 241a6733dd95..50eafae8df72 100644 --- a/pkg/server/status/recorder_test.go +++ b/pkg/server/status/recorder_test.go @@ -287,7 +287,7 @@ func TestMetricsRecorder(t *testing.T) { c.Inc((data.val)) addExpected(reg.prefix, data.name, reg.source, 100, data.val, reg.isNode) case "histogram": - h := metric.NewHistogram(metric.Metadata{Name: reg.prefix + data.name}, time.Second, 1000, 2) + h := metric.NewHistogramV2(metric.Metadata{Name: reg.prefix + data.name}, time.Second, metric.CountBuckets) reg.reg.AddMetric(h) h.RecordValue(data.val) for _, q := range recordHistogramQuantiles { @@ -295,7 +295,9 @@ func TestMetricsRecorder(t *testing.T) { } addExpected(reg.prefix, data.name+"-count", reg.source, 100, 1, reg.isNode) case "latency": - l := metric.NewLatency(metric.Metadata{Name: reg.prefix + data.name}, time.Hour) + l := metric.NewHistogramV2( + metric.Metadata{Name: reg.prefix + data.name}, time.Hour, metric.IOLatencyBuckets, + ) reg.reg.AddMetric(l) l.RecordValue(data.val) // Latency is simply three histograms (at different resolution diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index e58d459bf902..f8685a278369 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -444,16 +444,21 @@ func makeMetrics(internal bool) Metrics { SQLOptPlanCacheHits: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheHits, internal)), SQLOptPlanCacheMisses: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheMisses, internal)), // TODO(mrtracy): See HistogramWindowInterval in server/config.go for the 6x factor. 
- DistSQLExecLatency: metric.NewLatency(getMetricMeta(MetaDistSQLExecLatency, internal), - 6*metricsSampleInterval), - SQLExecLatency: metric.NewLatency(getMetricMeta(MetaSQLExecLatency, internal), - 6*metricsSampleInterval), - DistSQLServiceLatency: metric.NewLatency(getMetricMeta(MetaDistSQLServiceLatency, internal), - 6*metricsSampleInterval), - SQLServiceLatency: metric.NewLatency(getMetricMeta(MetaSQLServiceLatency, internal), - 6*metricsSampleInterval), - SQLTxnLatency: metric.NewLatency(getMetricMeta(MetaSQLTxnLatency, internal), - 6*metricsSampleInterval), + DistSQLExecLatency: metric.NewHistogramV2( + getMetricMeta(MetaDistSQLExecLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, + ), + SQLExecLatency: metric.NewHistogramV2( + getMetricMeta(MetaSQLExecLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, + ), + DistSQLServiceLatency: metric.NewHistogramV2( + getMetricMeta(MetaDistSQLServiceLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, + ), + SQLServiceLatency: metric.NewHistogramV2( + getMetricMeta(MetaSQLServiceLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, + ), + SQLTxnLatency: metric.NewHistogramV2( + getMetricMeta(MetaSQLTxnLatency, internal), 6*metricsSampleInterval, metric.IOLatencyBuckets, + ), SQLTxnsOpen: metric.NewGauge(getMetricMeta(MetaSQLTxnsOpen, internal)), SQLActiveStatements: metric.NewGauge(getMetricMeta(MetaSQLActiveQueries, internal)), SQLContendedTxns: metric.NewCounter(getMetricMeta(MetaSQLTxnContended, internal)), @@ -477,29 +482,27 @@ func makeMetrics(internal bool) Metrics { func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics { return ServerMetrics{ StatsMetrics: StatsMetrics{ - SQLStatsMemoryMaxBytesHist: metric.NewHistogram( + SQLStatsMemoryMaxBytesHist: metric.NewHistogramV2( MetaSQLStatsMemMaxBytes, cfg.HistogramWindowInterval, - log10int64times1000, - 3, /* sigFigs */ + metric.MemoryUsageBuckets, ), SQLStatsMemoryCurBytesCount: metric.NewGauge(MetaSQLStatsMemCurBytes), - ReportedSQLStatsMemoryMaxBytesHist: metric.NewHistogram( + ReportedSQLStatsMemoryMaxBytesHist: metric.NewHistogramV2( MetaReportedSQLStatsMemMaxBytes, cfg.HistogramWindowInterval, - log10int64times1000, - 3, /* sigFigs */ + metric.MemoryUsageBuckets, ), ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes), DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats), SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted), SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure), - SQLStatsFlushDuration: metric.NewLatency( - MetaSQLStatsFlushDuration, 6*metricsSampleInterval, + SQLStatsFlushDuration: metric.NewHistogramV2( + MetaSQLStatsFlushDuration, 6*metricsSampleInterval, metric.IOLatencyBuckets, ), SQLStatsRemovedRows: metric.NewCounter(MetaSQLStatsRemovedRows), - SQLTxnStatsCollectionOverhead: metric.NewLatency( - MetaSQLTxnStatsCollectionOverhead, 6*metricsSampleInterval, + SQLTxnStatsCollectionOverhead: metric.NewHistogramV2( + MetaSQLTxnStatsCollectionOverhead, 6*metricsSampleInterval, metric.IOLatencyBuckets, ), }, ContentionSubsystemMetrics: txnidcache.NewMetrics(), diff --git a/pkg/sql/execinfra/metrics.go b/pkg/sql/execinfra/metrics.go index b827c8936519..02b2f1b37794 100644 --- a/pkg/sql/execinfra/metrics.go +++ b/pkg/sql/execinfra/metrics.go @@ -26,12 +26,12 @@ type DistSQLMetrics struct { FlowsTotal *metric.Counter FlowsQueued *metric.Gauge FlowsScheduled *metric.Counter - QueueWaitHist *metric.Histogram - MaxBytesHist 
*metric.Histogram + QueueWaitHist *metric.HistogramV2 + MaxBytesHist *metric.HistogramV2 CurBytesCount *metric.Gauge VecOpenFDs *metric.Gauge CurDiskBytesCount *metric.Gauge - MaxDiskBytesHist *metric.Histogram + MaxDiskBytesHist *metric.HistogramV2 QueriesSpilled *metric.Counter SpilledBytesWritten *metric.Counter SpilledBytesRead *metric.Counter @@ -155,12 +155,12 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { FlowsTotal: metric.NewCounter(metaFlowsTotal), FlowsQueued: metric.NewGauge(metaFlowsQueued), FlowsScheduled: metric.NewCounter(metaFlowsScheduled), - QueueWaitHist: metric.NewLatency(metaQueueWaitHist, histogramWindow), - MaxBytesHist: metric.NewHistogram(metaMemMaxBytes, histogramWindow, log10int64times1000, 3), + QueueWaitHist: metric.NewHistogramV2(metaQueueWaitHist, histogramWindow, metric.IOLatencyBuckets), + MaxBytesHist: metric.NewHistogramV2(metaMemMaxBytes, histogramWindow, metric.MemoryUsageBuckets), CurBytesCount: metric.NewGauge(metaMemCurBytes), VecOpenFDs: metric.NewGauge(metaVecOpenFDs), CurDiskBytesCount: metric.NewGauge(metaDiskCurBytes), - MaxDiskBytesHist: metric.NewHistogram(metaDiskMaxBytes, histogramWindow, log10int64times1000, 3), + MaxDiskBytesHist: metric.NewHistogramV2(metaDiskMaxBytes, histogramWindow, metric.MemoryUsageBuckets), QueriesSpilled: metric.NewCounter(metaQueriesSpilled), SpilledBytesWritten: metric.NewCounter(metaSpilledBytesWritten), SpilledBytesRead: metric.NewCounter(metaSpilledBytesRead), diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go index 928e1314bbde..6a5eee1ec9e8 100644 --- a/pkg/sql/executor_statement_metrics.go +++ b/pkg/sql/executor_statement_metrics.go @@ -34,11 +34,11 @@ type EngineMetrics struct { SQLOptPlanCacheHits *metric.Counter SQLOptPlanCacheMisses *metric.Counter - DistSQLExecLatency *metric.Histogram - SQLExecLatency *metric.Histogram - DistSQLServiceLatency *metric.Histogram - SQLServiceLatency *metric.Histogram - SQLTxnLatency *metric.Histogram + DistSQLExecLatency *metric.HistogramV2 + SQLExecLatency *metric.HistogramV2 + DistSQLServiceLatency *metric.HistogramV2 + SQLServiceLatency *metric.HistogramV2 + SQLTxnLatency *metric.HistogramV2 SQLTxnsOpen *metric.Gauge SQLActiveStatements *metric.Gauge SQLContendedTxns *metric.Counter @@ -67,20 +67,20 @@ func (EngineMetrics) MetricStruct() {} // StatsMetrics groups metrics related to SQL Stats collection. type StatsMetrics struct { - SQLStatsMemoryMaxBytesHist *metric.Histogram + SQLStatsMemoryMaxBytesHist *metric.HistogramV2 SQLStatsMemoryCurBytesCount *metric.Gauge - ReportedSQLStatsMemoryMaxBytesHist *metric.Histogram + ReportedSQLStatsMemoryMaxBytesHist *metric.HistogramV2 ReportedSQLStatsMemoryCurBytesCount *metric.Gauge DiscardedStatsCount *metric.Counter SQLStatsFlushStarted *metric.Counter SQLStatsFlushFailure *metric.Counter - SQLStatsFlushDuration *metric.Histogram + SQLStatsFlushDuration *metric.HistogramV2 SQLStatsRemovedRows *metric.Counter - SQLTxnStatsCollectionOverhead *metric.Histogram + SQLTxnStatsCollectionOverhead *metric.HistogramV2 } // StatsMetrics is part of the metric.Struct interface. diff --git a/pkg/sql/mem_metrics.go b/pkg/sql/mem_metrics.go index aa3ce8a6416d..e5cfb743fbac 100644 --- a/pkg/sql/mem_metrics.go +++ b/pkg/sql/mem_metrics.go @@ -19,7 +19,7 @@ import ( // BaseMemoryMetrics contains a max histogram and a current count of the // bytes allocated by a sql endpoint. 
type BaseMemoryMetrics struct { - MaxBytesHist *metric.Histogram + MaxBytesHist *metric.HistogramV2 CurBytesCount *metric.Gauge } @@ -30,9 +30,9 @@ type BaseMemoryMetrics struct { // - "internal" for activities related to leases, schema changes, etc. type MemoryMetrics struct { BaseMemoryMetrics - TxnMaxBytesHist *metric.Histogram + TxnMaxBytesHist *metric.HistogramV2 TxnCurBytesCount *metric.Gauge - SessionMaxBytesHist *metric.Histogram + SessionMaxBytesHist *metric.HistogramV2 SessionCurBytesCount *metric.Gauge } @@ -73,7 +73,7 @@ func MakeBaseMemMetrics(endpoint string, histogramWindow time.Duration) BaseMemo MetaMemMaxBytes := makeMemMetricMetadata(prefix+".max", "Memory usage per sql statement for "+endpoint) MetaMemCurBytes := makeMemMetricMetadata(prefix+".current", "Current sql statement memory usage for "+endpoint) return BaseMemoryMetrics{ - MaxBytesHist: metric.NewHistogram(MetaMemMaxBytes, histogramWindow, log10int64times1000, 3), + MaxBytesHist: metric.NewHistogramV2(MetaMemMaxBytes, histogramWindow, metric.MemoryUsageBuckets), CurBytesCount: metric.NewGauge(MetaMemCurBytes), } } @@ -88,9 +88,9 @@ func MakeMemMetrics(endpoint string, histogramWindow time.Duration) MemoryMetric MetaMemSessionCurBytes := makeMemMetricMetadata(prefix+".session.current", "Current sql session memory usage for "+endpoint) return MemoryMetrics{ BaseMemoryMetrics: base, - TxnMaxBytesHist: metric.NewHistogram(MetaMemMaxTxnBytes, histogramWindow, log10int64times1000, 3), + TxnMaxBytesHist: metric.NewHistogramV2(MetaMemMaxTxnBytes, histogramWindow, metric.MemoryUsageBuckets), TxnCurBytesCount: metric.NewGauge(MetaMemTxnCurBytes), - SessionMaxBytesHist: metric.NewHistogram(MetaMemMaxSessionBytes, histogramWindow, log10int64times1000, 3), + SessionMaxBytesHist: metric.NewHistogramV2(MetaMemMaxSessionBytes, histogramWindow, metric.MemoryUsageBuckets), SessionCurBytesCount: metric.NewGauge(MetaMemSessionCurBytes), } diff --git a/pkg/sql/pgwire/server.go b/pkg/sql/pgwire/server.go index 5757f49777e4..82b103b6fda5 100644 --- a/pkg/sql/pgwire/server.go +++ b/pkg/sql/pgwire/server.go @@ -279,7 +279,7 @@ type ServerMetrics struct { BytesOutCount *metric.Counter Conns *metric.Gauge NewConns *metric.Counter - ConnLatency *metric.Histogram + ConnLatency *metric.HistogramV2 ConnFailures *metric.Counter PGWireCancelTotalCount *metric.Counter PGWireCancelIgnoredCount *metric.Counter @@ -292,11 +292,13 @@ func makeServerMetrics( sqlMemMetrics sql.MemoryMetrics, histogramWindow time.Duration, ) ServerMetrics { return ServerMetrics{ - BytesInCount: metric.NewCounter(MetaBytesIn), - BytesOutCount: metric.NewCounter(MetaBytesOut), - Conns: metric.NewGauge(MetaConns), - NewConns: metric.NewCounter(MetaNewConns), - ConnLatency: metric.NewLatency(MetaConnLatency, histogramWindow), + BytesInCount: metric.NewCounter(MetaBytesIn), + BytesOutCount: metric.NewCounter(MetaBytesOut), + Conns: metric.NewGauge(MetaConns), + NewConns: metric.NewCounter(MetaNewConns), + ConnLatency: metric.NewHistogramV2( + MetaConnLatency, histogramWindow, metric.IOLatencyBuckets, + ), ConnFailures: metric.NewCounter(MetaConnFailures), PGWireCancelTotalCount: metric.NewCounter(MetaPGWireCancelTotal), PGWireCancelIgnoredCount: metric.NewCounter(MetaPGWireCancelIgnored), diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go index 1f3b93913d97..44e4217b3ede 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/provider.go +++ b/pkg/sql/sqlstats/persistedsqlstats/provider.go @@ -45,7 +45,7 @@ type Config 
struct { // Metrics. FlushCounter *metric.Counter - FlushDuration *metric.Histogram + FlushDuration *metric.HistogramV2 FailureCounter *metric.Counter // Testing knobs. diff --git a/pkg/sql/sqlstats/sslocal/sql_stats.go b/pkg/sql/sqlstats/sslocal/sql_stats.go index a5b6ff1cb6c9..c8e1d1680130 100644 --- a/pkg/sql/sqlstats/sslocal/sql_stats.go +++ b/pkg/sql/sqlstats/sslocal/sql_stats.go @@ -75,7 +75,7 @@ func newSQLStats( uniqueStmtFingerprintLimit *settings.IntSetting, uniqueTxnFingerprintLimit *settings.IntSetting, curMemBytesCount *metric.Gauge, - maxMemBytesHist *metric.Histogram, + maxMemBytesHist *metric.HistogramV2, outliersRegistry insights.Registry, parentMon *mon.BytesMonitor, flushTarget Sink, diff --git a/pkg/sql/sqlstats/sslocal/sslocal_provider.go b/pkg/sql/sqlstats/sslocal/sslocal_provider.go index 12a98ba55ec0..2306c148b68c 100644 --- a/pkg/sql/sqlstats/sslocal/sslocal_provider.go +++ b/pkg/sql/sqlstats/sslocal/sslocal_provider.go @@ -37,7 +37,7 @@ func New( maxStmtFingerprints *settings.IntSetting, maxTxnFingerprints *settings.IntSetting, curMemoryBytesCount *metric.Gauge, - maxMemoryBytesHist *metric.Histogram, + maxMemoryBytesHist *metric.HistogramV2, outliersRegistry insights.Registry, pool *mon.BytesMonitor, reportingSink Sink, diff --git a/pkg/sql/ttl/ttljob/ttljob_metrics.go b/pkg/sql/ttl/ttljob/ttljob_metrics.go index ee23a7d70e26..d970d4f6bf38 100644 --- a/pkg/sql/ttl/ttljob/ttljob_metrics.go +++ b/pkg/sql/ttl/ttljob/ttljob_metrics.go @@ -96,7 +96,6 @@ func (m *RowLevelTTLAggMetrics) loadMetrics(labelMetrics bool, relation string) } func makeRowLevelTTLAggMetrics(histogramWindowInterval time.Duration) metric.Struct { - sigFigs := 2 b := aggmetric.MakeBuilder("relation") ret := &RowLevelTTLAggMetrics{ RangeTotalDuration: b.Histogram( @@ -108,8 +107,7 @@ func makeRowLevelTTLAggMetrics(histogramWindowInterval time.Duration) metric.Str MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, histogramWindowInterval, - time.Hour.Nanoseconds(), - sigFigs, + metric.LongRunningProcessLatencyBuckets, ), SelectDuration: b.Histogram( metric.Metadata{ @@ -120,8 +118,7 @@ func makeRowLevelTTLAggMetrics(histogramWindowInterval time.Duration) metric.Str MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, histogramWindowInterval, - time.Minute.Nanoseconds(), - sigFigs, + metric.BatchProcessLatencyBuckets, ), DeleteDuration: b.Histogram( metric.Metadata{ @@ -132,8 +129,7 @@ func makeRowLevelTTLAggMetrics(histogramWindowInterval time.Duration) metric.Str MetricType: io_prometheus_client.MetricType_HISTOGRAM, }, histogramWindowInterval, - time.Minute.Nanoseconds(), - sigFigs, + metric.BatchProcessLatencyBuckets, ), RowSelections: b.Counter( metric.Metadata{ diff --git a/pkg/util/admission/work_queue.go b/pkg/util/admission/work_queue.go index c51e42167bc4..20f9ebee92ed 100644 --- a/pkg/util/admission/work_queue.go +++ b/pkg/util/admission/work_queue.go @@ -1525,7 +1525,7 @@ type WorkQueueMetrics struct { Admitted *metric.Counter Errored *metric.Counter WaitDurationSum *metric.Counter - WaitDurations *metric.Histogram + WaitDurations *metric.HistogramV2 WaitQueueLength *metric.Gauge } @@ -1538,8 +1538,9 @@ func makeWorkQueueMetrics(name string) WorkQueueMetrics { Admitted: metric.NewCounter(addName(name, admittedMeta)), Errored: metric.NewCounter(addName(name, erroredMeta)), WaitDurationSum: metric.NewCounter(addName(name, waitDurationSumMeta)), - WaitDurations: metric.NewLatency( - addName(name, waitDurationsMeta), base.DefaultHistogramWindowInterval()), + WaitDurations: 
metric.NewHistogramV2( + addName(name, waitDurationsMeta), base.DefaultHistogramWindowInterval(), metric.IOLatencyBuckets, + ), WaitQueueLength: metric.NewGauge(addName(name, waitQueueLengthMeta)), } } diff --git a/pkg/util/metric/BUILD.bazel b/pkg/util/metric/BUILD.bazel index 2a74e5daf558..ee62b67b62de 100644 --- a/pkg/util/metric/BUILD.bazel +++ b/pkg/util/metric/BUILD.bazel @@ -8,6 +8,7 @@ go_library( srcs = [ "doc.go", "graphite_exporter.go", + "histogram_buckets.go", "metric.go", "prometheus_exporter.go", "prometheus_rule_exporter.go", diff --git a/pkg/util/metric/aggmetric/agg_metric.go b/pkg/util/metric/aggmetric/agg_metric.go index d3fbb0400f65..c9afb965d64f 100644 --- a/pkg/util/metric/aggmetric/agg_metric.go +++ b/pkg/util/metric/aggmetric/agg_metric.go @@ -51,9 +51,9 @@ func (b Builder) Counter(metadata metric.Metadata) *AggCounter { // Histogram constructs a new AggHistogram with the Builder's labels. func (b Builder) Histogram( - metadata metric.Metadata, duration time.Duration, maxVal int64, sigFigs int, + metadata metric.Metadata, duration time.Duration, buckets []float64, ) *AggHistogram { - return NewHistogram(metadata, duration, maxVal, sigFigs, b.labels...) + return NewHistogram(metadata, duration, buckets, b.labels...) } type childSet struct { diff --git a/pkg/util/metric/aggmetric/agg_metric_test.go b/pkg/util/metric/aggmetric/agg_metric_test.go index 68574822cd5b..aedd611606cc 100644 --- a/pkg/util/metric/aggmetric/agg_metric_test.go +++ b/pkg/util/metric/aggmetric/agg_metric_test.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/cockroachdb/cockroach/pkg/util/metric/aggmetric" + "github.com/prometheus/client_golang/prometheus" prometheusgo "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" ) @@ -64,7 +65,7 @@ func TestAggMetric(t *testing.T) { h := aggmetric.NewHistogram(metric.Metadata{ Name: "histo_gram", - }, base.DefaultHistogramWindowInterval(), 100, 1, "tenant_id") + }, base.DefaultHistogramWindowInterval(), prometheus.DefBuckets, "tenant_id") r.AddMetric(h) tenant2 := roachpb.MakeTenantID(2) @@ -195,7 +196,7 @@ func TestAggMetricBuilder(t *testing.T) { g := b.Gauge(metric.Metadata{Name: "bar_gauge"}) f := b.GaugeFloat64(metric.Metadata{Name: "baz_gauge"}) h := b.Histogram(metric.Metadata{Name: "histo_gram"}, - base.DefaultHistogramWindowInterval(), 100, 1) + base.DefaultHistogramWindowInterval(), prometheus.DefBuckets) for i := 5; i < 10; i++ { tenantLabel := roachpb.MakeTenantID(uint64(i)).String() diff --git a/pkg/util/metric/aggmetric/histogram.go b/pkg/util/metric/aggmetric/histogram.go index e3b710a30c5a..a62cfc50d0fa 100644 --- a/pkg/util/metric/aggmetric/histogram.go +++ b/pkg/util/metric/aggmetric/histogram.go @@ -14,7 +14,6 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/util/metric" - "github.com/codahale/hdrhistogram" io_prometheus_client "github.com/prometheus/client_model/go" ) @@ -23,8 +22,8 @@ import ( // children, while its children are additionally exported to prometheus via the // PrometheusIterable interface. type AggHistogram struct { - h metric.Histogram - create func() *metric.Histogram + h metric.HistogramV2 + create func() *metric.HistogramV2 childSet } @@ -34,14 +33,10 @@ var _ metric.PrometheusExportable = (*AggHistogram)(nil) // NewHistogram constructs a new AggHistogram.
func NewHistogram( - metadata metric.Metadata, - duration time.Duration, - maxVal int64, - sigFigs int, - childLabels ...string, + metadata metric.Metadata, duration time.Duration, buckets []float64, childLabels ...string, ) *AggHistogram { - create := func() *metric.Histogram { - return metric.NewHistogram(metadata, duration, maxVal, sigFigs) + create := func() *metric.HistogramV2 { + return metric.NewHistogramV2(metadata, duration, buckets) } a := &AggHistogram{ h: *create(), @@ -84,11 +79,11 @@ func (a *AggHistogram) ToPrometheusMetric() *io_prometheus_client.Metric { return a.h.ToPrometheusMetric() } -// Windowed returns a copy of the current windowed histogram data and its -// rotation interval. -func (a *AggHistogram) Windowed() (*hdrhistogram.Histogram, time.Duration) { - return a.h.Windowed() -} +//// Windowed returns a copy of the current windowed histogram data and its +//// rotation interval. +//func (a *AggHistogram) Windowed() prometheus.Histogram { +// return a.h.Windowed() +//} // AddChild adds a Counter to this AggCounter. This method panics if a Counter // already exists for this set of labelVals. @@ -109,7 +104,7 @@ func (a *AggHistogram) AddChild(labelVals ...string) *Histogram { type Histogram struct { parent *AggHistogram labelValuesSlice - h metric.Histogram + h metric.HistogramV2 } // ToPrometheusMetric constructs a prometheus metric for this Histogram. diff --git a/pkg/util/metric/histogram_buckets.go b/pkg/util/metric/histogram_buckets.go new file mode 100644 index 000000000000..6259136aeb15 --- /dev/null +++ b/pkg/util/metric/histogram_buckets.go @@ -0,0 +1,168 @@ +package metric + +// IOLatencyBuckets are prometheus histogram buckets suitable for a histogram +// that records a quantity (nanosecond-denominated) in which most measurements +// resemble those of typical disk latencies, i.e. which are in the micro- and +// millisecond range during normal operation. +var IOLatencyBuckets = []float64{ + // Generated via TestHistogramBuckets/IOLatencyBuckets. + 10000.000000, // 10µs + 26826.957953, // 26.826µs + 71968.567300, // 71.968µs + 193069.772888, // 193.069µs + 517947.467923, // 517.947µs + 1389495.494373, // 1.389495ms + 3727593.720315, // 3.727593ms + 10000000.000000, // 9.999999ms + 26826957.952797, // 26.826957ms + 71968567.300115, // 71.968567ms + 193069772.888325, // 193.069772ms + 517947467.923120, // 517.947467ms + 1389495494.373135, // 1.389495494s + 3727593720.314933, // 3.72759372s + 9999999999.999981, // 9.999999999s +} + +// NetworkLatencyBuckets are prometheus histogram buckets suitable for a histogram +// that records a quantity (nanosecond-denominated) in which most measurements +// behave like network latencies, i.e. most measurements are in the ms to sub-second +// range during normal operation. +var NetworkLatencyBuckets = []float64{ + // Generated via TestHistogramBuckets/NetworkLatencyBuckets. 
+ 500000.000000, // 500µs + 860513.842995, // 860.513µs + 1480968.147973, // 1.480968ms + 2548787.184731, // 2.548787ms + 4386533.310619, // 4.386533ms + 7549345.273094, // 7.549345ms + 12992632.226094, // 12.992632ms + 22360679.774998, // 22.360679ms + 38483348.970335, // 38.483348ms + 66230909.027573, // 66.230909ms + 113985228.104760, // 113.985228ms + 196171733.362212, // 196.171733ms + 337616984.325077, // 337.616984ms + 581048177.284016, // 581.048177ms + 999999999.999999, // 999.999999ms +} + +// BatchProcessLatencyBuckets are prometheus histogram buckets suitable for a +// histogram that records a quantity (nanosecond-denominated) in which most +// measurements are in the seconds to minutes range during normal operation. +var BatchProcessLatencyBuckets = []float64{ + // Generated via TestHistogramBuckets/BatchProcessLatencyBuckets. + 500000000.000000, // 500ms + 789604072.059876, // 789.604072ms + 1246949181.227077, // 1.246949181s + 1969192302.297256, // 1.969192302s + 3109764521.125753, // 3.109764521s + 4910965458.056452, // 4.910965458s + 7755436646.853539, // 7.755436646s + 12247448713.915894, // 12.247448713s + 19341270753.704967, // 19.341270753s + 30543892291.876068, // 30.543892291s + 48235163460.447227, // 48.23516346s + 76173362969.685760, // 1m16.173362969s + 120293595166.717728, // 2m0.293595166s + 189968625172.725128, // 3m9.968625172s + 300000000000.000183, // 5m0s +} + +// LongRunningProcessLatencyBuckets are prometheus histogram buckets suitable +// for a histogram that records a quantity (nanosecond-denominated) for +// long-running processes (multiple minutes). +var LongRunningProcessLatencyBuckets = []float64{ + // Generated via TestHistogramBuckets/LongRunningProcessLatencyBuckets. + 500000000.000000, // 500ms + 942961049.923126, // 942.961049ms + 1778351083.344248, // 1.778351083s + 3353831609.364442, // 3.353831609s + 6325065151.263324, // 6.325065151s + 11928580151.734879, // 11.928580151s + 22496372927.944168, // 22.496372927s + 42426406871.192848, // 42.426406871s + 80012898335.451462, // 1m20.012898335s + 150898093243.579315, // 2m30.898093243s + 284582048872.726685, // 4m44.582048872s + 536699575188.601318, // 8m56.699575188s + 1012173589826.278687, // 16m52.173589826s + 1908880541934.094238, // 31m48.880541934s + 3599999999999.998535, // 59m59.999999999s +} + +// CountBuckets are prometheus histogram buckets suitable for a histogram that +// records a quantity that is a count (unit-less) in which most measurements are +// in the 1 to ~1000 range during normal operation. +var CountBuckets = []float64{ + // Generated via TestHistogramBuckets/CountBuckets. + 1.000000, + 2.000000, + 4.000000, + 8.000000, + 16.000000, + 32.000000, + 64.000000, + 128.000000, + 256.000000, + 512.000000, + 1024.000000, +} + +// PercentBuckets are prometheus histogram buckets suitable for a histogram that +// records a percent quantity [0,100] +var PercentBuckets = []float64{ + // Generated via TestHistogramBuckets/PercentBuckets. + 10.000000, + 20.000000, + 30.000000, + 40.000000, + 50.000000, + 60.000000, + 70.000000, + 80.000000, + 90.000000, + 100.000000, +} + +// DataSizeBuckets are prometheus histogram buckets suitable for a histogram that +// records a quantity that is a size (byte-denominated) in which most measurements are +// in the kB to MB range during normal operation. +var DataSizeBuckets = []float64{ + // Generated via TestHistogramBuckets/DataSizeBuckets. 
+ 1000.000000, // 1.0 kB + 2000.000000, // 2.0 kB + 4000.000000, // 4.0 kB + 8000.000000, // 8.0 kB + 16000.000000, // 16 kB + 32000.000000, // 32 kB + 64000.000000, // 64 kB + 128000.000000, // 128 kB + 256000.000000, // 256 kB + 512000.000000, // 512 kB + 1024000.000000, // 1.0 MB + 2048000.000000, // 2.0 MB + 4096000.000000, // 4.1 MB + 8192000.000000, // 8.2 MB + 16384000.000000, // 16 MB +} + +// MemoryUsageBuckets are prometheus histogram buckets suitable for a histogram that +// records memory usage (in Bytes) +var MemoryUsageBuckets = []float64{ + // Generated via TestHistogramBuckets/MemoryUsageBuckets. + 1.000000, // 1 B + 2.021274, // 2 B + 4.085550, // 4 B + 8.258017, // 8 B + 16.691718, // 16 B + 33.738540, // 33 B + 68.194844, // 68 B + 137.840488, // 137 B + 278.613437, // 278 B + 563.154184, // 563 B + 1138.289087, // 1.1 kB + 2300.794494, // 2.3 kB + 4650.536813, // 4.7 kB + 9400.010609, // 9.4 kB + 19000.000000, // 19 kB +} diff --git a/pkg/util/metric/histogram_buckets_test.go b/pkg/util/metric/histogram_buckets_test.go new file mode 100644 index 000000000000..ff83280ca137 --- /dev/null +++ b/pkg/util/metric/histogram_buckets_test.go @@ -0,0 +1,81 @@ +package metric + +import ( + "fmt" + "strings" + "testing" + "time" + + "github.com/dustin/go-humanize" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" +) + +const LATENCY = "LATENCY" +const SIZE = "SIZE" + +const log10int64times1000 = 19 * 1000 + +// TestHistogramBuckets is used to generate additional prometheus buckets to be +// used with HistogramV2. Please include obs-inf in the review process of new +// buckets. +func TestHistogramBuckets(t *testing.T) { + verifyAndPrint := func(t *testing.T, exp, act []float64, histType string) { + t.Helper() + var buf strings.Builder + for idx, f := range exp { + if idx == 0 { + fmt.Fprintf(&buf, "// Generated via %s.", t.Name()) + } + switch histType { + case LATENCY: + fmt.Fprintf(&buf, "\n%f, // %s", f, time.Duration(f)) + case SIZE: + fmt.Fprintf(&buf, "\n%f, // %s", f, humanize.Bytes(uint64(f))) + default: + fmt.Fprintf(&buf, "\n%f,", f) + } + } + t.Logf("%s", &buf) + require.InDeltaSlice(t, exp, act, 1 /* delta */, "Please update the bucket boundaries for %s", t.Name()) + } + t.Run("IOLatencyBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 15) + verifyAndPrint(t, exp, IOLatencyBuckets, LATENCY) + }) + + t.Run("NetworkLatencyBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 15) + verifyAndPrint(t, exp, NetworkLatencyBuckets, LATENCY) + }) + + t.Run("BatchProcessLatencyBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBucketsRange(500e6, 300e9, 15) + verifyAndPrint(t, exp, BatchProcessLatencyBuckets, LATENCY) + }) + + t.Run("LongRunningProcessLatencyBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBucketsRange(500e6, 3600e9, 15) + verifyAndPrint(t, exp, LongRunningProcessLatencyBuckets, LATENCY) + }) + + t.Run("CountBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBuckets(1, 2, 11) + verifyAndPrint(t, exp, CountBuckets, "") + }) + + t.Run("PercentBuckets", func(t *testing.T) { + exp := prometheus.LinearBuckets(10, 10, 10) + verifyAndPrint(t, exp, PercentBuckets, "") + }) + + t.Run("DataSizeBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBuckets(1e3, 2, 15) + verifyAndPrint(t, exp, DataSizeBuckets, SIZE) + }) + + t.Run("MemoryUsageBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBucketsRange(1, 
+		exp := prometheus.ExponentialBucketsRange(1, log10int64times1000, 15)
+		verifyAndPrint(t, exp, MemoryUsageBuckets, SIZE)
+	})
+}
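For orientation, a minimal usage sketch of the API these bucket lists feed into (illustrative only; the metric name and the elapsed variable are hypothetical, not part of this change):

	hist := metric.NewHistogramV2(
		metric.Metadata{Name: "request.latency.nanos"}, // hypothetical metric
		10*time.Second,          // window rotation interval
		metric.IOLatencyBuckets, // nanosecond-denominated latency buckets
	)
	hist.RecordValue(elapsed.Nanoseconds()) // elapsed is a time.Duration
	mean := hist.Mean()                     // cumulative mean, added by this patch
	_ = mean

The choice of bucket list is the only per-metric decision; the boundaries themselves come from the generated lists above.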
diff --git a/pkg/util/metric/metric.go b/pkg/util/metric/metric.go
index 4ce6eda2840e..88ed6153a85e 100644
--- a/pkg/util/metric/metric.go
+++ b/pkg/util/metric/metric.go
@@ -19,7 +19,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
-	"github.com/codahale/hdrhistogram"
 	"github.com/gogo/protobuf/proto"
 	"github.com/prometheus/client_golang/prometheus"
 	prometheusgo "github.com/prometheus/client_model/go"
@@ -135,7 +134,6 @@ func (m *Metadata) AddLabel(name, value string) {
 var _ Iterable = &Gauge{}
 var _ Iterable = &GaugeFloat64{}
 var _ Iterable = &Counter{}
-var _ Iterable = &Histogram{}

 var _ json.Marshaler = &Gauge{}
 var _ json.Marshaler = &GaugeFloat64{}
@@ -145,7 +143,6 @@ var _ json.Marshaler = &Registry{}
 var _ PrometheusExportable = &Gauge{}
 var _ PrometheusExportable = &GaugeFloat64{}
 var _ PrometheusExportable = &Counter{}
-var _ PrometheusExportable = &Histogram{}

 type periodic interface {
 	nextTick() time.Time
@@ -164,228 +161,21 @@ func TestingSetNow(f func() time.Time) func() {
 	}
 }

-func cloneHistogram(in *hdrhistogram.Histogram) *hdrhistogram.Histogram {
-	return hdrhistogram.Import(in.Export())
-}
-
 func maybeTick(m periodic) {
 	for m.nextTick().Before(now()) {
 		m.tick()
 	}
 }

-// A Histogram collects observed values by keeping bucketed counts. For
-// convenience, internally two sets of buckets are kept: A cumulative set (i.e.
-// data is never evicted) and a windowed set (which keeps only recently
-// collected samples).
-//
-// Top-level methods generally apply to the cumulative buckets; the windowed
-// variant is exposed through the Windowed method.
-type Histogram struct {
-	Metadata
-	maxVal int64
-	mu     struct {
-		syncutil.Mutex
-		cumulative *hdrhistogram.Histogram
-		*tickHelper
-		sliding *hdrhistogram.WindowedHistogram
-	}
-}
-
-// NewHistogram initializes a given Histogram. The contained windowed histogram
-// rotates every 'duration'; both the windowed and the cumulative histogram
-// track nonnegative values up to 'maxVal' with 'sigFigs' decimal points of
-// precision.
-func NewHistogram(metadata Metadata, duration time.Duration, maxVal int64, sigFigs int) *Histogram {
-	h := &Histogram{
-		Metadata: metadata,
-		maxVal:   maxVal,
-	}
-	wHist := hdrhistogram.NewWindowed(histWrapNum, 0, maxVal, sigFigs)
-	h.mu.cumulative = hdrhistogram.New(0, maxVal, sigFigs)
-	h.mu.sliding = wHist
-	h.mu.tickHelper = &tickHelper{
-		nextT:        now(),
-		tickInterval: duration / histWrapNum,
-		onTick: func() {
-			wHist.Rotate()
-		},
-	}
-	return h
-}
-
-// NewLatency is a convenience function which returns a histogram with
-// suitable defaults for latency tracking. Values are expressed in ns,
-// are truncated into the interval [0, MaxLatency] and are recorded
-// with one digit of precision (i.e. errors of <10ms at 100ms, <6s at 60s).
-//
-// The windowed portion of the Histogram retains values for approximately
-// histogramWindow.
-func NewLatency(metadata Metadata, histogramWindow time.Duration) *Histogram {
-	return NewHistogram(
-		metadata, histogramWindow, MaxLatency.Nanoseconds(), 1,
-	)
-}
-
-// Windowed returns a copy of the current windowed histogram data and its
-// rotation interval.
-func (h *Histogram) Windowed() (*hdrhistogram.Histogram, time.Duration) {
-	h.mu.Lock()
-	defer h.mu.Unlock()
-	maybeTick(h.mu.tickHelper)
-	// TODO(obs-inf): not sure we should multiply by histWrapNum here, but it
-	// has been the behavior for a long time.
-	return cloneHistogram(h.mu.sliding.Merge()), histWrapNum * h.mu.tickInterval
-}
-
-// Snapshot returns a copy of the cumulative (i.e. all-time samples) histogram
-// data.
-func (h *Histogram) Snapshot() *hdrhistogram.Histogram {
-	h.mu.Lock()
-	defer h.mu.Unlock()
-	return cloneHistogram(h.mu.cumulative)
-}
-
-// RecordValue adds the given value to the histogram. Recording a value in
-// excess of the configured maximum value for that histogram results in
-// recording the maximum value instead.
-func (h *Histogram) RecordValue(v int64) {
-	h.mu.Lock()
-	defer h.mu.Unlock()
-
-	if h.mu.sliding.Current.RecordValue(v) != nil {
-		_ = h.mu.sliding.Current.RecordValue(h.maxVal)
-	}
-	if h.mu.cumulative.RecordValue(v) != nil {
-		_ = h.mu.cumulative.RecordValue(h.maxVal)
-	}
-}
-
-// TotalCount returns the (cumulative) number of samples.
-func (h *Histogram) TotalCount() int64 {
-	h.mu.Lock()
-	defer h.mu.Unlock()
-	return h.mu.cumulative.TotalCount()
-}
-
-// Min returns the minimum.
-func (h *Histogram) Min() int64 {
-	h.mu.Lock()
-	defer h.mu.Unlock()
-	return h.mu.cumulative.Min()
-}
-
-// Inspect calls the closure with the empty string and the receiver.
-func (h *Histogram) Inspect(f func(interface{})) {
-	h.mu.Lock()
-	maybeTick(h.mu.tickHelper)
-	h.mu.Unlock()
-	f(h)
-}
-
-// GetType returns the prometheus type enum for this metric.
-func (h *Histogram) GetType() *prometheusgo.MetricType {
-	return prometheusgo.MetricType_HISTOGRAM.Enum()
-}
-
-// ToPrometheusMetric returns a filled-in prometheus metric of the right type.
-func (h *Histogram) ToPrometheusMetric() *prometheusgo.Metric {
-	hist := &prometheusgo.Histogram{}
-
-	h.mu.Lock()
-	maybeTick(h.mu.tickHelper)
-	bars := h.mu.cumulative.Distribution()
-	hist.Bucket = make([]*prometheusgo.Bucket, 0, len(bars))
-
-	var cumCount uint64
-	var sum float64
-	for _, bar := range bars {
-		if bar.Count == 0 {
-			// No need to expose trivial buckets.
-			continue
-		}
-		upperBound := float64(bar.To)
-		sum += upperBound * float64(bar.Count)
-
-		cumCount += uint64(bar.Count)
-		curCumCount := cumCount // need a new alloc thanks to bad proto code
-
-		hist.Bucket = append(hist.Bucket, &prometheusgo.Bucket{
-			CumulativeCount: &curCumCount,
-			UpperBound:      &upperBound,
-		})
-	}
-	hist.SampleCount = &cumCount
-	hist.SampleSum = &sum // can do better here; we approximate in the loop
-	h.mu.Unlock()
-
-	return &prometheusgo.Metric{
-		Histogram: hist,
-	}
-}
-
-// GetMetadata returns the metric's metadata including the Prometheus
-// MetricType.
-func (h *Histogram) GetMetadata() Metadata {
-	baseMetadata := h.Metadata
-	baseMetadata.MetricType = prometheusgo.MetricType_HISTOGRAM
-	return baseMetadata
-}
-
-// IOLatencyBuckets are prometheus histogram buckets suitable for a histogram
-// that records a quantity (nanosecond-denominated) in which most measurements
-// resemble those of typical disk latencies, i.e. which are in the micro- and
-// millisecond range during normal operation.
-var IOLatencyBuckets = []float64{
-	// Generated via TestHistogramBuckets/IOLatencyBuckets.
-	10000.000000, // 10µs
-	26826.957953, // 26.826µs
-	71968.567300, // 71.968µs
-	193069.772888, // 193.069µs
-	517947.467923, // 517.947µs
-	1389495.494373, // 1.389495ms
-	3727593.720315, // 3.727593ms
-	10000000.000000, // 9.999999ms
-	26826957.952797, // 26.826957ms
-	71968567.300115, // 71.968567ms
-	193069772.888325, // 193.069772ms
-	517947467.923120, // 517.947467ms
-	1389495494.373135, // 1.389495494s
-	3727593720.314933, // 3.72759372s
-	9999999999.999981, // 9.999999999s
-}
-
-// NetworkLatencyBuckets are prometheus histogram buckets suitable for a histogram
-// that records a quantity (nanosecond-denominated) in which most measurements
-// behave like network latencies, i.e. most measurements are in the ms to sub-second
-// range during normal operation.
-var NetworkLatencyBuckets = []float64{
-	// Generated via TestHistogramBuckets/NetworkLatencyBuckets.
-	500000.000000, // 500µs
-	860513.842995, // 860.513µs
-	1480968.147973, // 1.480968ms
-	2548787.184731, // 2.548787ms
-	4386533.310619, // 4.386533ms
-	7549345.273094, // 7.549345ms
-	12992632.226094, // 12.992632ms
-	22360679.774998, // 22.360679ms
-	38483348.970335, // 38.483348ms
-	66230909.027573, // 66.230909ms
-	113985228.104760, // 113.985228ms
-	196171733.362212, // 196.171733ms
-	337616984.325077, // 337.616984ms
-	581048177.284016, // 581.048177ms
-	999999999.999999, // 999.999999ms,
-}
-
 // NewHistogramV2 is a prometheus-backed histogram. Depending on the value of
-// opts.Buckets, this is suitable for recording any kind of quantity. Common
+// buckets, this is suitable for recording any kind of quantity. Common
 // sensible choices are {IO,Network}LatencyBuckets.
-func NewHistogramV2(
-	meta Metadata, windowDuration time.Duration, opts prometheus.HistogramOpts,
-) *HistogramV2 {
+func NewHistogramV2(meta Metadata, windowDuration time.Duration, buckets []float64) *HistogramV2 {
 	// TODO(obs-inf): prometheus supports labeled histograms but they require more
 	// plumbing and don't fit into the PrometheusObservable interface any more.
+	opts := prometheus.HistogramOpts{
+		Buckets: buckets,
+	}
 	cum := prometheus.NewHistogram(opts)
 	h := &HistogramV2{
 		Metadata: meta,
@@ -524,6 +314,11 @@ func (h *HistogramV2) TotalSumWindowed() float64 {
 	return h.ToPrometheusMetricWindowed().Histogram.GetSampleSum()
 }

+// Mean returns the (cumulative) mean of samples.
+func (h *HistogramV2) Mean() float64 {
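+	// Note: with no recorded samples this is 0/0, which is NaN.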
+	return h.TotalSum() / float64(h.TotalCount())
+}
+
 // ValueAtQuantileWindowed takes a quantile value [0,100] and returns the
 // interpolated value at that quantile for the windowed histogram.
 //
diff --git a/pkg/util/metric/metric_test.go b/pkg/util/metric/metric_test.go
index a04d4d4953c7..d4f6c2878dfb 100644
--- a/pkg/util/metric/metric_test.go
+++ b/pkg/util/metric/metric_test.go
@@ -13,17 +13,14 @@ package metric
 import (
 	"bytes"
 	"encoding/json"
-	"fmt"
 	"math"
 	"reflect"
-	"strings"
 	"sync"
 	"testing"
 	"time"

 	_ "github.com/cockroachdb/cockroach/pkg/util/log" // for flags
 	"github.com/kr/pretty"
-	"github.com/prometheus/client_golang/prometheus"
 	prometheusgo "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/require"
 )
@@ -103,42 +100,6 @@ func setNow(d time.Duration) {
 	}
 }

-func TestHistogramPrometheus(t *testing.T) {
-	u := func(v int) *uint64 {
-		n := uint64(v)
-		return &n
-	}
-
-	f := func(v int) *float64 {
-		n := float64(v)
-		return &n
-	}
-
-	h := NewHistogram(Metadata{}, time.Hour, 10, 1)
-	h.RecordValue(1)
-	h.RecordValue(5)
-	h.RecordValue(5)
-	h.RecordValue(10)
-	h.RecordValue(15000) // counts as 10
-	act := *h.ToPrometheusMetric().Histogram
-
-	expSum := float64(1*1 + 2*5 + 2*10)
-
-	exp := prometheusgo.Histogram{
-		SampleCount: u(5),
-		SampleSum:   &expSum,
-		Bucket: []*prometheusgo.Bucket{
-			{CumulativeCount: u(1), UpperBound: f(1)},
-			{CumulativeCount: u(3), UpperBound: f(5)},
-			{CumulativeCount: u(5), UpperBound: f(10)},
-		},
-	}
-
-	if !reflect.DeepEqual(act, exp) {
-		t.Fatalf("expected differs from actual: %s", pretty.Diff(exp, act))
-	}
-}
-
 func TestHistogramV2(t *testing.T) {
 	u := func(v int) *uint64 {
 		n := uint64(v)
@@ -153,19 +114,12 @@ func TestHistogramV2(t *testing.T) {
 	h := NewHistogramV2(
 		Metadata{},
 		time.Hour,
-		prometheus.HistogramOpts{
-			Namespace:   "",
-			Subsystem:   "",
-			Name:        "",
-			Help:        "",
-			ConstLabels: nil,
-			Buckets: []float64{
-				1.0,
-				5.0,
-				10.0,
-				25.0,
-				100.0,
-			},
+		[]float64{
+			1.0,
+			5.0,
+			10.0,
+			25.0,
+			100.0,
 		},
 	)
@@ -210,38 +164,11 @@ func TestHistogramV2(t *testing.T) {
 	require.Equal(t, 125.0, h.ValueAtQuantileWindowed(100))
 }

-// TestHistogramBuckets is used to generate additional prometheus buckets to be
-// used with HistogramV2. Please include obs-inf in the review process of new
-// buckets.
-func TestHistogramBuckets(t *testing.T) {
-	verifyAndPrint := func(t *testing.T, exp, act []float64) {
-		t.Helper()
-		var buf strings.Builder
-		for idx, f := range exp {
-			if idx == 0 {
-				fmt.Fprintf(&buf, "// Generated via %s.", t.Name())
-			}
-			fmt.Fprintf(&buf, "\n%f, // %s", f, time.Duration(f))
-		}
-		t.Logf("%s", &buf)
-		require.InDeltaSlice(t, exp, act, 1 /* delta */, "Please update the bucket boundaries for %s", t.Name())
-	}
-	t.Run("IOLatencyBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 15)
-		verifyAndPrint(t, exp, IOLatencyBuckets)
-	})
-
-	t.Run("NetworkLatencyBuckets", func(t *testing.T) {
-		exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 15)
-		verifyAndPrint(t, exp, NetworkLatencyBuckets)
-	})
-}
-
 func TestNewHistogramV2Rotate(t *testing.T) {
 	defer TestingSetNow(nil)()
 	setNow(0)

-	h := NewHistogramV2(emptyMetadata, 10*time.Second, prometheus.HistogramOpts{Buckets: nil})
+	h := NewHistogramV2(emptyMetadata, 10*time.Second, nil)
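+	// (nil buckets fall back to the prometheus client's default buckets)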
 	for i := 0; i < 4; i++ {
 		// Windowed histogram is initially empty.
 		h.Inspect(func(interface{}) {}) // triggers ticking
@@ -264,36 +191,3 @@ func TestNewHistogramV2Rotate(t *testing.T) {
 		// Go to beginning.
 	}
 }
-
-func TestHistogramRotate(t *testing.T) {
-	defer TestingSetNow(nil)()
-	setNow(0)
-	duration := histWrapNum * time.Second
-	h := NewHistogram(emptyMetadata, duration, 1000+10*histWrapNum, 3)
-	var cur time.Duration
-	for i := 0; i < 3*histWrapNum; i++ {
-		v := int64(10 * i)
-		h.RecordValue(v)
-		cur += time.Second
-		setNow(cur)
-		cur, windowDuration := h.Windowed()
-		if windowDuration != duration {
-			t.Fatalf("window changed: is %s, should be %s", windowDuration, duration)
-		}
-
-		// When i == histWrapNum-1, we expect the entry from i==0 to move out
-		// of the window (since we rotated for the histWrapNum'th time).
-		expMin := int64((1 + i - (histWrapNum - 1)) * 10)
-		if expMin < 0 {
-			expMin = 0
-		}
-
-		if min := cur.Min(); min != expMin {
-			t.Fatalf("%d: unexpected minimum %d, expected %d", i, min, expMin)
-		}
-
-		if max, expMax := cur.Max(), v; max != expMax {
-			t.Fatalf("%d: unexpected maximum %d, expected %d", i, max, expMax)
-		}
-	}
-}
diff --git a/pkg/util/metric/registry_test.go b/pkg/util/metric/registry_test.go
index 52e38069109c..7cdce9d135dd 100644
--- a/pkg/util/metric/registry_test.go
+++ b/pkg/util/metric/registry_test.go
@@ -76,14 +76,14 @@ func TestRegistry(t *testing.T) {
 	topCounter := NewCounter(Metadata{Name: "top.counter"})
 	r.AddMetric(topCounter)
-	r.AddMetric(NewHistogram(Metadata{Name: "top.histogram"}, time.Minute, 1000, 3))
+	r.AddMetric(NewHistogramV2(Metadata{Name: "top.histogram"}, time.Minute, CountBuckets))

 	r.AddMetric(NewGauge(Metadata{Name: "bottom.gauge"}))
 	ms := &struct {
 		StructGauge         *Gauge
 		StructGauge64       *GaugeFloat64
 		StructCounter       *Counter
-		StructHistogram     *Histogram
+		StructHistogram     *HistogramV2
 		NestedStructGauge   NestedStruct
 		ArrayStructCounters [4]*Counter
 		// Ensure that nil struct values in arrays are safe.
@@ -92,7 +92,7 @@ func TestRegistry(t *testing.T) {
 		privateStructGauge         *Gauge
 		privateStructGauge64       *GaugeFloat64
 		privateStructCounter       *Counter
-		privateStructHistogram     *Histogram
+		privateStructHistogram     *HistogramV2
 		privateNestedStructGauge   NestedStruct
 		privateArrayStructCounters [2]*Counter
 		NotAMetric                 int
@@ -103,7 +103,7 @@ func TestRegistry(t *testing.T) {
 		StructGauge:   NewGauge(Metadata{Name: "struct.gauge"}),
 		StructGauge64: NewGaugeFloat64(Metadata{Name: "struct.gauge64"}),
 		StructCounter: NewCounter(Metadata{Name: "struct.counter"}),
-		StructHistogram: NewHistogram(Metadata{Name: "struct.histogram"}, time.Minute, 1000, 3),
+		StructHistogram: NewHistogramV2(Metadata{Name: "struct.histogram"}, time.Minute, CountBuckets),
 		NestedStructGauge: NestedStruct{
 			NestedStructGauge: NewGauge(Metadata{Name: "nested.struct.gauge"}),
 		},
@@ -122,7 +122,7 @@ func TestRegistry(t *testing.T) {
 		privateStructGauge:   NewGauge(Metadata{Name: "private.struct.gauge"}),
 		privateStructGauge64: NewGaugeFloat64(Metadata{Name: "private.struct.gauge64"}),
 		privateStructCounter: NewCounter(Metadata{Name: "private.struct.counter"}),
-		privateStructHistogram: NewHistogram(Metadata{Name: "private.struct.histogram"}, time.Minute, 1000, 3),
+		privateStructHistogram: NewHistogramV2(Metadata{Name: "private.struct.histogram"}, time.Minute, CountBuckets),
 		privateNestedStructGauge: NestedStruct{
 			NestedStructGauge: NewGauge(Metadata{Name: "private.nested.struct.gauge"}),
 		},
diff --git a/pkg/util/mon/bytes_usage.go b/pkg/util/mon/bytes_usage.go
index 4d2f9246eb37..249fe4d28701 100644
--- a/pkg/util/mon/bytes_usage.go
+++ b/pkg/util/mon/bytes_usage.go
@@ -195,7 +195,7 @@ type BytesMonitor struct {

 	// maxBytesHist is the metric object used to track the high watermark of bytes
 	// allocated by the monitor during its lifetime.
-	maxBytesHist *metric.Histogram
+	maxBytesHist *metric.HistogramV2
 }

 // name identifies this monitor in logging messages.
@@ -276,7 +276,7 @@ func NewMonitor(
 	name redact.RedactableString,
 	res Resource,
 	curCount *metric.Gauge,
-	maxHist *metric.Histogram,
+	maxHist *metric.HistogramV2,
 	increment int64,
 	noteworthy int64,
 	settings *cluster.Settings,
@@ -292,7 +292,7 @@ func NewMonitorWithLimit(
 	res Resource,
 	limit int64,
 	curCount *metric.Gauge,
-	maxHist *metric.Histogram,
+	maxHist *metric.HistogramV2,
 	increment int64,
 	noteworthy int64,
 	settings *cluster.Settings,
@@ -386,7 +386,7 @@ func NewUnlimitedMonitor(
 	name redact.RedactableString,
 	res Resource,
 	curCount *metric.Gauge,
-	maxHist *metric.Histogram,
+	maxHist *metric.HistogramV2,
 	noteworthy int64,
 	settings *cluster.Settings,
 ) *BytesMonitor {
@@ -479,7 +479,7 @@ func (mm *BytesMonitor) AllocBytes() int64 {
 }

 // SetMetrics sets the metric objects for the monitor.
-func (mm *BytesMonitor) SetMetrics(curCount *metric.Gauge, maxHist *metric.Histogram) {
+func (mm *BytesMonitor) SetMetrics(curCount *metric.Gauge, maxHist *metric.HistogramV2) {
 	mm.mu.Lock()
 	defer mm.mu.Unlock()
 	mm.mu.curBytesCount = curCount
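+	// curBytesCount gauges the monitor's current allocation; maxBytesHist records its high watermark in bytes.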