Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metric: migrate all histograms to use prometheus-backed version #1

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/ccl/changefeedccl/changefeed_processors.go
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ func (j *jobState) checkpointCompleted(ctx context.Context, checkpointDuration t

j.metrics.CheckpointHistNanos.RecordValue(checkpointDuration.Nanoseconds())
j.lastProgressUpdate = j.ts.Now()
j.checkpointDuration = time.Duration(j.metrics.CheckpointHistNanos.Snapshot().Mean())
j.checkpointDuration = time.Duration(j.metrics.CheckpointHistNanos.Mean())
j.progressUpdatesSkipped = false
}

Expand Down
46 changes: 20 additions & 26 deletions pkg/ccl/changefeedccl/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -417,20 +417,15 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics {
a := &AggMetrics{
ErrorRetries: b.Counter(metaChangefeedErrorRetries),
EmittedMessages: b.Counter(metaChangefeedEmittedMessages),
MessageSize: b.Histogram(metaMessageSize,
histogramWindow, 10<<20 /* 10MB max message size */, 1),
EmittedBytes: b.Counter(metaChangefeedEmittedBytes),
FlushedBytes: b.Counter(metaChangefeedFlushedBytes),
Flushes: b.Counter(metaChangefeedFlushes),

BatchHistNanos: b.Histogram(metaChangefeedBatchHistNanos,
histogramWindow, changefeedBatchHistMaxLatency.Nanoseconds(), 1),
FlushHistNanos: b.Histogram(metaChangefeedFlushHistNanos,
histogramWindow, changefeedFlushHistMaxLatency.Nanoseconds(), 2),
CommitLatency: b.Histogram(metaCommitLatency,
histogramWindow, commitLatencyMaxValue.Nanoseconds(), 1),
AdmitLatency: b.Histogram(metaAdmitLatency, histogramWindow,
admitLatencyMaxValue.Nanoseconds(), 1),
MessageSize: b.Histogram(metaMessageSize, histogramWindow, metric.DataSizeBuckets),
EmittedBytes: b.Counter(metaChangefeedEmittedBytes),
FlushedBytes: b.Counter(metaChangefeedFlushedBytes),
Flushes: b.Counter(metaChangefeedFlushes),

BatchHistNanos: b.Histogram(metaChangefeedBatchHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets),
FlushHistNanos: b.Histogram(metaChangefeedFlushHistNanos, histogramWindow, metric.BatchProcessLatencyBuckets),
CommitLatency: b.Histogram(metaCommitLatency, histogramWindow, metric.BatchProcessLatencyBuckets),
AdmitLatency: b.Histogram(metaAdmitLatency, histogramWindow, metric.BatchProcessLatencyBuckets),
BackfillCount: b.Gauge(metaChangefeedBackfillCount),
BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges),
RunningCount: b.Gauge(metaChangefeedRunning),
Expand Down Expand Up @@ -505,7 +500,7 @@ type Metrics struct {
Failures *metric.Counter
ResolvedMessages *metric.Counter
QueueTimeNanos *metric.Counter
CheckpointHistNanos *metric.Histogram
CheckpointHistNanos *metric.HistogramV2
FrontierUpdates *metric.Counter
ThrottleMetrics cdcutils.Metrics
ReplanCount *metric.Counter
Expand All @@ -529,17 +524,16 @@ func (m *Metrics) getSLIMetrics(scope string) (*sliMetrics, error) {
// MakeMetrics makes the metrics for changefeed monitoring.
func MakeMetrics(histogramWindow time.Duration) metric.Struct {
m := &Metrics{
AggMetrics: newAggregateMetrics(histogramWindow),
KVFeedMetrics: kvevent.MakeMetrics(histogramWindow),
SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow),
ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages),
Failures: metric.NewCounter(metaChangefeedFailures),
QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
CheckpointHistNanos: metric.NewHistogram(metaChangefeedCheckpointHistNanos, histogramWindow,
changefeedCheckpointHistMaxLatency.Nanoseconds(), 2),
FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
ReplanCount: metric.NewCounter(metaChangefeedReplanCount),
AggMetrics: newAggregateMetrics(histogramWindow),
KVFeedMetrics: kvevent.MakeMetrics(histogramWindow),
SchemaFeedMetrics: schemafeed.MakeMetrics(histogramWindow),
ResolvedMessages: metric.NewCounter(metaChangefeedForwardedResolvedMessages),
Failures: metric.NewCounter(metaChangefeedFailures),
QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
CheckpointHistNanos: metric.NewHistogramV2(metaChangefeedCheckpointHistNanos, histogramWindow, metric.IOLatencyBuckets),
FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
ReplanCount: metric.NewCounter(metaChangefeedReplanCount),
}

m.mu.resolved = make(map[int]hlc.Timestamp)
Expand Down
2 changes: 1 addition & 1 deletion pkg/ccl/sqlproxyccl/connector.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ type connector struct {

// DialTenantLatency tracks how long it takes to retrieve the address for
// a tenant and set up a tcp connection to the address.
DialTenantLatency *metric.Histogram
DialTenantLatency *metric.HistogramV2

// DialTenantRetries counts how often dialing a tenant is retried.
DialTenantRetries *metric.Counter
Expand Down
10 changes: 7 additions & 3 deletions pkg/ccl/sqlproxyccl/connector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,9 @@ func TestConnector_dialTenantCluster(t *testing.T) {
defer cancel()

c := &connector{
DialTenantLatency: metric.NewLatency(metaDialTenantLatency, time.Millisecond),
DialTenantLatency: metric.NewHistogramV2(
metaDialTenantLatency, time.Millisecond, metric.IOLatencyBuckets,
),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
c.testingKnobs.lookupAddr = func(ctx context.Context) (string, error) {
Expand Down Expand Up @@ -403,8 +405,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {

var reportFailureFnCount int
c := &connector{
TenantID: roachpb.MakeTenantID(42),
DialTenantLatency: metric.NewLatency(metaDialTenantLatency, time.Millisecond),
TenantID: roachpb.MakeTenantID(42),
DialTenantLatency: metric.NewHistogramV2(
metaDialTenantLatency, time.Millisecond, metric.IOLatencyBuckets,
),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
c.DirectoryCache = &testTenantDirectoryCache{
Expand Down
22 changes: 12 additions & 10 deletions pkg/ccl/sqlproxyccl/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,19 @@ type metrics struct {
RoutingErrCount *metric.Counter
RefusedConnCount *metric.Counter
SuccessfulConnCount *metric.Counter
ConnectionLatency *metric.Histogram
ConnectionLatency *metric.HistogramV2
AuthFailedCount *metric.Counter
ExpiredClientConnCount *metric.Counter

DialTenantLatency *metric.Histogram
DialTenantLatency *metric.HistogramV2
DialTenantRetries *metric.Counter

ConnMigrationSuccessCount *metric.Counter
ConnMigrationErrorFatalCount *metric.Counter
ConnMigrationErrorRecoverableCount *metric.Counter
ConnMigrationAttemptedCount *metric.Counter
ConnMigrationAttemptedLatency *metric.Histogram
ConnMigrationTransferResponseMessageSize *metric.Histogram
ConnMigrationAttemptedLatency *metric.HistogramV2
ConnMigrationTransferResponseMessageSize *metric.HistogramV2

QueryCancelReceivedPGWire *metric.Counter
QueryCancelReceivedHTTP *metric.Counter
Expand Down Expand Up @@ -224,32 +224,34 @@ func makeProxyMetrics() metrics {
RoutingErrCount: metric.NewCounter(metaRoutingErrCount),
RefusedConnCount: metric.NewCounter(metaRefusedConnCount),
SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount),
ConnectionLatency: metric.NewLatency(
ConnectionLatency: metric.NewHistogramV2(
metaConnMigrationAttemptedCount,
base.DefaultHistogramWindowInterval(),
metric.IOLatencyBuckets,
),
AuthFailedCount: metric.NewCounter(metaAuthFailedCount),
ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount),
// Connector metrics.
DialTenantLatency: metric.NewLatency(
DialTenantLatency: metric.NewHistogramV2(
metaDialTenantLatency,
base.DefaultHistogramWindowInterval(),
metric.NetworkLatencyBuckets,
),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
// Connection migration metrics.
ConnMigrationSuccessCount: metric.NewCounter(metaConnMigrationSuccessCount),
ConnMigrationErrorFatalCount: metric.NewCounter(metaConnMigrationErrorFatalCount),
ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount),
ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount),
ConnMigrationAttemptedLatency: metric.NewLatency(
ConnMigrationAttemptedLatency: metric.NewHistogramV2(
metaConnMigrationAttemptedLatency,
base.DefaultHistogramWindowInterval(),
metric.NetworkLatencyBuckets,
),
ConnMigrationTransferResponseMessageSize: metric.NewHistogram(
ConnMigrationTransferResponseMessageSize: metric.NewHistogramV2(
metaConnMigrationTransferResponseMessageSize,
base.DefaultHistogramWindowInterval(),
maxExpectedTransferResponseMessageSize,
1,
metric.DataSizeBuckets,
),
QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire),
QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP),
Expand Down
18 changes: 9 additions & 9 deletions pkg/ccl/streamingccl/streamingest/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ type Metrics struct {
Flushes *metric.Counter
JobProgressUpdates *metric.Counter
ResolvedEvents *metric.Counter
FlushHistNanos *metric.Histogram
CommitLatency *metric.Histogram
AdmitLatency *metric.Histogram
FlushHistNanos *metric.HistogramV2
CommitLatency *metric.HistogramV2
AdmitLatency *metric.HistogramV2
RunningCount *metric.Gauge
EarliestDataCheckpointSpan *metric.Gauge
LatestDataCheckpointSpan *metric.Gauge
Expand All @@ -134,12 +134,12 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
Flushes: metric.NewCounter(metaStreamingFlushes),
ResolvedEvents: metric.NewCounter(metaStreamingResolvedEventsIngested),
JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates),
FlushHistNanos: metric.NewHistogram(metaStreamingFlushHistNanos,
histogramWindow, streamingFlushHistMaxLatency.Nanoseconds(), 1),
CommitLatency: metric.NewHistogram(metaStreamingCommitLatency,
histogramWindow, streamingCommitLatencyMaxValue.Nanoseconds(), 1),
AdmitLatency: metric.NewHistogram(metaStreamingAdmitLatency,
histogramWindow, streamingAdmitLatencyMaxValue.Nanoseconds(), 1),
FlushHistNanos: metric.NewHistogramV2(metaStreamingFlushHistNanos,
histogramWindow, metric.BatchProcessLatencyBuckets),
CommitLatency: metric.NewHistogramV2(metaStreamingCommitLatency,
histogramWindow, metric.BatchProcessLatencyBuckets),
AdmitLatency: metric.NewHistogramV2(metaStreamingAdmitLatency,
histogramWindow, metric.BatchProcessLatencyBuckets),
RunningCount: metric.NewGauge(metaStreamsRunning),
EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan),
LatestDataCheckpointSpan: metric.NewGauge(metaLatestDataCheckpointSpan),
Expand Down
4 changes: 2 additions & 2 deletions pkg/kv/bulk/bulk_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
// Metrics contains pointers to the metrics for
// monitoring bulk operations.
type Metrics struct {
MaxBytesHist *metric.Histogram
MaxBytesHist *metric.HistogramV2
CurBytesCount *metric.Gauge
}

Expand Down Expand Up @@ -50,7 +50,7 @@ const log10int64times1000 = 19 * 1000
// MakeBulkMetrics instantiates the metrics holder for bulk operation monitoring.
func MakeBulkMetrics(histogramWindow time.Duration) Metrics {
return Metrics{
MaxBytesHist: metric.NewHistogram(metaMemMaxBytes, histogramWindow, log10int64times1000, 3),
MaxBytesHist: metric.NewHistogramV2(metaMemMaxBytes, histogramWindow, metric.MemoryUsageBuckets),
CurBytesCount: metric.NewGauge(metaMemCurBytes),
}
}
8 changes: 4 additions & 4 deletions pkg/kv/kvclient/kvcoord/txn_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ type TxnMetrics struct {
RefreshMemoryLimitExceeded *metric.Counter
RefreshAutoRetries *metric.Counter

Durations *metric.Histogram
Durations *metric.HistogramV2

TxnsWithCondensedIntents *metric.Counter
TxnsWithCondensedIntentsGauge *metric.Gauge
TxnsRejectedByLockSpanBudget *metric.Counter

// Restarts is the number of times we had to restart the transaction.
Restarts *metric.Histogram
Restarts *metric.HistogramV2

// Counts of restart types.
RestartsWriteTooOld telemetry.CounterWithMetric
Expand Down Expand Up @@ -274,11 +274,11 @@ func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics {
RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans),
RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded),
RefreshAutoRetries: metric.NewCounter(metaRefreshAutoRetries),
Durations: metric.NewLatency(metaDurationsHistograms, histogramWindow),
Durations: metric.NewHistogramV2(metaDurationsHistograms, histogramWindow, metric.IOLatencyBuckets),
TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans),
TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge),
TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget),
Restarts: metric.NewHistogram(metaRestartsHistogram, histogramWindow, 100, 3),
Restarts: metric.NewHistogramV2(metaRestartsHistogram, histogramWindow, metric.CountBuckets),
RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld),
RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti),
RestartsSerializable: telemetry.NewCounterWithMetric(metaRestartsSerializable),
Expand Down
20 changes: 12 additions & 8 deletions pkg/kv/kvprober/kvprober.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,10 @@ var (
type Metrics struct {
ReadProbeAttempts *metric.Counter
ReadProbeFailures *metric.Counter
ReadProbeLatency *metric.Histogram
ReadProbeLatency *metric.HistogramV2
WriteProbeAttempts *metric.Counter
WriteProbeFailures *metric.Counter
WriteProbeLatency *metric.Histogram
WriteProbeLatency *metric.HistogramV2
ProbePlanAttempts *metric.Counter
ProbePlanFailures *metric.Counter
}
Expand Down Expand Up @@ -214,14 +214,18 @@ func NewProber(opts Opts) *Prober {
writePlanner: newMeta2Planner(opts.DB, opts.Settings, func() time.Duration { return writeInterval.Get(&opts.Settings.SV) }),

metrics: Metrics{
ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
ReadProbeLatency: metric.NewLatency(metaReadProbeLatency, opts.HistogramWindowInterval),
ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
ReadProbeLatency: metric.NewHistogramV2(
metaReadProbeLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets,
),
WriteProbeAttempts: metric.NewCounter(metaWriteProbeAttempts),
WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures),
WriteProbeLatency: metric.NewLatency(metaWriteProbeLatency, opts.HistogramWindowInterval),
ProbePlanAttempts: metric.NewCounter(metaProbePlanAttempts),
ProbePlanFailures: metric.NewCounter(metaProbePlanFailures),
WriteProbeLatency: metric.NewHistogramV2(
metaWriteProbeLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets,
),
ProbePlanAttempts: metric.NewCounter(metaProbePlanAttempts),
ProbePlanFailures: metric.NewCounter(metaProbePlanFailures),
},
tracer: opts.Tracer,
}
Expand Down
1 change: 0 additions & 1 deletion pkg/kv/kvserver/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,6 @@ go_library(
"@com_github_gogo_protobuf//proto",
"@com_github_google_btree//:btree",
"@com_github_kr_pretty//:pretty",
"@com_github_prometheus_client_golang//prometheus",
"@io_etcd_go_etcd_raft_v3//:raft",
"@io_etcd_go_etcd_raft_v3//raftpb",
"@io_etcd_go_etcd_raft_v3//tracker",
Expand Down
6 changes: 4 additions & 2 deletions pkg/kv/kvserver/liveness/liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ type Metrics struct {
HeartbeatSuccesses *metric.Counter
HeartbeatFailures telemetry.CounterWithMetric
EpochIncrements telemetry.CounterWithMetric
HeartbeatLatency *metric.Histogram
HeartbeatLatency *metric.HistogramV2
}

// IsLiveCallback is invoked when a node's IsLive state changes to true.
Expand Down Expand Up @@ -309,7 +309,9 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness {
HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures),
EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements),
HeartbeatLatency: metric.NewLatency(metaHeartbeatLatency, opts.HistogramWindowInterval),
HeartbeatLatency: metric.NewHistogramV2(
metaHeartbeatLatency, opts.HistogramWindowInterval, metric.IOLatencyBuckets,
),
}
nl.mu.nodes = make(map[roachpb.NodeID]Record)
nl.heartbeatToken <- struct{}{}
Expand Down
Loading