From eb82a437f09225de8dcaa4b5f65328eb7cf0b4bd Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 9 May 2022 14:13:44 +0200 Subject: [PATCH 1/2] metric: add prometheus-based histogram Our current histogram is based on `hdrhistogram`. This tends to create lots of buckets and is inflexible w.r.t the bucket layout. In hindsight, it was a poor choice of default implementation (I can say that since I introduced it) and its cost is disincentivizing the introduction of histograms that would be useful. This commit introduces a histogram that is based on a completely vanilla `prometheus.Histogram`. The only reason we need to wrap it is because we want to export quantiles to CockraochDB's internal timeseries (it does not support histograms) and this requires maintaining an internal windowed histogram (on top of the cumulative histogram). With this done, we can now introduce metrics with any kind of buckets we want. Helpfully, we introduce two common kinds of buckets, suitable for IO-type and RPC-type latencies. These are defined in a human-readable format by explicitly listing out the buckets. We can move existing metrics to HistogramV2 easily, assuming we are not concerned with existing prometheus scrapers getting tripped up by the changes in bucket boundaries. I assume this is not a big deal in practice as long as it doesn't happen "all the time". In fact, I would strongly suggest we move all metrics wholesale and remove the hdrhistogram-based implementation. If this is not acceptable for some reason, we ought to at least deprecated it. We also slightly improve the existing `Histogram` code by unifying how the windowed histograms are ticked and by making explicit where their quantiles are recorded (this dependency was previously hidden via a local interface assertion). Resolves https://github.com/cockroachdb/cockroach/issues/10015. Resolves https://github.com/cockroachdb/cockroach/issues/64962. Alternative to https://github.com/dhartunian/cockroach/commit/eac3d066de2703f75207a2c3b80ac42c4f4074c5 TODO - export quantiles (how to do this? Not clear, don't see the code laying around in prometheus, might have to hand-roll it but should be easy enough) Release note: None --- pkg/kv/kvserver/BUILD.bazel | 1 + pkg/kv/kvserver/metrics.go | 9 +- pkg/kv/kvserver/scheduler.go | 2 +- pkg/server/status/BUILD.bazel | 2 + pkg/server/status/recorder.go | 31 +++-- pkg/util/metric/BUILD.bazel | 2 +- pkg/util/metric/metric.go | 176 +++++++++++++++++++++++++-- pkg/util/metric/metric_test.go | 56 +++++++++ pkg/util/metric/sliding_histogram.go | 61 ---------- pkg/util/metric/tick_helper.go | 34 ++++++ 10 files changed, 291 insertions(+), 83 deletions(-) delete mode 100644 pkg/util/metric/sliding_histogram.go create mode 100644 pkg/util/metric/tick_helper.go diff --git a/pkg/kv/kvserver/BUILD.bazel b/pkg/kv/kvserver/BUILD.bazel index 0ecbdb630aa5..5c290bfec99d 100644 --- a/pkg/kv/kvserver/BUILD.bazel +++ b/pkg/kv/kvserver/BUILD.bazel @@ -208,6 +208,7 @@ go_library( "@com_github_gogo_protobuf//proto", "@com_github_google_btree//:btree", "@com_github_kr_pretty//:pretty", + "@com_github_prometheus_client_golang//prometheus", "@io_etcd_go_etcd_raft_v3//:raft", "@io_etcd_go_etcd_raft_v3//raftpb", "@io_etcd_go_etcd_raft_v3//tracker", diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 8a47f9e2e975..e071d5e2c3b1 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -31,6 +31,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/slidingwindow" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/prometheus/client_golang/prometheus" "go.etcd.io/etcd/raft/v3/raftpb" ) @@ -1752,7 +1753,7 @@ type StoreMetrics struct { RaftCommandCommitLatency *metric.Histogram RaftHandleReadyLatency *metric.Histogram RaftApplyCommittedLatency *metric.Histogram - RaftSchedulerLatency *metric.Histogram + RaftSchedulerLatency *metric.HistogramV2 RaftTimeoutCampaign *metric.Counter // Raft message metrics. @@ -2255,8 +2256,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { RaftCommandCommitLatency: metric.NewLatency(metaRaftCommandCommitLatency, histogramWindow), RaftHandleReadyLatency: metric.NewLatency(metaRaftHandleReadyLatency, histogramWindow), RaftApplyCommittedLatency: metric.NewLatency(metaRaftApplyCommittedLatency, histogramWindow), - RaftSchedulerLatency: metric.NewLatency(metaRaftSchedulerLatency, histogramWindow), - RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), + RaftSchedulerLatency: metric.NewHistogramV2(metaRaftSchedulerLatency, histogramWindow, prometheus.HistogramOpts{ + Buckets: metric.IOLatencyBuckets, + }), + RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), // Raft message metrics. RaftRcvdMessages: [...]*metric.Counter{ diff --git a/pkg/kv/kvserver/scheduler.go b/pkg/kv/kvserver/scheduler.go index dbfdddb843d6..2ed446471219 100644 --- a/pkg/kv/kvserver/scheduler.go +++ b/pkg/kv/kvserver/scheduler.go @@ -169,7 +169,7 @@ type raftScheduleState struct { type raftScheduler struct { ambientContext log.AmbientContext processor raftProcessor - latency *metric.Histogram + latency *metric.HistogramV2 numWorkers int mu struct { diff --git a/pkg/server/status/BUILD.bazel b/pkg/server/status/BUILD.bazel index 216b2b0da11d..a4ddbc1fa350 100644 --- a/pkg/server/status/BUILD.bazel +++ b/pkg/server/status/BUILD.bazel @@ -68,6 +68,8 @@ go_library( "@com_github_codahale_hdrhistogram//:hdrhistogram", "@com_github_dustin_go_humanize//:go-humanize", "@com_github_elastic_gosigar//:gosigar", + "@com_github_prometheus_client_golang//prometheus", + "@com_github_prometheus_client_model//go", "@com_github_shirou_gopsutil_v3//net", ] + select({ "@io_bazel_rules_go//go/platform:aix": [ diff --git a/pkg/server/status/recorder.go b/pkg/server/status/recorder.go index 464eaa160eb8..3667b5eb6741 100644 --- a/pkg/server/status/recorder.go +++ b/pkg/server/status/recorder.go @@ -44,9 +44,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/system" "github.com/cockroachdb/errors" "github.com/cockroachdb/redact" - "github.com/codahale/hdrhistogram" - humanize "github.com/dustin/go-humanize" + "github.com/dustin/go-humanize" "github.com/elastic/gosigar" + prometheusgo "github.com/prometheus/client_model/go" ) const ( @@ -531,12 +531,10 @@ func extractValue(name string, mtr interface{}, fn func(string, float64)) error // TODO(tschottdorf,ajwerner): consider moving this switch to a single // interface implemented by the individual metric types. type ( - float64Valuer interface{ Value() float64 } - int64Valuer interface{ Value() int64 } - int64Counter interface{ Count() int64 } - histogramValuer interface { - Windowed() (*hdrhistogram.Histogram, time.Duration) - } + float64Valuer interface{ Value() float64 } + int64Valuer interface{ Value() int64 } + int64Counter interface{ Count() int64 } + prometheusMetricValuer interface{ ToPrometheusMetric() *prometheusgo.Metric } ) switch mtr := mtr.(type) { case float64: @@ -547,7 +545,7 @@ func extractValue(name string, mtr interface{}, fn func(string, float64)) error fn(name, float64(mtr.Value())) case int64Counter: fn(name, float64(mtr.Count())) - case histogramValuer: + case *metric.Histogram: // TODO(mrtracy): Where should this comment go for better // visibility? // @@ -567,6 +565,21 @@ func extractValue(name string, mtr interface{}, fn func(string, float64)) error fn(name+pt.suffix, float64(curr.ValueAtQuantile(pt.quantile))) } fn(name+"-count", float64(curr.TotalCount())) + case *metric.HistogramV2: + // NB: this branch is intentionally at the bottom since all metrics implement it. + cur := mtr.Windowed() + var m prometheusgo.Metric + _ = cur.Write(&m) + hist := m.Histogram + n := float64(*hist.SampleCount) + fn(name+"-count", n) + fn(name+"-avg", *hist.SampleSum/n) + // TODO(obs-inf): add quantiles like for the hdrhistogram. + case prometheusMetricValuer: + // TODO we should be able to handle all non-histogram branches using this, i.e. + // can delete the float, int, etc, cases above. + _ = mtr.ToPrometheusMetric() + default: return errors.Errorf("cannot extract value for type %T", mtr) } diff --git a/pkg/util/metric/BUILD.bazel b/pkg/util/metric/BUILD.bazel index cf5b88fac4c5..5db5d0bfe2dc 100644 --- a/pkg/util/metric/BUILD.bazel +++ b/pkg/util/metric/BUILD.bazel @@ -14,8 +14,8 @@ go_library( "registry.go", "rule.go", "rule_registry.go", - "sliding_histogram.go", "test_helpers.go", + "tick_helper.go", ], embed = [":metric_go_proto"], importpath = "github.com/cockroachdb/cockroach/pkg/util/metric", diff --git a/pkg/util/metric/metric.go b/pkg/util/metric/metric.go index 2ce62329a5d3..f7d1c833b67e 100644 --- a/pkg/util/metric/metric.go +++ b/pkg/util/metric/metric.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/codahale/hdrhistogram" "github.com/gogo/protobuf/proto" + "github.com/prometheus/client_golang/prometheus" prometheusgo "github.com/prometheus/client_model/go" metrics "github.com/rcrowley/go-metrics" ) @@ -185,7 +186,8 @@ type Histogram struct { mu struct { syncutil.Mutex cumulative *hdrhistogram.Histogram - sliding *slidingHistogram + *tickHelper + sliding *hdrhistogram.WindowedHistogram } } @@ -194,13 +196,20 @@ type Histogram struct { // track nonnegative values up to 'maxVal' with 'sigFigs' decimal points of // precision. func NewHistogram(metadata Metadata, duration time.Duration, maxVal int64, sigFigs int) *Histogram { - dHist := newSlidingHistogram(duration, maxVal, sigFigs) h := &Histogram{ Metadata: metadata, maxVal: maxVal, } + wHist := hdrhistogram.NewWindowed(histWrapNum, 0, maxVal, sigFigs) h.mu.cumulative = hdrhistogram.New(0, maxVal, sigFigs) - h.mu.sliding = dHist + h.mu.sliding = wHist + h.mu.tickHelper = &tickHelper{ + nextT: now(), + tickInterval: duration / histWrapNum, + onTick: func() { + wHist.Rotate() + }, + } return h } @@ -222,7 +231,9 @@ func NewLatency(metadata Metadata, histogramWindow time.Duration) *Histogram { func (h *Histogram) Windowed() (*hdrhistogram.Histogram, time.Duration) { h.mu.Lock() defer h.mu.Unlock() - return cloneHistogram(h.mu.sliding.Current()), h.mu.sliding.duration + // TODO(obs-inf): not sure we should multiply by histWrapNum here, but it + // has been the behavior for a long time. + return cloneHistogram(h.mu.sliding.Merge()), histWrapNum * h.mu.tickInterval } // Snapshot returns a copy of the cumulative (i.e. all-time samples) histogram @@ -240,8 +251,8 @@ func (h *Histogram) RecordValue(v int64) { h.mu.Lock() defer h.mu.Unlock() - if h.mu.sliding.RecordValue(v) != nil { - _ = h.mu.sliding.RecordValue(h.maxVal) + if h.mu.sliding.Current.RecordValue(v) != nil { + _ = h.mu.sliding.Current.RecordValue(h.maxVal) } if h.mu.cumulative.RecordValue(v) != nil { _ = h.mu.cumulative.RecordValue(h.maxVal) @@ -265,7 +276,7 @@ func (h *Histogram) Min() int64 { // Inspect calls the closure with the empty string and the receiver. func (h *Histogram) Inspect(f func(interface{})) { h.mu.Lock() - maybeTick(h.mu.sliding) + maybeTick(h.mu.tickHelper) h.mu.Unlock() f(h) } @@ -280,7 +291,7 @@ func (h *Histogram) ToPrometheusMetric() *prometheusgo.Metric { hist := &prometheusgo.Histogram{} h.mu.Lock() - maybeTick(h.mu.sliding) + maybeTick(h.mu.tickHelper) bars := h.mu.cumulative.Distribution() hist.Bucket = make([]*prometheusgo.Bucket, 0, len(bars)) @@ -319,6 +330,155 @@ func (h *Histogram) GetMetadata() Metadata { return baseMetadata } +// IOLatencyBuckets are prometheus histogram buckets suitable for a histogram +// that records a quantity (nanosecond-denominated) in which most measurements +// resemble those of typical disk latencies, i.e. which are in the micro- and +// millisecond range during normal operation. +var IOLatencyBuckets = []float64{ + // Generated via TestHistogramBuckets/IOLatencyBuckets. + 10000.000000, // 10µs + 26826.957953, // 26.826µs + 71968.567300, // 71.968µs + 193069.772888, // 193.069µs + 517947.467923, // 517.947µs + 1389495.494373, // 1.389495ms + 3727593.720315, // 3.727593ms + 10000000.000000, // 9.999999ms + 26826957.952797, // 26.826957ms + 71968567.300115, // 71.968567ms + 193069772.888325, // 193.069772ms + 517947467.923120, // 517.947467ms + 1389495494.373135, // 1.389495494s + 3727593720.314933, // 3.72759372s + 9999999999.999981, // 9.999999999s +} + +// NetworkLatencyBuckets are prometheus histogram buckets suitable for a histogram +// that records a quantity (nanosecond-denominated) in which most measurements +// behave like network latencies, i.e. most measurements are in the ms to sub-second +// range during normal operation. +var NetworkLatencyBuckets = []float64{ + // Generated via TestHistogramBuckets/NetworkLatencyBuckets. + 500000.000000, // 500µs + 860513.842995, // 860.513µs + 1480968.147973, // 1.480968ms + 2548787.184731, // 2.548787ms + 4386533.310619, // 4.386533ms + 7549345.273094, // 7.549345ms + 12992632.226094, // 12.992632ms + 22360679.774998, // 22.360679ms + 38483348.970335, // 38.483348ms + 66230909.027573, // 66.230909ms + 113985228.104760, // 113.985228ms + 196171733.362212, // 196.171733ms + 337616984.325077, // 337.616984ms + 581048177.284016, // 581.048177ms + 999999999.999999, // 999.999999ms, +} + +// NewHistogramV2 is a prometheus-backed histogram. Depending on the value of +// opts.Buckets, this is suitable for recording any kind of quantity. Common +// sensible choices are {IO,Network}LatencyBuckets. +func NewHistogramV2( + meta Metadata, windowDuration time.Duration, opts prometheus.HistogramOpts, +) *HistogramV2 { + // TODO(obs-inf): prometheus supports labeled histograms but they require more + // plumbing and don't fit into the PrometheusObservable interface any more. + cum := prometheus.NewHistogram(opts) + h := &HistogramV2{ + Metadata: meta, + cum: cum, + } + h.windowed.tickHelper = &tickHelper{ + nextT: now(), + tickInterval: windowDuration, + onTick: func() { + h.windowed.prev = h.windowed.cur + h.windowed.cur = prometheus.NewHistogram(opts) + }, + } + h.windowed.tickHelper.onTick() + return h +} + +var _ periodic = (*HistogramV2)(nil) +var _ PrometheusExportable = (*HistogramV2)(nil) + +type HistogramV2 struct { + Metadata + cum prometheus.Histogram + + // TODO(obs-inf): the way we implement windowed histograms is not great. If + // the windowed histogram is pulled right after a tick, it will be mostly + // empty. We could add a third bucket and represent the merged view of the two + // most recent buckets to avoid that. Or we could "just" double the rotation + // interval (so that the histogram really collects for 20s when we expect to + // persist the contents every 10s). Really it would make more sense to + // explicitly rotate the histogram atomically with collecting its contents, + // but that is now how we have set it up right now. It should be doable + // though, since there is only one consumer of windowed histograms - our + // internal timeseries system. + windowed struct { + // prometheus.Histogram is thread safe, so we only + // need an RLock to record into it. But write lock + // is held while rotating. + syncutil.RWMutex + *tickHelper + prev, cur prometheus.Histogram + } +} + +func (h *HistogramV2) nextTick() time.Time { + h.windowed.RLock() + defer h.windowed.RUnlock() + return h.windowed.nextTick() +} + +func (h *HistogramV2) tick() { + h.windowed.Lock() + defer h.windowed.Unlock() + h.windowed.tick() +} + +func (h *HistogramV2) Windowed() prometheus.Histogram { + h.windowed.RLock() + defer h.windowed.RUnlock() + return h.windowed.cur +} + +func (h *HistogramV2) RecordValue(n int64) { + v := float64(n) + h.cum.Observe(v) + + h.windowed.RLock() + defer h.windowed.RUnlock() + h.windowed.cur.Observe(v) + +} + +func (h *HistogramV2) GetType() *prometheusgo.MetricType { + return prometheusgo.MetricType_HISTOGRAM.Enum() +} + +func (h *HistogramV2) ToPrometheusMetric() *prometheusgo.Metric { + m := &prometheusgo.Metric{} + if err := h.cum.Write(m); err != nil { + panic(err) // TODD + } + return m +} + +func (h *HistogramV2) GetMetadata() Metadata { + return h.Metadata +} + +func (h *HistogramV2) Inspect(f func(interface{})) { + h.windowed.Lock() + maybeTick(&h.windowed) + h.windowed.Unlock() + f(h) +} + // A Counter holds a single mutable atomic value. type Counter struct { Metadata diff --git a/pkg/util/metric/metric_test.go b/pkg/util/metric/metric_test.go index 78f4aa040dcc..619886306d8e 100644 --- a/pkg/util/metric/metric_test.go +++ b/pkg/util/metric/metric_test.go @@ -13,15 +13,19 @@ package metric import ( "bytes" "encoding/json" + "fmt" "math" "reflect" + "strings" "sync" "testing" "time" _ "github.com/cockroachdb/cockroach/pkg/util/log" // for flags "github.com/kr/pretty" + "github.com/prometheus/client_golang/prometheus" prometheusgo "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/require" ) func testMarshal(t *testing.T, m json.Marshaler, exp string) { @@ -135,6 +139,58 @@ func TestHistogramPrometheus(t *testing.T) { } } +func TestHistogramBuckets(t *testing.T) { + verifyAndPrint := func(t *testing.T, exp, act []float64) { + t.Helper() + var buf strings.Builder + for idx, f := range exp { + if idx == 0 { + fmt.Fprintf(&buf, "// Generated via %s.", t.Name()) + } + fmt.Fprintf(&buf, "\n%f, // %s", f, time.Duration(f)) + } + t.Logf("%s", &buf) + require.InDeltaSlice(t, exp, act, 1 /* delta */, "Please update the bucket boundaries for %s", t.Name()) + } + t.Run("IOLatencyBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBucketsRange(10e3, 10e9, 15) + verifyAndPrint(t, exp, IOLatencyBuckets) + }) + + t.Run("NetworkLatencyBuckets", func(t *testing.T) { + exp := prometheus.ExponentialBucketsRange(500e3, 1e9, 15) + verifyAndPrint(t, exp, NetworkLatencyBuckets) + }) +} + +func TestNewHistogramV2Rotate(t *testing.T) { + defer TestingSetNow(nil)() + setNow(0) + + h := NewHistogramV2(emptyMetadata, 10*time.Second, prometheus.HistogramOpts{Buckets: nil}) + for i := 0; i < 4; i++ { + // Windowed histogram is initially empty. + h.Inspect(func(interface{}) {}) // triggers ticking + var m prometheusgo.Metric + require.NoError(t, h.Windowed().Write(&m)) + require.Zero(t, *m.Histogram.SampleSum) + // But cumulative histogram has history (if i > 0). + require.EqualValues(t, i, *h.ToPrometheusMetric().Histogram.SampleCount) + + // Add a measurement and verify it's there. + { + h.RecordValue(12345) + f := float64(12345) + var m prometheusgo.Metric + require.NoError(t, h.Windowed().Write(&m)) + require.Equal(t, *m.Histogram.SampleSum, f) + } + // Tick. This rotates the histogram. + setNow(time.Duration(i+1) * 10 * time.Second) + // Go to beginning. + } +} + func TestHistogramRotate(t *testing.T) { defer TestingSetNow(nil)() setNow(0) diff --git a/pkg/util/metric/sliding_histogram.go b/pkg/util/metric/sliding_histogram.go deleted file mode 100644 index 20262befc1ef..000000000000 --- a/pkg/util/metric/sliding_histogram.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2015 The Cockroach Authors. -// -// Use of this software is governed by the Business Source License -// included in the file licenses/BSL.txt. -// -// As of the Change Date specified in that file, in accordance with -// the Business Source License, use of this software will be governed -// by the Apache License, Version 2.0, included in the file -// licenses/APL.txt. - -package metric - -import ( - "time" - - "github.com/codahale/hdrhistogram" -) - -var _ periodic = &slidingHistogram{} - -// A deprecatedWindowedHistogram is a wrapper around an -// hdrhistogram.WindowedHistogram. The caller must enforce proper -// synchronization. -type slidingHistogram struct { - windowed *hdrhistogram.WindowedHistogram - nextT time.Time - duration time.Duration -} - -// newSlidingHistogram creates a new windowed HDRHistogram with the given -// parameters. Data is kept in the active window for approximately the given -// duration. See the documentation for hdrhistogram.WindowedHistogram for -// details. -func newSlidingHistogram(duration time.Duration, maxVal int64, sigFigs int) *slidingHistogram { - if duration <= 0 { - panic("cannot create a sliding histogram with nonpositive duration") - } - return &slidingHistogram{ - nextT: now(), - duration: duration, - windowed: hdrhistogram.NewWindowed(histWrapNum, 0, maxVal, sigFigs), - } -} - -func (h *slidingHistogram) tick() { - h.nextT = h.nextT.Add(h.duration / histWrapNum) - h.windowed.Rotate() -} - -func (h *slidingHistogram) nextTick() time.Time { - return h.nextT -} - -func (h *slidingHistogram) Current() *hdrhistogram.Histogram { - maybeTick(h) - return h.windowed.Merge() -} - -func (h *slidingHistogram) RecordValue(v int64) error { - return h.windowed.Current.RecordValue(v) -} diff --git a/pkg/util/metric/tick_helper.go b/pkg/util/metric/tick_helper.go new file mode 100644 index 000000000000..fbc6ab19e96a --- /dev/null +++ b/pkg/util/metric/tick_helper.go @@ -0,0 +1,34 @@ +// Copyright 2015 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package metric + +import "time" + +var _ periodic = &tickHelper{} + +// tickHelper is used by metrics that are at heart cumulative, but wish to also +// maintain a windowed version to work around limitations of our internal +// timeseries database. +type tickHelper struct { + nextT time.Time + tickInterval time.Duration + + onTick func() +} + +func (s *tickHelper) nextTick() time.Time { + return s.nextT +} + +func (s *tickHelper) tick() { + s.nextT = s.nextT.Add(s.tickInterval) + s.onTick() +} From d7d5838f2be6c5f9d1acfa0e8cde8a0950b0520c Mon Sep 17 00:00:00 2001 From: Aaditya Sondhi Date: Tue, 9 Aug 2022 17:26:09 -0400 Subject: [PATCH 2/2] metric: export quantiles from prometheus-based histogram This change builds on the previous one and adds a function to export quantiles from the Prometheus-based histogram. This functionality is used to store histogram data in the internal timeseries database. The hdr library came with a function to do this, while Prometheus does not have a public API for exporting quantiles. The function implemented here is very similar to the one found internally in Prometheus, using linear interpolation to calculate values at a given quantile. This commit also includes some additional testing and general refactoring of the metrics code. Release note: None Release justification: low risk, high benefit changes --- pkg/kv/kvserver/BUILD.bazel | 1 - pkg/kv/kvserver/metrics.go | 9 +-- pkg/kv/kvserver/scheduler.go | 2 +- pkg/server/status/BUILD.bazel | 4 +- pkg/server/status/recorder.go | 46 ++++++--------- pkg/server/status/recorder_test.go | 12 ++++ pkg/util/metric/BUILD.bazel | 1 + pkg/util/metric/metric.go | 95 +++++++++++++++++++++++++++++- pkg/util/metric/metric_test.go | 75 +++++++++++++++++++++++ 9 files changed, 203 insertions(+), 42 deletions(-) diff --git a/pkg/kv/kvserver/BUILD.bazel b/pkg/kv/kvserver/BUILD.bazel index 5c290bfec99d..0ecbdb630aa5 100644 --- a/pkg/kv/kvserver/BUILD.bazel +++ b/pkg/kv/kvserver/BUILD.bazel @@ -208,7 +208,6 @@ go_library( "@com_github_gogo_protobuf//proto", "@com_github_google_btree//:btree", "@com_github_kr_pretty//:pretty", - "@com_github_prometheus_client_golang//prometheus", "@io_etcd_go_etcd_raft_v3//:raft", "@io_etcd_go_etcd_raft_v3//raftpb", "@io_etcd_go_etcd_raft_v3//tracker", diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index e071d5e2c3b1..8a47f9e2e975 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -31,7 +31,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/slidingwindow" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" - "github.com/prometheus/client_golang/prometheus" "go.etcd.io/etcd/raft/v3/raftpb" ) @@ -1753,7 +1752,7 @@ type StoreMetrics struct { RaftCommandCommitLatency *metric.Histogram RaftHandleReadyLatency *metric.Histogram RaftApplyCommittedLatency *metric.Histogram - RaftSchedulerLatency *metric.HistogramV2 + RaftSchedulerLatency *metric.Histogram RaftTimeoutCampaign *metric.Counter // Raft message metrics. @@ -2256,10 +2255,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { RaftCommandCommitLatency: metric.NewLatency(metaRaftCommandCommitLatency, histogramWindow), RaftHandleReadyLatency: metric.NewLatency(metaRaftHandleReadyLatency, histogramWindow), RaftApplyCommittedLatency: metric.NewLatency(metaRaftApplyCommittedLatency, histogramWindow), - RaftSchedulerLatency: metric.NewHistogramV2(metaRaftSchedulerLatency, histogramWindow, prometheus.HistogramOpts{ - Buckets: metric.IOLatencyBuckets, - }), - RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), + RaftSchedulerLatency: metric.NewLatency(metaRaftSchedulerLatency, histogramWindow), + RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), // Raft message metrics. RaftRcvdMessages: [...]*metric.Counter{ diff --git a/pkg/kv/kvserver/scheduler.go b/pkg/kv/kvserver/scheduler.go index 2ed446471219..dbfdddb843d6 100644 --- a/pkg/kv/kvserver/scheduler.go +++ b/pkg/kv/kvserver/scheduler.go @@ -169,7 +169,7 @@ type raftScheduleState struct { type raftScheduler struct { ambientContext log.AmbientContext processor raftProcessor - latency *metric.HistogramV2 + latency *metric.Histogram numWorkers int mu struct { diff --git a/pkg/server/status/BUILD.bazel b/pkg/server/status/BUILD.bazel index a4ddbc1fa350..9c846de34936 100644 --- a/pkg/server/status/BUILD.bazel +++ b/pkg/server/status/BUILD.bazel @@ -65,11 +65,8 @@ go_library( "//pkg/util/timeutil", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_redact//:redact", - "@com_github_codahale_hdrhistogram//:hdrhistogram", "@com_github_dustin_go_humanize//:go-humanize", "@com_github_elastic_gosigar//:gosigar", - "@com_github_prometheus_client_golang//prometheus", - "@com_github_prometheus_client_model//go", "@com_github_shirou_gopsutil_v3//net", ] + select({ "@io_bazel_rules_go//go/platform:aix": [ @@ -150,6 +147,7 @@ go_test( "//pkg/util/system", "//pkg/util/timeutil", "@com_github_kr_pretty//:pretty", + "@com_github_prometheus_client_golang//prometheus", "@com_github_shirou_gopsutil_v3//net", ], ) diff --git a/pkg/server/status/recorder.go b/pkg/server/status/recorder.go index 3667b5eb6741..3d6fe5ad18ce 100644 --- a/pkg/server/status/recorder.go +++ b/pkg/server/status/recorder.go @@ -46,7 +46,6 @@ import ( "github.com/cockroachdb/redact" "github.com/dustin/go-humanize" "github.com/elastic/gosigar" - prometheusgo "github.com/prometheus/client_model/go" ) const ( @@ -528,23 +527,7 @@ type registryRecorder struct { } func extractValue(name string, mtr interface{}, fn func(string, float64)) error { - // TODO(tschottdorf,ajwerner): consider moving this switch to a single - // interface implemented by the individual metric types. - type ( - float64Valuer interface{ Value() float64 } - int64Valuer interface{ Value() int64 } - int64Counter interface{ Count() int64 } - prometheusMetricValuer interface{ ToPrometheusMetric() *prometheusgo.Metric } - ) switch mtr := mtr.(type) { - case float64: - fn(name, mtr) - case float64Valuer: - fn(name, mtr.Value()) - case int64Valuer: - fn(name, float64(mtr.Value())) - case int64Counter: - fn(name, float64(mtr.Count())) case *metric.Histogram: // TODO(mrtracy): Where should this comment go for better // visibility? @@ -566,19 +549,24 @@ func extractValue(name string, mtr interface{}, fn func(string, float64)) error } fn(name+"-count", float64(curr.TotalCount())) case *metric.HistogramV2: - // NB: this branch is intentionally at the bottom since all metrics implement it. - cur := mtr.Windowed() - var m prometheusgo.Metric - _ = cur.Write(&m) - hist := m.Histogram - n := float64(*hist.SampleCount) + n := float64(mtr.TotalCountWindowed()) fn(name+"-count", n) - fn(name+"-avg", *hist.SampleSum/n) - // TODO(obs-inf): add quantiles like for the hdrhistogram. - case prometheusMetricValuer: - // TODO we should be able to handle all non-histogram branches using this, i.e. - // can delete the float, int, etc, cases above. - _ = mtr.ToPrometheusMetric() + avg := mtr.TotalSumWindowed() / n + if math.IsNaN(avg) || math.IsInf(avg, +1) || math.IsInf(avg, -1) { + avg = 0 + } + fn(name+"-avg", avg) + for _, pt := range recordHistogramQuantiles { + fn(name+pt.suffix, mtr.ValueAtQuantileWindowed(pt.quantile)) + } + case metric.PrometheusExportable: + // NB: this branch is intentionally at the bottom since all metrics implement it. + m := mtr.ToPrometheusMetric() + if m.Gauge != nil { + fn(name, *m.Gauge.Value) + } else if m.Counter != nil { + fn(name, *m.Counter.Value) + } default: return errors.Errorf("cannot extract value for type %T", mtr) diff --git a/pkg/server/status/recorder_test.go b/pkg/server/status/recorder_test.go index 241a6733dd95..3628c975643a 100644 --- a/pkg/server/status/recorder_test.go +++ b/pkg/server/status/recorder_test.go @@ -33,6 +33,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/system" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/kr/pretty" + "github.com/prometheus/client_golang/prometheus" ) // byTimeAndName is a slice of tspb.TimeSeriesData. @@ -202,6 +203,7 @@ func TestMetricsRecorder(t *testing.T) { {"testGaugeFloat64", "floatgauge", 20}, {"testCounter", "counter", 5}, {"testHistogram", "histogram", 10}, + {"testHistogramV2", "histogramV2", 9}, {"testLatency", "latency", 10}, {"testAggGauge", "agggauge", 4}, {"testAggCounter", "aggcounter", 7}, @@ -294,6 +296,16 @@ func TestMetricsRecorder(t *testing.T) { addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, data.val, reg.isNode) } addExpected(reg.prefix, data.name+"-count", reg.source, 100, 1, reg.isNode) + case "histogramV2": + h := metric.NewHistogramV2(metric.Metadata{Name: reg.prefix + data.name}, time.Second, + prometheus.HistogramOpts{Buckets: []float64{1.0, 10.0, 100.0, 1000.0}}) + reg.reg.AddMetric(h) + h.RecordValue(data.val) + for _, q := range recordHistogramQuantiles { + addExpected(reg.prefix, data.name+q.suffix, reg.source, 100, 10, reg.isNode) + } + addExpected(reg.prefix, data.name+"-count", reg.source, 100, 1, reg.isNode) + addExpected(reg.prefix, data.name+"-avg", reg.source, 100, 9, reg.isNode) case "latency": l := metric.NewLatency(metric.Metadata{Name: reg.prefix + data.name}, time.Hour) reg.reg.AddMetric(l) diff --git a/pkg/util/metric/BUILD.bazel b/pkg/util/metric/BUILD.bazel index 5db5d0bfe2dc..2a74e5daf558 100644 --- a/pkg/util/metric/BUILD.bazel +++ b/pkg/util/metric/BUILD.bazel @@ -51,6 +51,7 @@ go_test( deps = [ "//pkg/util/log", "@com_github_kr_pretty//:pretty", + "@com_github_prometheus_client_golang//prometheus", "@com_github_prometheus_client_model//go", "@com_github_stretchr_testify//require", ], diff --git a/pkg/util/metric/metric.go b/pkg/util/metric/metric.go index f7d1c833b67e..46981240b87b 100644 --- a/pkg/util/metric/metric.go +++ b/pkg/util/metric/metric.go @@ -13,6 +13,7 @@ package metric import ( "encoding/json" "math" + "sort" "sync/atomic" "time" @@ -231,6 +232,7 @@ func NewLatency(metadata Metadata, histogramWindow time.Duration) *Histogram { func (h *Histogram) Windowed() (*hdrhistogram.Histogram, time.Duration) { h.mu.Lock() defer h.mu.Unlock() + maybeTick(h.mu.tickHelper) // TODO(obs-inf): not sure we should multiply by histWrapNum here, but it // has been the behavior for a long time. return cloneHistogram(h.mu.sliding.Merge()), histWrapNum * h.mu.tickInterval @@ -404,6 +406,12 @@ func NewHistogramV2( var _ periodic = (*HistogramV2)(nil) var _ PrometheusExportable = (*HistogramV2)(nil) +// HistogramV2 is a prometheus-backed histogram. It collects observed values by +// keeping bucketed counts. For convenience, internally two sets of buckets are +// kept: A cumulative set (i.e. data is never evicted) and a windowed set (which +// keeps only recently collected samples). +// +// New buckets are created using TestHistogramBuckets. type HistogramV2 struct { Metadata cum prometheus.Histogram @@ -440,12 +448,14 @@ func (h *HistogramV2) tick() { h.windowed.tick() } +// Windowed returns a copy of the current windowed histogram. func (h *HistogramV2) Windowed() prometheus.Histogram { h.windowed.RLock() defer h.windowed.RUnlock() return h.windowed.cur } +// RecordValue adds the given value to the histogram. func (h *HistogramV2) RecordValue(n int64) { v := float64(n) h.cum.Observe(v) @@ -453,25 +463,40 @@ func (h *HistogramV2) RecordValue(n int64) { h.windowed.RLock() defer h.windowed.RUnlock() h.windowed.cur.Observe(v) - } +// GetType returns the prometheus type enum for this metric. func (h *HistogramV2) GetType() *prometheusgo.MetricType { return prometheusgo.MetricType_HISTOGRAM.Enum() } +// ToPrometheusMetric returns a filled-in prometheus metric of the right type. func (h *HistogramV2) ToPrometheusMetric() *prometheusgo.Metric { m := &prometheusgo.Metric{} if err := h.cum.Write(m); err != nil { - panic(err) // TODD + panic(err) } return m } +// ToPrometheusMetricWindowed returns a filled-in prometheus metric of the right type. +func (h *HistogramV2) ToPrometheusMetricWindowed() *prometheusgo.Metric { + h.windowed.Lock() + defer h.windowed.Unlock() + m := &prometheusgo.Metric{} + if err := h.windowed.cur.Write(m); err != nil { + panic(err) + } + return m +} + +// GetMetadata returns the metric's metadata including the Prometheus +// MetricType. func (h *HistogramV2) GetMetadata() Metadata { return h.Metadata } +// Inspect calls the closure. func (h *HistogramV2) Inspect(f func(interface{})) { h.windowed.Lock() maybeTick(&h.windowed) @@ -479,6 +504,72 @@ func (h *HistogramV2) Inspect(f func(interface{})) { f(h) } +// TotalCount returns the (cumulative) number of samples. +func (h *HistogramV2) TotalCount() int64 { + return int64(h.ToPrometheusMetric().Histogram.GetSampleCount()) +} + +// TotalCountWindowed returns the number of samples in the current window. +func (h *HistogramV2) TotalCountWindowed() int64 { + return int64(h.ToPrometheusMetricWindowed().Histogram.GetSampleCount()) +} + +// TotalSum returns the (cumulative) number of samples. +func (h *HistogramV2) TotalSum() float64 { + return h.ToPrometheusMetric().Histogram.GetSampleSum() +} + +// TotalSumWindowed returns the number of samples in the current window. +func (h *HistogramV2) TotalSumWindowed() float64 { + return h.ToPrometheusMetricWindowed().Histogram.GetSampleSum() +} + +// ValueAtQuantileWindowed takes a quantile value [0,100] and returns the +// interpolated value at that quantile for the windowed histogram. +// +// https://github.com/prometheus/prometheus/blob/d91621890a2ccb3191a6d74812cc1827dd4093bf/promql/quantile.go#L75 +// This function is mostly taken from a prometheus internal function that +// does the same thing. There are a few differences for our use case: +// 1. As a user of the prometheus go client library, we don't have access +// to the implicit +Inf bucket, so we don't need special cases to deal +// with the quantiles that include the +Inf bucket. +// 2. Since the prometheus client library ensures buckets are in a strictly +// increasing order at creation, we do not sort them. +func (h *HistogramV2) ValueAtQuantileWindowed(q float64) float64 { + m := h.ToPrometheusMetricWindowed() + + buckets := m.Histogram.Bucket + n := float64(*m.Histogram.SampleCount) + if n == 0 { + return 0 + } + + rank := uint64(((q / 100) * n) + 0.5) + b := sort.Search(len(buckets)-1, func(i int) bool { return *buckets[i].CumulativeCount >= rank }) + + var ( + bucketStart float64 + bucketEnd = *buckets[b].UpperBound + count = *buckets[b].CumulativeCount + ) + + // Calculate the linearly interpolated value within the bucket + if b > 0 { + bucketStart = *buckets[b-1].UpperBound + count -= *buckets[b-1].CumulativeCount + rank -= *buckets[b-1].CumulativeCount + } + val := bucketStart + (bucketEnd-bucketStart)*(float64(rank)/float64(count)) + if math.IsNaN(val) || math.IsInf(val, -1) { + return 0 + } + // should not extrapolate past the upper bound of the largest bucket + if val > *buckets[len(buckets)-1].UpperBound { + return *buckets[len(buckets)-1].UpperBound + } + return val +} + // A Counter holds a single mutable atomic value. type Counter struct { Metadata diff --git a/pkg/util/metric/metric_test.go b/pkg/util/metric/metric_test.go index 619886306d8e..f65e31aea098 100644 --- a/pkg/util/metric/metric_test.go +++ b/pkg/util/metric/metric_test.go @@ -139,6 +139,81 @@ func TestHistogramPrometheus(t *testing.T) { } } +func TestHistogramV2(t *testing.T) { + u := func(v int) *uint64 { + n := uint64(v) + return &n + } + + f := func(v int) *float64 { + n := float64(v) + return &n + } + + h := NewHistogramV2( + Metadata{}, + time.Hour, + prometheus.HistogramOpts{ + Namespace: "", + Subsystem: "", + Name: "", + Help: "", + ConstLabels: nil, + Buckets: []float64{ + 1.0, + 5.0, + 10.0, + 25.0, + 100.0, + }, + }, + ) + + // should return 0 if no observations are made + require.Equal(t, 0.0, h.ValueAtQuantileWindowed(0)) + + // 200 is intentionally set us the first value to verify that the function + // does not return NaN or Inf. + measurements := []int64{200, 0, 4, 5, 10, 20, 25, 30, 40, 90} + var expSum float64 + for i, m := range measurements { + h.RecordValue(m) + if i == 0 { + require.Equal(t, 0.0, h.ValueAtQuantileWindowed(0)) + require.Equal(t, 100.0, h.ValueAtQuantileWindowed(99)) + } + expSum += float64(m) + } + + act := *h.ToPrometheusMetric().Histogram + exp := prometheusgo.Histogram{ + SampleCount: u(len(measurements)), + SampleSum: &expSum, + Bucket: []*prometheusgo.Bucket{ + {CumulativeCount: u(1), UpperBound: f(1)}, + {CumulativeCount: u(3), UpperBound: f(5)}, + {CumulativeCount: u(4), UpperBound: f(10)}, + {CumulativeCount: u(6), UpperBound: f(25)}, + {CumulativeCount: u(9), UpperBound: f(100)}, + // NB: 200 is greater than the largest defined bucket so prometheus + // puts it in an implicit bucket with +Inf as the upper bound. + }, + } + + if !reflect.DeepEqual(act, exp) { + t.Fatalf("expected differs from actual: %s", pretty.Diff(exp, act)) + } + + require.Equal(t, 0.0, h.ValueAtQuantileWindowed(0)) + require.Equal(t, 1.0, h.ValueAtQuantileWindowed(10)) + require.Equal(t, 17.5, h.ValueAtQuantileWindowed(50)) + require.Equal(t, 75.0, h.ValueAtQuantileWindowed(80)) + require.Equal(t, 100.0, h.ValueAtQuantileWindowed(99.99)) +} + +// TestHistogramBuckets is used to generate additional prometheus buckets to be +// used with HistogramV2. Please include obs-inf in the review process of new +// buckets. func TestHistogramBuckets(t *testing.T) { verifyAndPrint := func(t *testing.T, exp, act []float64) { t.Helper()