From 0ee98bafad88a81b269e23873872f1eb0344da73 Mon Sep 17 00:00:00 2001
From: William Dumont
Date: Fri, 13 Sep 2024 15:08:05 +0200
Subject: [PATCH] add cluster_name label to cluster metrics (#61)

---
 metrics.go | 33 +++++++++++++++++++++++++++++++--
 node.go    |  4 ++--
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/metrics.go b/metrics.go
index 34d3966..97a06e1 100644
--- a/metrics.go
+++ b/metrics.go
@@ -18,6 +18,8 @@ const (
 	eventNodeConflict = "node_conflict"
 )
 
+const clusterNameLabel = "cluster_name"
+
 // metrics holds the set of metrics for a Node. Additional Collectors can be
 // registered by calling Add.
 type metrics struct {
@@ -33,38 +35,56 @@ type metrics struct {
 
 var _ prometheus.Collector = (*metrics)(nil)
 
-func newMetrics() *metrics {
+func newMetrics(clusterName string) *metrics {
 	var m metrics
 
 	m.gossipEventsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Name: "cluster_node_gossip_received_events_total",
 		Help: "Total number of gossip messages handled by the node.",
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	}, []string{"event"})
 
 	m.nodePeers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 		Name: "cluster_node_peers",
 		Help: "Current number of healthy peers by state",
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	}, []string{"state"})
 
 	m.nodeUpdating = prometheus.NewGauge(prometheus.GaugeOpts{
 		Name: "cluster_node_updating",
 		Help: "1 if the node is currently processing a change to the cluster state.",
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	})
 
 	m.nodeUpdateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Name:    "cluster_node_update_duration_seconds",
 		Help:    "Histogram of the latency it took to process a change to the cluster state.",
 		Buckets: prometheus.DefBuckets,
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	})
 
 	m.nodeObservers = prometheus.NewGauge(prometheus.GaugeOpts{
 		Name: "cluster_node_update_observers",
 		Help: "Number of internal observers waiting for changes to cluster state.",
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	})
 
 	m.nodeInfo = metricsutil.NewInfoCollector(metricsutil.InfoOpts{
 		Name: "cluster_node_info",
 		Help: "Info about the local node. Label values will change as the node changes state.",
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	}, "state")
 
 	m.Add(
@@ -79,12 +99,15 @@ func newMetrics() *metrics {
 	return &m
 }
 
-func newMemberlistCollector(ml *memberlist.Memberlist) prometheus.Collector {
+func newMemberlistCollector(ml *memberlist.Memberlist, clusterName string) prometheus.Collector {
 	var container metricsutil.Container
 
 	gossipProtoVersion := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 		Name: "cluster_node_gossip_proto_version",
 		Help: "Gossip protocol version used by nodes to maintain the cluster",
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	}, func() float64 {
 		// NOTE(rfratto): while this is static at the time of writing, the internal
 		// documentation for memberlist claims that ProtocolVersion may one day be
@@ -95,6 +118,9 @@ func newMemberlistCollector(ml *memberlist.Memberlist) prometheus.Collector {
 	gossipHealthScore := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 		Name: "cluster_node_gossip_health_score",
 		Help: "Health value of a node; lower values means healthier. 0 is the minimum.",
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	}, func() float64 {
 		return float64(ml.GetHealthScore())
 	})
@@ -102,6 +128,9 @@ func newMemberlistCollector(ml *memberlist.Memberlist) prometheus.Collector {
 	gossipPeers := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 		Name: "cluster_node_gossip_alive_peers",
 		Help: "How many alive gossip peers a node has, including the local node.",
+		ConstLabels: prometheus.Labels{
+			clusterNameLabel: clusterName,
+		},
 	}, func() float64 {
 		return float64(ml.NumMembers())
 	})
diff --git a/node.go b/node.go
index 721b35d..28b67fb 100644
--- a/node.go
+++ b/node.go
@@ -182,7 +182,7 @@ func NewNode(cli *http.Client, cfg Config) (*Node, error) {
 	n := &Node{
 		log: cfg.Log,
 		cfg: cfg,
-		m:   newMetrics(),
+		m:   newMetrics(mlc.Label),
 
 		notifyObserversQueue: queue.New(1),
 
@@ -209,7 +209,7 @@ func NewNode(cli *http.Client, cfg Config) (*Node, error) {
 
 	// Include some extra metrics.
 	n.m.Add(
-		newMemberlistCollector(ml),
+		newMemberlistCollector(ml, mlc.Label),
 		transportMetrics,
 		prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 			Name: "cluster_node_lamport_time",