From 900ff223f3b0efc66e913b8a1fe86f638e8db6fd Mon Sep 17 00:00:00 2001 From: William Dumont Date: Fri, 13 Sep 2024 16:40:08 +0200 Subject: [PATCH 1/2] Add label cluster_name to alloy_config_hash and some cluster_ metrics (#1679) * Add cluster name in metric alloy_config_hash via the label alloy_cluster * changelog entry * bump ckit version to get cluster_name label on cluster metrics * change label to cluster_name to be inline with ckit * simplify config_metrics logic because empty label are ignored * update alerts with cluster_name label --- CHANGELOG.md | 5 +++ go.mod | 12 +++---- go.sum | 24 +++++++------- internal/alloycli/cmd_run.go | 5 +-- .../config/instrumentation/config_metrics.go | 31 ++++++++----------- .../alloy-mixin/alerts/clustering.libsonnet | 12 +++---- 6 files changed, 43 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9155d37e7e..0e5763c060 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,11 @@ internal API changes are not present. Main (unreleased) ----------------- +### Enhancements + +- Add the label `alloy_cluster` in the metric `alloy_config_hash` when the flag `cluster.name` is set to help differentiate between + configs from the same alloy cluster or different alloy clusters. 
(@wildum) + v1.4.0-rc.0 ----------------- diff --git a/go.mod b/go.mod index 0b54ff5b1b..793387c56a 100644 --- a/go.mod +++ b/go.mod @@ -57,7 +57,7 @@ require ( github.com/grafana/alloy/syntax v0.1.0 github.com/grafana/beyla v1.8.2 github.com/grafana/catchpoint-prometheus-exporter v0.0.0-20240606062944-e55f3668661d - github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa + github.com/grafana/ckit v0.0.0-20240913130805-0ee98bafad88 github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2 github.com/grafana/dskit v0.0.0-20240104111617-ea101a3b86eb github.com/grafana/go-gelf/v2 v2.0.1 @@ -154,7 +154,7 @@ require ( github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0 github.com/prometheus-operator/prometheus-operator/pkg/client v0.66.0 github.com/prometheus/blackbox_exporter v0.24.1-0.20230623125439-bd22efa1c900 - github.com/prometheus/client_golang v1.20.2 + github.com/prometheus/client_golang v1.20.3 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.55.0 github.com/prometheus/common/sigv4 v0.1.0 @@ -242,13 +242,13 @@ require ( go.uber.org/goleak v1.3.0 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.27.0 - golang.org/x/crypto v0.26.0 + golang.org/x/crypto v0.27.0 golang.org/x/crypto/x509roots/fallback v0.0.0-20240208163226-62c9f1799c91 golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 - golang.org/x/net v0.28.0 + golang.org/x/net v0.29.0 golang.org/x/oauth2 v0.22.0 golang.org/x/sys v0.25.0 - golang.org/x/text v0.17.0 + golang.org/x/text v0.18.0 golang.org/x/time v0.5.0 golang.org/x/tools v0.23.0 google.golang.org/api v0.188.0 @@ -791,7 +791,7 @@ require ( golang.org/x/arch v0.7.0 // indirect golang.org/x/mod v0.19.0 // indirect golang.org/x/sync v0.8.0 // indirect - golang.org/x/term v0.23.0 // indirect + golang.org/x/term v0.24.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect gonum.org/v1/gonum v0.15.1 // indirect diff 
--git a/go.sum b/go.sum index 31a4d90768..2ea57b6f06 100644 --- a/go.sum +++ b/go.sum @@ -1196,8 +1196,8 @@ github.com/grafana/cadvisor v0.0.0-20240729082359-1f04a91701e2 h1:ju6EcY2aEobeBg github.com/grafana/cadvisor v0.0.0-20240729082359-1f04a91701e2/go.mod h1:8sLW/G7rcFe1CKMaA4pYT4mX3P1xQVGqM6luzEzx/2g= github.com/grafana/catchpoint-prometheus-exporter v0.0.0-20240606062944-e55f3668661d h1:6sNPBwOokfCxAyateu7iLdtyWDUzaLLShPs7F4eTLfw= github.com/grafana/catchpoint-prometheus-exporter v0.0.0-20240606062944-e55f3668661d/go.mod h1:aGPSALDAkw18nn8M7gumhM/MbJG+zgOA3jNWTwPYtLg= -github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa h1:3rdc/z801roM6ky8cT8wz4tahQWkTxJ4VAmzANZe8qQ= -github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa/go.mod h1:k21VjCNs7gj1pAV80wb1577fVRePk51Hek5QUMEvKE0= +github.com/grafana/ckit v0.0.0-20240913130805-0ee98bafad88 h1:GgbYRGz2+/Vgz8/lk19Ht8TQDsAudl51Qenuw+COs5k= +github.com/grafana/ckit v0.0.0-20240913130805-0ee98bafad88/go.mod h1:dDqep1rKTbq2ppMYEgIM88GaPXHp4i6Cp3qantiloA0= github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2 h1:qhugDMdQ4Vp68H0tp/0iN17DM2ehRo1rLEdOFe/gB8I= github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2/go.mod h1:w/aiO1POVIeXUQyl0VQSZjl5OAGDTL5aX+4v0RA1tcw= github.com/grafana/dskit v0.0.0-20240104111617-ea101a3b86eb h1:AWE6+kvtE18HP+lRWNUCyvymyrFSXs6TcS2vXIXGIuw= @@ -2156,8 +2156,8 @@ github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.11.1/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= -github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg= -github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= 
+github.com/prometheus/client_golang v1.20.3 h1:oPksm4K8B+Vt35tUhw6GbSNSgVlVSBH0qELP/7u83l4= +github.com/prometheus/client_golang v1.20.3/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= @@ -2769,8 +2769,8 @@ golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= -golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= -golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= +golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= golang.org/x/crypto/x509roots/fallback v0.0.0-20240208163226-62c9f1799c91 h1:Lyizcy9jX02jYR0ceBkL6S+jRys8Uepf7wt1vrz6Ras= golang.org/x/crypto/x509roots/fallback v0.0.0-20240208163226-62c9f1799c91/go.mod h1:kNa9WdvYnzFwC79zRpLRMJbdEFlhyM5RPFBBZp/wWH8= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -2896,8 +2896,8 @@ golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= -golang.org/x/net v0.28.0 
h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= -golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= golang.org/x/oauth2 v0.0.0-20170807180024-9a379c6b3e95/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -3070,8 +3070,8 @@ golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= -golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= -golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= +golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM= +golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -3092,8 +3092,8 @@ golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= 
-golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= diff --git a/internal/alloycli/cmd_run.go b/internal/alloycli/cmd_run.go index 6a969cc27f..7bd9c99ae4 100644 --- a/internal/alloycli/cmd_run.go +++ b/internal/alloycli/cmd_run.go @@ -321,8 +321,7 @@ func (fr *alloyRun) Run(configPath string) error { ready = f.Ready reload = func() (*alloy_runtime.Source, error) { alloySource, err := loadAlloySource(configPath, fr.configFormat, fr.configBypassConversionErrors, fr.configExtraArgs) - defer instrumentation.InstrumentSHA256(alloySource.SHA256()) - defer instrumentation.InstrumentLoad(err == nil) + defer instrumentation.InstrumentConfig(err == nil, alloySource.SHA256(), fr.clusterName) if err != nil { return nil, fmt.Errorf("reading config path %q: %w", configPath, err) @@ -476,8 +475,6 @@ func loadAlloySource(path string, converterSourceFormat string, converterBypassE } } - instrumentation.InstrumentConfig(bb) - return alloy_runtime.ParseSource(path, bb) } diff --git a/internal/static/config/instrumentation/config_metrics.go b/internal/static/config/instrumentation/config_metrics.go index add500eea7..9c0414c108 100644 --- a/internal/static/config/instrumentation/config_metrics.go +++ b/internal/static/config/instrumentation/config_metrics.go @@ -20,17 +20,20 @@ type configMetrics struct { var confMetrics *configMetrics var configMetricsInitializer sync.Once -func initializeConfigMetrics() { - confMetrics = newConfigMetrics() +func initializeConfigMetrics(clusterName 
string) { + confMetrics = newConfigMetrics(clusterName) } -func newConfigMetrics() *configMetrics { +func newConfigMetrics(clusterName string) *configMetrics { var m configMetrics m.configHash = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "alloy_config_hash", Help: "Hash of the currently active config file.", + ConstLabels: prometheus.Labels{ + "cluster_name": clusterName, + }, }, []string{"sha256"}, ) @@ -49,22 +52,11 @@ func newConfigMetrics() *configMetrics { return &m } -// Create a sha256 hash of the config before expansion and expose it via -// the alloy_config_hash metric. -func InstrumentConfig(buf []byte) { - InstrumentSHA256(sha256.Sum256(buf)) -} - -// InstrumentSHA256 stores the provided hash to the alloy_config_hash metric. -func InstrumentSHA256(hash [sha256.Size]byte) { - configMetricsInitializer.Do(initializeConfigMetrics) - confMetrics.configHash.Reset() - confMetrics.configHash.WithLabelValues(fmt.Sprintf("%x", hash)).Set(1) -} +func InstrumentConfig(success bool, hash [sha256.Size]byte, clusterName string) { + configMetricsInitializer.Do(func() { + initializeConfigMetrics(clusterName) + }) -// Expose metrics for load success / failures. 
-func InstrumentLoad(success bool) { - configMetricsInitializer.Do(initializeConfigMetrics) if success { confMetrics.configLoadSuccessSeconds.SetToCurrentTime() confMetrics.configLoadSuccess.Set(1) @@ -72,4 +64,7 @@ func InstrumentLoad(success bool) { confMetrics.configLoadSuccess.Set(0) confMetrics.configLoadFailures.Inc() } + + confMetrics.configHash.Reset() + confMetrics.configHash.WithLabelValues(fmt.Sprintf("%x", hash)).Set(1) } diff --git a/operations/alloy-mixin/alerts/clustering.libsonnet b/operations/alloy-mixin/alerts/clustering.libsonnet index 7216382476..5dbcf5710f 100644 --- a/operations/alloy-mixin/alerts/clustering.libsonnet +++ b/operations/alloy-mixin/alerts/clustering.libsonnet @@ -9,7 +9,7 @@ local alert = import './utils/alert.jsonnet'; alert.newRule( 'ClusterNotConverging', if enableK8sCluster then - 'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0' + 'stddev by (cluster, namespace, job, cluster_name) (sum without (state) (cluster_node_peers)) != 0' else 'stddev by (job) (sum without (state) (cluster_node_peers)) != 0', 'Cluster is not converging.', @@ -25,8 +25,8 @@ local alert = import './utils/alert.jsonnet'; // metrics. 
if enableK8sCluster then ||| sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) ||| else ||| sum without (state) (cluster_node_peers) != on (job) group_left @@ -53,7 +53,7 @@ local alert = import './utils/alert.jsonnet'; alert.newRule( 'ClusterNodeNameConflict', if enableK8sCluster then - 'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' + 'sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' else 'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' , @@ -66,7 +66,7 @@ local alert = import './utils/alert.jsonnet'; alert.newRule( 'ClusterNodeStuckTerminating', if enableK8sCluster then - 'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0' + 'sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0' else 'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0' , @@ -80,7 +80,7 @@ local alert = import './utils/alert.jsonnet'; 'ClusterConfigurationDrift', if enableK8sCluster then ||| count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 ||| else ||| count without (sha256) ( From 34d850e3148c48469952507ac738a37786c68bfc Mon Sep 17 00:00:00 2001 From: William Dumont Date: Fri, 13 Sep 2024 17:31:44 +0200 Subject: [PATCH 2/2] fix changelog (#1683) --- CHANGELOG.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 0e5763c060..c6348eb57f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,11 +10,6 @@ internal API changes are not present. Main (unreleased) ----------------- -### Enhancements - -- Add the label `alloy_cluster` in the metric `alloy_config_hash` when the flag `cluster.name` is set to help differentiate between - configs from the same alloy cluster or different alloy clusters. (@wildum) - v1.4.0-rc.0 ----------------- @@ -64,6 +59,9 @@ v1.4.0-rc.0 - `prometheus.exporter.cloudwatch` can now collect metrics from custom namespaces via the `custom_namespace` block. (@ptodev) +- Add the label `cluster_name` in the metric `alloy_config_hash` when the flag `cluster.name` is set to help differentiate between + configs from the same alloy cluster or different alloy clusters. (@wildum) + ### Bugfixes - Fix a bug where the scrape timeout for a Probe resource was not applied, overwriting the scrape interval instead. (@morremeyer, @stefanandres)