diff --git a/CHANGELOG.md b/CHANGELOG.md index 9155d37e7e..c6348eb57f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,9 @@ v1.4.0-rc.0 - `prometheus.exporter.cloudwatch` can now collect metrics from custom namespaces via the `custom_namespace` block. (@ptodev) +- Add the label `cluster_name` in the metric `alloy_config_hash` when the flag `cluster.name` is set to help differentiate between + configs from the same alloy cluster or different alloy clusters. (@wildum) + ### Bugfixes - Fix a bug where the scrape timeout for a Probe resource was not applied, overwriting the scrape interval instead. (@morremeyer, @stefanandres) diff --git a/go.mod b/go.mod index eebce03238..fe5a3f8ab1 100644 --- a/go.mod +++ b/go.mod @@ -57,7 +57,7 @@ require ( github.com/grafana/alloy/syntax v0.1.0 github.com/grafana/beyla v1.8.2 github.com/grafana/catchpoint-prometheus-exporter v0.0.0-20240606062944-e55f3668661d - github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa + github.com/grafana/ckit v0.0.0-20240913130805-0ee98bafad88 github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2 github.com/grafana/dskit v0.0.0-20240104111617-ea101a3b86eb github.com/grafana/go-gelf/v2 v2.0.1 @@ -154,7 +154,7 @@ require ( github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0 github.com/prometheus-operator/prometheus-operator/pkg/client v0.66.0 github.com/prometheus/blackbox_exporter v0.24.1-0.20230623125439-bd22efa1c900 - github.com/prometheus/client_golang v1.20.2 + github.com/prometheus/client_golang v1.20.3 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.55.0 github.com/prometheus/common/sigv4 v0.1.0 diff --git a/go.sum b/go.sum index c38016bb9d..92e8705aec 100644 --- a/go.sum +++ b/go.sum @@ -1196,8 +1196,8 @@ github.com/grafana/cadvisor v0.0.0-20240729082359-1f04a91701e2 h1:ju6EcY2aEobeBg github.com/grafana/cadvisor v0.0.0-20240729082359-1f04a91701e2/go.mod h1:8sLW/G7rcFe1CKMaA4pYT4mX3P1xQVGqM6luzEzx/2g= 
github.com/grafana/catchpoint-prometheus-exporter v0.0.0-20240606062944-e55f3668661d h1:6sNPBwOokfCxAyateu7iLdtyWDUzaLLShPs7F4eTLfw= github.com/grafana/catchpoint-prometheus-exporter v0.0.0-20240606062944-e55f3668661d/go.mod h1:aGPSALDAkw18nn8M7gumhM/MbJG+zgOA3jNWTwPYtLg= -github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa h1:3rdc/z801roM6ky8cT8wz4tahQWkTxJ4VAmzANZe8qQ= -github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa/go.mod h1:k21VjCNs7gj1pAV80wb1577fVRePk51Hek5QUMEvKE0= +github.com/grafana/ckit v0.0.0-20240913130805-0ee98bafad88 h1:GgbYRGz2+/Vgz8/lk19Ht8TQDsAudl51Qenuw+COs5k= +github.com/grafana/ckit v0.0.0-20240913130805-0ee98bafad88/go.mod h1:dDqep1rKTbq2ppMYEgIM88GaPXHp4i6Cp3qantiloA0= github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2 h1:qhugDMdQ4Vp68H0tp/0iN17DM2ehRo1rLEdOFe/gB8I= github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2/go.mod h1:w/aiO1POVIeXUQyl0VQSZjl5OAGDTL5aX+4v0RA1tcw= github.com/grafana/dskit v0.0.0-20240104111617-ea101a3b86eb h1:AWE6+kvtE18HP+lRWNUCyvymyrFSXs6TcS2vXIXGIuw= @@ -2149,8 +2149,8 @@ github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.11.1/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= -github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg= -github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.20.3 h1:oPksm4K8B+Vt35tUhw6GbSNSgVlVSBH0qELP/7u83l4= +github.com/prometheus/client_golang v1.20.3/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= 
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= diff --git a/internal/alloycli/cmd_run.go b/internal/alloycli/cmd_run.go index 6a969cc27f..7bd9c99ae4 100644 --- a/internal/alloycli/cmd_run.go +++ b/internal/alloycli/cmd_run.go @@ -321,8 +321,7 @@ func (fr *alloyRun) Run(configPath string) error { ready = f.Ready reload = func() (*alloy_runtime.Source, error) { alloySource, err := loadAlloySource(configPath, fr.configFormat, fr.configBypassConversionErrors, fr.configExtraArgs) - defer instrumentation.InstrumentSHA256(alloySource.SHA256()) - defer instrumentation.InstrumentLoad(err == nil) + defer instrumentation.InstrumentConfig(err == nil, alloySource.SHA256(), fr.clusterName) if err != nil { return nil, fmt.Errorf("reading config path %q: %w", configPath, err) @@ -476,8 +475,6 @@ func loadAlloySource(path string, converterSourceFormat string, converterBypassE } } - instrumentation.InstrumentConfig(bb) - return alloy_runtime.ParseSource(path, bb) } diff --git a/internal/static/config/instrumentation/config_metrics.go b/internal/static/config/instrumentation/config_metrics.go index add500eea7..9c0414c108 100644 --- a/internal/static/config/instrumentation/config_metrics.go +++ b/internal/static/config/instrumentation/config_metrics.go @@ -20,17 +20,20 @@ type configMetrics struct { var confMetrics *configMetrics var configMetricsInitializer sync.Once -func initializeConfigMetrics() { - confMetrics = newConfigMetrics() +func initializeConfigMetrics(clusterName string) { + confMetrics = newConfigMetrics(clusterName) } -func newConfigMetrics() *configMetrics { +func newConfigMetrics(clusterName string) *configMetrics { var m configMetrics m.configHash = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "alloy_config_hash", Help: "Hash of the currently active config 
file.", + ConstLabels: prometheus.Labels{ + "cluster_name": clusterName, + }, }, []string{"sha256"}, ) @@ -49,22 +52,11 @@ func newConfigMetrics() *configMetrics { return &m } -// Create a sha256 hash of the config before expansion and expose it via -// the alloy_config_hash metric. -func InstrumentConfig(buf []byte) { - InstrumentSHA256(sha256.Sum256(buf)) -} - -// InstrumentSHA256 stores the provided hash to the alloy_config_hash metric. -func InstrumentSHA256(hash [sha256.Size]byte) { - configMetricsInitializer.Do(initializeConfigMetrics) - confMetrics.configHash.Reset() - confMetrics.configHash.WithLabelValues(fmt.Sprintf("%x", hash)).Set(1) -} +func InstrumentConfig(success bool, hash [sha256.Size]byte, clusterName string) { + configMetricsInitializer.Do(func() { + initializeConfigMetrics(clusterName) + }) -// Expose metrics for load success / failures. -func InstrumentLoad(success bool) { - configMetricsInitializer.Do(initializeConfigMetrics) if success { confMetrics.configLoadSuccessSeconds.SetToCurrentTime() confMetrics.configLoadSuccess.Set(1) @@ -72,4 +64,7 @@ func InstrumentLoad(success bool) { confMetrics.configLoadSuccess.Set(0) confMetrics.configLoadFailures.Inc() } + + confMetrics.configHash.Reset() + confMetrics.configHash.WithLabelValues(fmt.Sprintf("%x", hash)).Set(1) } diff --git a/operations/alloy-mixin/alerts/clustering.libsonnet b/operations/alloy-mixin/alerts/clustering.libsonnet index 7216382476..5dbcf5710f 100644 --- a/operations/alloy-mixin/alerts/clustering.libsonnet +++ b/operations/alloy-mixin/alerts/clustering.libsonnet @@ -9,7 +9,7 @@ local alert = import './utils/alert.jsonnet'; alert.newRule( 'ClusterNotConverging', if enableK8sCluster then - 'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0' + 'stddev by (cluster, namespace, job, cluster_name) (sum without (state) (cluster_node_peers)) != 0' else 'stddev by (job) (sum without (state) (cluster_node_peers)) != 0', 'Cluster is not converging.', @@ 
-25,8 +25,8 @@ local alert = import './utils/alert.jsonnet'; // metrics. if enableK8sCluster then ||| sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) ||| else ||| sum without (state) (cluster_node_peers) != on (job) group_left @@ -53,7 +53,7 @@ local alert = import './utils/alert.jsonnet'; alert.newRule( 'ClusterNodeNameConflict', if enableK8sCluster then - 'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' + 'sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' else 'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' , @@ -66,7 +66,7 @@ local alert = import './utils/alert.jsonnet'; alert.newRule( 'ClusterNodeStuckTerminating', if enableK8sCluster then - 'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0' + 'sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0' else 'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0' , @@ -80,7 +80,7 @@ local alert = import './utils/alert.jsonnet'; 'ClusterConfigurationDrift', if enableK8sCluster then ||| count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 ||| else ||| count without (sha256) (