From 00e292ae92ff7c73ccceb2ce41a3040c540591eb Mon Sep 17 00:00:00 2001 From: Lukasz Mierzwa Date: Thu, 28 Nov 2024 12:39:36 +0000 Subject: [PATCH] Warn about wrong rate() usage even without metadata --- docs/changelog.md | 3 + internal/checks/promql_rate.go | 36 ++++++++--- internal/checks/promql_rate_test.go | 56 ++++++++++++++++- internal/parser/utils/sum.go | 72 --------------------- internal/parser/utils/sum_test.go | 97 ----------------------------- 5 files changed, 85 insertions(+), 179 deletions(-) delete mode 100644 internal/parser/utils/sum.go delete mode 100644 internal/parser/utils/sum_test.go diff --git a/docs/changelog.md b/docs/changelog.md index edfad774..112eacaf 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -12,6 +12,9 @@ {"status.üp"} == 0 ``` +- [promql/rate](checks/promql/rate.md) will now report a warning if it detects a `rate(sum(...))` + but doesn't have metadata to confirm if `...` is a counter or not. + ## v0.67.3 ### Fixed diff --git a/internal/checks/promql_rate.go b/internal/checks/promql_rate.go index 2a1b745d..3f8c44c5 100644 --- a/internal/checks/promql_rate.go +++ b/internal/checks/promql_rate.go @@ -144,9 +144,12 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri continue } if e.Rule.RecordingRule != nil && e.Rule.RecordingRule.Expr.SyntaxError == nil && e.Rule.RecordingRule.Record.Value == s.Name { - for _, sm := range utils.HasOuterSum(e.Rule.RecordingRule.Expr.Query) { - if sv, ok := sm.Expr.(*promParser.VectorSelector); ok { - metadata, err := c.prom.Metadata(ctx, sv.Name) + for _, src := range utils.LabelsSource(e.Rule.RecordingRule.Expr.Value.Value, e.Rule.RecordingRule.Expr.Query.Expr) { + if src.Type != utils.AggregateSource { + continue + } + for _, vs := range src.Selectors { + metadata, err := c.prom.Metadata(ctx, vs.Name) if err != nil { text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) problems = append(problems, exprProblem{ @@ -155,15 +158,30 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri }) continue } + canReport := true + severity := Warning for _, m := range metadata.Metadata { - if m.Type == v1.MetricTypeCounter { - problems = append(problems, exprProblem{ - text: fmt.Sprintf("`rate(sum(counter))` chain detected, `%s` is called here on results of `%s`, calling `rate()` on `sum()` results will return bogus results, always `sum(rate(counter))`, never `rate(sum(counter))`.", - node.Expr, sm), - severity: Bug, - }) + // nolint:exhaustive + switch m.Type { + case v1.MetricTypeCounter: + severity = Bug + default: + canReport = false } } + if !canReport { + continue + } + problems = append(problems, exprProblem{ + text: fmt.Sprintf("`rate(%s(counter))` chain detected, `%s` is called here on results of `%s(%s)`.", + src.Operation, node.Expr, src.Operation, vs), + details: fmt.Sprintf( + "You can only calculate `rate()` directly from a counter metric. "+ + "Calling `rate()` on `%s()` results will return bogus results because `%s()` will hide information on when each counter resets. "+ + "You must first calculate `rate()` before calling any aggregation function. Always `sum(rate(counter))`, never `rate(sum(counter))`", + src.Operation, src.Operation), + severity: severity, + }) } } } diff --git a/internal/checks/promql_rate_test.go b/internal/checks/promql_rate_test.go index 2bd95839..8482976c 100644 --- a/internal/checks/promql_rate_test.go +++ b/internal/checks/promql_rate_test.go @@ -27,7 +27,11 @@ func notCounterText(name, uri, fun, metric, kind string) string { } func rateSumText(rateName, sumExpr string) string { - return fmt.Sprintf("`rate(sum(counter))` chain detected, `rate(%s)` is called here on results of `%s`, calling `rate()` on `sum()` results will return bogus results, always `sum(rate(counter))`, never `rate(sum(counter))`.", rateName, sumExpr) + return fmt.Sprintf("`rate(sum(counter))` chain detected, `rate(%s)` is called here on results of `%s`.", rateName, sumExpr) +} + +func rateSumDetails() string { + return "You can only calculate `rate()` directly from a counter metric. Calling `rate()` on `sum()` results will return bogus results because `sum()` will hide information on when each counter resets. You must first calculate `rate()` before calling any aggregation function. Always `sum(rate(counter))`, never `rate(sum(counter))`" } func TestRateCheck(t *testing.T) { @@ -678,6 +682,7 @@ func TestRateCheck(t *testing.T) { }, Reporter: "promql/rate", Text: rateSumText("my:sum[5m]", "sum(foo)"), + Details: rateSumDetails(), Severity: checks.Bug, }, } @@ -839,6 +844,55 @@ func TestRateCheck(t *testing.T) { }, }, }, + { + description: "sum(rate(sum)) / sum(rate(sum))", + content: ` +- alert: Plexi_Worker_High_Signing_Latency + expr: | + sum( + rate(global:response_time_sum{namespace!~"test[.].+"}[15m]) + ) by (environment, namespace) + / + sum( + rate(global:response_time_count{namespace!~"test[.].+"}[15m]) + ) by (environment, namespace) + > 3000 +`, + checker: newRateCheck, + prometheus: newSimpleProm, + entries: mustParseContent(` +- record: global:response_time_sum + expr: sum(response_time_sum:rate2m) +- record: response_time_sum:rate2m + expr: rate(response_time_sum[2m]) +`), + problems: func(_ string) []checks.Problem { + return []checks.Problem{ + { + Lines: parser.LineRange{ + First: 3, + Last: 11, + }, + Reporter: "promql/rate", + Text: rateSumText(`global:response_time_sum{namespace!~"test[.].+"}[15m]`, "sum(response_time_sum:rate2m)"), + Details: rateSumDetails(), + Severity: checks.Warning, + }, + } + }, + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireConfigPath}, + resp: configResponse{yaml: "global:\n scrape_interval: 53s\n"}, + }, + { + conds: []requestCondition{ + requireMetadataPath, + }, + resp: metadataResponse{metadata: map[string][]v1.Metadata{}}, + }, + }, + }, } runTests(t, testCases) } diff --git a/internal/parser/utils/sum.go b/internal/parser/utils/sum.go deleted file mode 100644 index 65479fd6..00000000 --- a/internal/parser/utils/sum.go +++ /dev/null @@ -1,72 +0,0 @@ -package utils - -import ( - "log/slog" - - promParser "github.com/prometheus/prometheus/promql/parser" - - "github.com/cloudflare/pint/internal/parser" -) - -func HasOuterSum(node *parser.PromQLNode) (calls []*promParser.AggregateExpr) { - if n, ok := node.Expr.(*promParser.AggregateExpr); ok { - if n.Op == promParser.SUM { - calls = append(calls, n) - return calls - } - } - - if n, ok := node.Expr.(*promParser.AggregateExpr); ok { - switch n.Op { - case promParser.COUNT: - return nil - case promParser.COUNT_VALUES: - return nil - } - } - - if n, ok := node.Expr.(*promParser.BinaryExpr); ok { - if n.VectorMatching != nil { - switch n.VectorMatching.Card { - case promParser.CardOneToOne: - case promParser.CardOneToMany: - for i, child := range node.Children { - if i == len(node.Children)-1 { - return HasOuterSum(child) - } - } - case promParser.CardManyToOne: - return HasOuterSum(node.Children[0]) - case promParser.CardManyToMany: - default: - slog.Warn("Unsupported VectorMatching operation", slog.String("matching", n.VectorMatching.Card.String())) - } - } - - if n.Op.IsComparisonOperator() { - for i, child := range node.Children { - if i == 0 { - return HasOuterSum(child) - } - } - } else { - switch n.Op { - case promParser.LOR: - for _, child := range node.Children { - calls = append(calls, HasOuterSum(child)...) - } - return calls - case promParser.DIV, promParser.LUNLESS, promParser.LAND: - for _, child := range node.Children { - return HasOuterSum(child) - } - } - } - } - - for _, child := range node.Children { - calls = append(calls, HasOuterSum(child)...) - } - - return calls -} diff --git a/internal/parser/utils/sum_test.go b/internal/parser/utils/sum_test.go deleted file mode 100644 index 5a817159..00000000 --- a/internal/parser/utils/sum_test.go +++ /dev/null @@ -1,97 +0,0 @@ -package utils_test - -import ( - "testing" - - "github.com/stretchr/testify/require" - - "github.com/cloudflare/pint/internal/parser" - "github.com/cloudflare/pint/internal/parser/utils" -) - -func TestHasOuterSum(t *testing.T) { - type testCaseT struct { - expr string - output []string - } - - testCases := []testCaseT{ - { - expr: "foo", - }, - { - expr: "sum(foo)", - output: []string{"sum(foo)"}, - }, - { - expr: "sum(foo) > 0", - output: []string{"sum(foo)"}, - }, - { - expr: "sum(foo) / sum(rate(bar[2m]))", - output: []string{"sum(foo)"}, - }, - { - expr: "sum(sum(foo)) > 0", - output: []string{"sum(sum(foo))"}, - }, - { - expr: "count(sum(foo)) > 0", - }, - { - expr: "count_values(\"foo\", sum(foo)) > 0", - }, - { - expr: "floor(sum(foo)) > 0", - output: []string{"sum(foo)"}, - }, - { - expr: "ceil(sum(foo)) > 0", - output: []string{"sum(foo)"}, - }, - { - expr: "sum(foo) or sum(bar)", - output: []string{"sum(foo)", "sum(bar)"}, - }, - { - expr: "sum(foo) * on() sum(bar)", - output: []string{"sum(foo)", "sum(bar)"}, - }, - { - expr: "sum(foo) without() * on() group_left(instance) sum(deriv(foo[2m]))", - output: []string{"sum without () (foo)"}, - }, - { - expr: "sum(foo) without(job) * on() group_right(instance) sum(deriv(foo[2m]))", - output: []string{"sum(deriv(foo[2m]))"}, - }, - { - expr: "2 > foo", - }, - { - expr: "2 > sum(foo)", - }, - } - - for _, tc := range testCases { - t.Run(tc.expr, func(t *testing.T) { - n, err := parser.DecodeExpr(tc.expr) - if err != nil { - t.Error(err) - t.FailNow() - } - calls := utils.HasOuterSum(n) - if len(calls) == 0 { - if len(tc.output) > 0 { - t.Errorf("HasOuterSum() returned nil, expected %s", tc.output) - } - } else { - output := []string{} - for _, a := range calls { - output = append(output, a.String()) - } - require.Equal(t, tc.output, output, "HasOuterSum() returned wrong output") - } - }) - } -}