From 86081e8ff7bc071c9e424b799de871758a12d41b Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Wed, 18 Sep 2024 20:40:11 +0000 Subject: [PATCH 1/2] roachtest: refactor function to remove extra return value `shouldPost` used to return `(bool, string)` but, in reality, the bool value just encoded the whether the returned string was empty. Simplify that function by returning a single value and having callers be explicit about checking the skip reason. Epic: none Release note: None --- pkg/cmd/roachtest/github.go | 79 +++++++++++++++++--------------- pkg/cmd/roachtest/github_test.go | 14 +++--- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index 42d6412097c0..d96bea3a9783 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -102,55 +102,62 @@ func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnersh return nil } -// postIssueCondition encapsulates a condition that causes issue -// posting to be skipped. The `reason` field contains a textual -// description as to why issue posting was skipped. -type postIssueCondition struct { - cond func(g *githubIssues, t test.Test) bool - reason string -} +// postIssueCondition is a condition that causes issue posting to be +// skipped. If it returns a non-empty string, posting is skipped for +// the returned reason. +type postIssueCondition func(g *githubIssues, t test.Test) string var defaultOpts = issues.DefaultOptionsFromEnv() var skipConditions = []postIssueCondition{ - { - cond: func(g *githubIssues, _ test.Test) bool { return g.disable }, - reason: "issue posting was disabled via command line flag", + func(g *githubIssues, _ test.Test) string { + if g.disable { + return "issue posting was disabled via command line flag" + } + + return "" }, - { - cond: func(g *githubIssues, _ test.Test) bool { return !defaultOpts.CanPost() }, - reason: "GitHub API token not set", + func(g *githubIssues, _ test.Test) string { + if defaultOpts.CanPost() { + return "" + } + + return "GitHub API token not set" }, - { - cond: func(g *githubIssues, _ test.Test) bool { return !defaultOpts.IsReleaseBranch() }, - reason: fmt.Sprintf("not a release branch: %q", defaultOpts.Branch), + func(g *githubIssues, _ test.Test) string { + if defaultOpts.IsReleaseBranch() { + return "" + } + + return fmt.Sprintf("not a release branch: %q", defaultOpts.Branch) }, - { - cond: func(_ *githubIssues, t test.Test) bool { return t.Spec().(*registry.TestSpec).Run == nil }, - reason: "TestSpec.Run is nil", + func(_ *githubIssues, t test.Test) string { + if t.Spec().(*registry.TestSpec).Run == nil { + return "TestSpec.Run is nil" + } + + return "" }, - { - cond: func(_ *githubIssues, t test.Test) bool { return t.Spec().(*registry.TestSpec).Cluster.NodeCount == 0 }, - reason: "Cluster.NodeCount is zero", + func(_ *githubIssues, t test.Test) string { + if t.Spec().(*registry.TestSpec).Cluster.NodeCount == 0 { + return "Cluster.NodeCount is zero" + } + + return "" }, } -// shouldPost two values: whether GitHub posting should happen, and a -// reason for skipping (non-empty only when posting should *not* -// happen). -func (g *githubIssues) shouldPost(t test.Test) (bool, string) { - post := true - var reason string - +// shouldPost checks whether we should post a GitHub issue: if we do, +// the return value will be the empty string. Otherwise, this function +// returns the reason for not posting. +func (g *githubIssues) shouldPost(t test.Test) string { for _, sc := range skipConditions { - if sc.cond(g, t) { - post = false - reason = sc.reason - break + if skipReason := sc(g, t); skipReason != "" { + return skipReason } } - return post, reason + return "" } func (g *githubIssues) createPostRequest( @@ -311,8 +318,8 @@ func (g *githubIssues) createPostRequest( func (g *githubIssues) MaybePost( t *testImpl, l *logger.Logger, message string, ) (*issues.TestFailureIssue, error) { - doPost, skipReason := g.shouldPost(t) - if !doPost { + skipReason := g.shouldPost(t) + if skipReason != "" { l.Printf("skipping GitHub issue posting (%s)", skipReason) return nil, nil } diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 4c7d645b372e..44e7fa4d82d3 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -67,19 +67,18 @@ func TestShouldPost(t *testing.T) { nodeCount int envGithubAPIToken string envTcBuildBranch string - expectedPost bool expectedReason string }{ /* Cases 1 - 4 verify that issues are not posted if any of the relevant criteria checks fail */ // disable - {true, 1, "token", "master", false, "issue posting was disabled via command line flag"}, + {true, 1, "token", "master", "issue posting was disabled via command line flag"}, // nodeCount - {false, 0, "token", "master", false, "Cluster.NodeCount is zero"}, + {false, 0, "token", "master", "Cluster.NodeCount is zero"}, // apiToken - {false, 1, "", "master", false, "GitHub API token not set"}, + {false, 1, "", "master", "GitHub API token not set"}, // branch - {false, 1, "token", "", false, `not a release branch: "branch-not-found-in-env"`}, - {false, 1, "token", "master", true, ""}, + {false, 1, "token", "", `not a release branch: "branch-not-found-in-env"`}, + {false, 1, "token", "master", ""}, } reg := makeTestRegistry() @@ -100,8 +99,7 @@ func TestShouldPost(t *testing.T) { ti := &testImpl{spec: testSpec} github := &githubIssues{disable: c.disableIssues} - doPost, skipReason := github.shouldPost(ti) - require.Equal(t, c.expectedPost, doPost) + skipReason := github.shouldPost(ti) require.Equal(t, c.expectedReason, skipReason) } } From ae1963011e83fc5a3ad46a8265c39996539adabe Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Wed, 18 Sep 2024 20:54:02 +0000 Subject: [PATCH 2/2] roachtest: introduce registry.NonReportableError `registry.NonReportableError` can be used in situations where an error happens in a test but we do not want to create (or update) a GitHub issue for it. The mechanism introduced here can be used either in the test runner, or in tests themselves. For now, they are only used when a VM is preempted. We have enabled spot VMs with a higher probability and backported this behaviour to several release branches. It's not really useful or actionable to have a GitHub issue for every preemption. Instead, this kind of data is better analyzed at a higher level, on an analytical database, using dashboards. Epic: none Release note: None --- pkg/cmd/roachtest/github.go | 16 ++++++++++++++++ pkg/cmd/roachtest/github_test.go | 17 ++++++++++++----- pkg/cmd/roachtest/registry/errors.go | 27 +++++++++++++++++++++++++++ pkg/cmd/roachtest/test_runner.go | 16 +++++++--------- 4 files changed, 62 insertions(+), 14 deletions(-) diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index d96bea3a9783..5370cdd1e6b5 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -102,6 +102,15 @@ func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnersh return nil } +func failuresAsNonReportableError(failures []failure) *registry.NonReportableError { + var nonReportable registry.NonReportableError + if failuresMatchingError(failures, &nonReportable) { + return &nonReportable + } + + return nil +} + // postIssueCondition is a condition that causes issue posting to be // skipped. If it returns a non-empty string, posting is skipped for // the returned reason. @@ -131,6 +140,13 @@ var skipConditions = []postIssueCondition{ return fmt.Sprintf("not a release branch: %q", defaultOpts.Branch) }, + func(_ *githubIssues, t test.Test) string { + if nonReportable := failuresAsNonReportableError(t.(*testImpl).failures()); nonReportable != nil { + return nonReportable.Error() + } + + return "" + }, func(_ *githubIssues, t test.Test) string { if t.Spec().(*registry.TestSpec).Run == nil { return "TestSpec.Run is nil" diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 44e7fa4d82d3..95bf4e618431 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -62,23 +62,29 @@ func prefixAll(params map[string]string) map[string]string { } func TestShouldPost(t *testing.T) { + preemptionFailure := []failure{ + {errors: []error{vmPreemptionError("vm1")}}, + } testCases := []struct { disableIssues bool nodeCount int envGithubAPIToken string envTcBuildBranch string + failures []failure expectedReason string }{ /* Cases 1 - 4 verify that issues are not posted if any of the relevant criteria checks fail */ // disable - {true, 1, "token", "master", "issue posting was disabled via command line flag"}, + {true, 1, "token", "master", nil, "issue posting was disabled via command line flag"}, // nodeCount - {false, 0, "token", "master", "Cluster.NodeCount is zero"}, + {false, 0, "token", "master", nil, "Cluster.NodeCount is zero"}, // apiToken - {false, 1, "", "master", "GitHub API token not set"}, + {false, 1, "", "master", nil, "GitHub API token not set"}, // branch - {false, 1, "token", "", `not a release branch: "branch-not-found-in-env"`}, - {false, 1, "token", "master", ""}, + {false, 1, "token", "", nil, `not a release branch: "branch-not-found-in-env"`}, + // VM preemtion while test ran + {false, 1, "token", "master", preemptionFailure, "non-reportable: preempted VMs: vm1 [owner=test-eng]"}, + {false, 1, "token", "master", nil, ""}, } reg := makeTestRegistry() @@ -97,6 +103,7 @@ func TestShouldPost(t *testing.T) { } ti := &testImpl{spec: testSpec} + ti.mu.failures = c.failures github := &githubIssues{disable: c.disableIssues} skipReason := github.shouldPost(ti) diff --git a/pkg/cmd/roachtest/registry/errors.go b/pkg/cmd/roachtest/registry/errors.go index cc22098e39d9..7484ab3d28ee 100644 --- a/pkg/cmd/roachtest/registry/errors.go +++ b/pkg/cmd/roachtest/registry/errors.go @@ -28,6 +28,10 @@ type ( Err error } + NonReportableError struct { + Err error + } + errorOption func(*ErrorWithOwnership) ) @@ -53,6 +57,22 @@ func InfraFlake(ewo *ErrorWithOwnership) { ewo.InfraFlake = true } +func (nre NonReportableError) Error() string { + return fmt.Sprintf("non-reportable: %s", nre.Err) +} + +func (nre NonReportableError) Is(target error) bool { + return errors.Is(nre.Err, target) +} + +func (nre NonReportableError) Unwrap() error { + return nre.Err +} + +func (nre NonReportableError) As(reference interface{}) bool { + return errors.As(nre.Err, reference) +} + // ErrorWithOwner allows the caller to associate `err` with // `owner`. When `t.Fatal` is called with an error of this type, the // resulting GitHub issue is created and assigned to the team @@ -65,3 +85,10 @@ func ErrorWithOwner(owner Owner, err error, opts ...errorOption) ErrorWithOwners return result } + +// NonReportable wraps the given error and makes it non-reportable -- +// i.e., if it happens during a run, the error is logged in the runner +// logs, but not reported in a GitHub issue. +func NonReportable(err error) NonReportableError { + return NonReportableError{err} +} diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 5dbc7cde9b99..b93ec80e2656 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -79,11 +79,17 @@ var ( // *and* VMs were preempted. These errors are directed to Test Eng // instead of owning teams. vmPreemptionError = func(preemptedVMs string) error { - return registry.ErrorWithOwner( + infraFlakeErr := registry.ErrorWithOwner( registry.OwnerTestEng, fmt.Errorf("preempted VMs: %s", preemptedVMs), registry.WithTitleOverride("vm_preemption"), registry.InfraFlake, ) + + // The returned error is marked as non-reportable to avoid the + // noise, as we get dozens of preemptions on each nightly run. We + // have dashboards that can be used to check how often we get + // preemptions in test runs. + return registry.NonReportable(infraFlakeErr) } // vmHostError is the error that indicates that a test failed @@ -1105,14 +1111,6 @@ func (r *testRunner) runTest( // Note that this error message is referred for test selection in // pkg/cmd/roachtest/testselector/snowflake_query.sql. failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg) - // Reset failures in the test so that the VM preemption - // error is the one that is taken into account when - // reporting the failure. Note any other failures that - // happened during the test will be present in the - // `failureMsg` used when reporting the issue. In addition, - // `failure_N.log` files should also already exist at this - // point. - t.resetFailures() t.Error(vmPreemptionError(preemptedVMNames)) } hostErrorVMNames := getHostErrorVMNames(ctx, c, l)