roachtest: introduce registry.NonReportableError

`registry.NonReportableError` can be used in situations where an error happens in a test but we do not want to create (or update) a GitHub issue for it. The mechanism introduced here can be used either in the test runner, or in tests themselves. For now, they are only used when a VM is preempted. We have enabled spot VMs with a higher probability and backported this behaviour to several release branches. It's not really useful or actionable to have a GitHub issue for every preemption. Instead, this kind of data is better analyzed at a higher level, on an analytical database, using dashboards. Epic: none Release note: None
cockroachdb · Sep 18, 2024 · ae19630 · ae19630
1 parent 86081e8
commit ae19630
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 14 deletions.
diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go
@@ -102,6 +102,15 @@ func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnersh
 	return nil
 }
 
+func failuresAsNonReportableError(failures []failure) *registry.NonReportableError {
+	var nonReportable registry.NonReportableError
+	if failuresMatchingError(failures, &nonReportable) {
+		return &nonReportable
+	}
+
+	return nil
+}
+
 // postIssueCondition is a condition that causes issue posting to be
 // skipped. If it returns a non-empty string, posting is skipped for
 // the returned reason.
@@ -131,6 +140,13 @@ var skipConditions = []postIssueCondition{
 
 		return fmt.Sprintf("not a release branch: %q", defaultOpts.Branch)
 	},
+	func(_ *githubIssues, t test.Test) string {
+		if nonReportable := failuresAsNonReportableError(t.(*testImpl).failures()); nonReportable != nil {
+			return nonReportable.Error()
+		}
+
+		return ""
+	},
 	func(_ *githubIssues, t test.Test) string {
 		if t.Spec().(*registry.TestSpec).Run == nil {
 			return "TestSpec.Run is nil"

diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go
@@ -62,23 +62,29 @@ func prefixAll(params map[string]string) map[string]string {
 }
 
 func TestShouldPost(t *testing.T) {
+	preemptionFailure := []failure{
+		{errors: []error{vmPreemptionError("vm1")}},
+	}
 	testCases := []struct {
 		disableIssues     bool
 		nodeCount         int
 		envGithubAPIToken string
 		envTcBuildBranch  string
+		failures          []failure
 		expectedReason    string
 	}{
 		/* Cases 1 - 4 verify that issues are not posted if any of the relevant criteria checks fail */
 		// disable
-		{true, 1, "token", "master", "issue posting was disabled via command line flag"},
+		{true, 1, "token", "master", nil, "issue posting was disabled via command line flag"},
 		// nodeCount
-		{false, 0, "token", "master", "Cluster.NodeCount is zero"},
+		{false, 0, "token", "master", nil, "Cluster.NodeCount is zero"},
 		// apiToken
-		{false, 1, "", "master", "GitHub API token not set"},
+		{false, 1, "", "master", nil, "GitHub API token not set"},
 		// branch
-		{false, 1, "token", "", `not a release branch: "branch-not-found-in-env"`},
-		{false, 1, "token", "master", ""},
+		{false, 1, "token", "", nil, `not a release branch: "branch-not-found-in-env"`},
+		// VM preemtion while test ran
+		{false, 1, "token", "master", preemptionFailure, "non-reportable: preempted VMs: vm1 [owner=test-eng]"},
+		{false, 1, "token", "master", nil, ""},
 	}
 
 	reg := makeTestRegistry()
@@ -97,6 +103,7 @@ func TestShouldPost(t *testing.T) {
 		}
 
 		ti := &testImpl{spec: testSpec}
+		ti.mu.failures = c.failures
 		github := &githubIssues{disable: c.disableIssues}
 
 		skipReason := github.shouldPost(ti)

diff --git a/pkg/cmd/roachtest/registry/errors.go b/pkg/cmd/roachtest/registry/errors.go
@@ -28,6 +28,10 @@ type (
 		Err        error
 	}
 
+	NonReportableError struct {
+		Err error
+	}
+
 	errorOption func(*ErrorWithOwnership)
 )
 
@@ -53,6 +57,22 @@ func InfraFlake(ewo *ErrorWithOwnership) {
 	ewo.InfraFlake = true
 }
 
+func (nre NonReportableError) Error() string {
+	return fmt.Sprintf("non-reportable: %s", nre.Err)
+}
+
+func (nre NonReportableError) Is(target error) bool {
+	return errors.Is(nre.Err, target)
+}
+
+func (nre NonReportableError) Unwrap() error {
+	return nre.Err
+}
+
+func (nre NonReportableError) As(reference interface{}) bool {
+	return errors.As(nre.Err, reference)
+}
+
 // ErrorWithOwner allows the caller to associate `err` with
 // `owner`. When `t.Fatal` is called with an error of this type, the
 // resulting GitHub issue is created and assigned to the team
@@ -65,3 +85,10 @@ func ErrorWithOwner(owner Owner, err error, opts ...errorOption) ErrorWithOwners
 
 	return result
 }
+
+// NonReportable wraps the given error and makes it non-reportable --
+// i.e., if it happens during a run, the error is logged in the runner
+// logs, but not reported in a GitHub issue.
+func NonReportable(err error) NonReportableError {
+	return NonReportableError{err}
+}
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
@@ -79,11 +79,17 @@ var (
 	// *and* VMs were preempted. These errors are directed to Test Eng
 	// instead of owning teams.
 	vmPreemptionError = func(preemptedVMs string) error {
-		return registry.ErrorWithOwner(
+		infraFlakeErr := registry.ErrorWithOwner(
 			registry.OwnerTestEng, fmt.Errorf("preempted VMs: %s", preemptedVMs),
 			registry.WithTitleOverride("vm_preemption"),
 			registry.InfraFlake,
 		)
+
+		// The returned error is marked as non-reportable to avoid the
+		// noise, as we get dozens of preemptions on each nightly run.  We
+		// have dashboards that can be used to check how often we get
+		// preemptions in test runs.
+		return registry.NonReportable(infraFlakeErr)
 	}
 
 	// vmHostError is the error that indicates that a test failed
@@ -1105,14 +1111,6 @@ func (r *testRunner) runTest(
 					// Note that this error message is referred for test selection in
 					// pkg/cmd/roachtest/testselector/snowflake_query.sql.
 					failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
-					// Reset failures in the test so that the VM preemption
-					// error is the one that is taken into account when
-					// reporting the failure. Note any other failures that
-					// happened during the test will be present in the
-					// `failureMsg` used when reporting the issue. In addition,
-					// `failure_N.log` files should also already exist at this
-					// point.
-					t.resetFailures()
 					t.Error(vmPreemptionError(preemptedVMNames))
 				}
 				hostErrorVMNames := getHostErrorVMNames(ctx, c, l)