From 86081e8ff7bc071c9e424b799de871758a12d41b Mon Sep 17 00:00:00 2001
From: Renato Costa <renato@cockroachlabs.com>
Date: Wed, 18 Sep 2024 20:40:11 +0000
Subject: [PATCH 1/2] roachtest: refactor function to remove extra return value

`shouldPost` used to return `(bool, string)` but, in reality, the bool
value just encoded the whether the returned string was empty. Simplify
that function by returning a single value and having callers be
explicit about checking the skip reason.

Epic: none

Release note: None
---
 pkg/cmd/roachtest/github.go      | 79 +++++++++++++++++---------------
 pkg/cmd/roachtest/github_test.go | 14 +++---
 2 files changed, 49 insertions(+), 44 deletions(-)

diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go
index 42d6412097c0..d96bea3a9783 100644
--- a/pkg/cmd/roachtest/github.go
+++ b/pkg/cmd/roachtest/github.go
@@ -102,55 +102,62 @@ func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnersh
 	return nil
 }
 
-// postIssueCondition encapsulates a condition that causes issue
-// posting to be skipped. The `reason` field contains a textual
-// description as to why issue posting was skipped.
-type postIssueCondition struct {
-	cond   func(g *githubIssues, t test.Test) bool
-	reason string
-}
+// postIssueCondition is a condition that causes issue posting to be
+// skipped. If it returns a non-empty string, posting is skipped for
+// the returned reason.
+type postIssueCondition func(g *githubIssues, t test.Test) string
 
 var defaultOpts = issues.DefaultOptionsFromEnv()
 
 var skipConditions = []postIssueCondition{
-	{
-		cond:   func(g *githubIssues, _ test.Test) bool { return g.disable },
-		reason: "issue posting was disabled via command line flag",
+	func(g *githubIssues, _ test.Test) string {
+		if g.disable {
+			return "issue posting was disabled via command line flag"
+		}
+
+		return ""
 	},
-	{
-		cond:   func(g *githubIssues, _ test.Test) bool { return !defaultOpts.CanPost() },
-		reason: "GitHub API token not set",
+	func(g *githubIssues, _ test.Test) string {
+		if defaultOpts.CanPost() {
+			return ""
+		}
+
+		return "GitHub API token not set"
 	},
-	{
-		cond:   func(g *githubIssues, _ test.Test) bool { return !defaultOpts.IsReleaseBranch() },
-		reason: fmt.Sprintf("not a release branch: %q", defaultOpts.Branch),
+	func(g *githubIssues, _ test.Test) string {
+		if defaultOpts.IsReleaseBranch() {
+			return ""
+		}
+
+		return fmt.Sprintf("not a release branch: %q", defaultOpts.Branch)
 	},
-	{
-		cond:   func(_ *githubIssues, t test.Test) bool { return t.Spec().(*registry.TestSpec).Run == nil },
-		reason: "TestSpec.Run is nil",
+	func(_ *githubIssues, t test.Test) string {
+		if t.Spec().(*registry.TestSpec).Run == nil {
+			return "TestSpec.Run is nil"
+		}
+
+		return ""
 	},
-	{
-		cond:   func(_ *githubIssues, t test.Test) bool { return t.Spec().(*registry.TestSpec).Cluster.NodeCount == 0 },
-		reason: "Cluster.NodeCount is zero",
+	func(_ *githubIssues, t test.Test) string {
+		if t.Spec().(*registry.TestSpec).Cluster.NodeCount == 0 {
+			return "Cluster.NodeCount is zero"
+		}
+
+		return ""
 	},
 }
 
-// shouldPost two values: whether GitHub posting should happen, and a
-// reason for skipping (non-empty only when posting should *not*
-// happen).
-func (g *githubIssues) shouldPost(t test.Test) (bool, string) {
-	post := true
-	var reason string
-
+// shouldPost checks whether we should post a GitHub issue: if we do,
+// the return value will be the empty string. Otherwise, this function
+// returns the reason for not posting.
+func (g *githubIssues) shouldPost(t test.Test) string {
 	for _, sc := range skipConditions {
-		if sc.cond(g, t) {
-			post = false
-			reason = sc.reason
-			break
+		if skipReason := sc(g, t); skipReason != "" {
+			return skipReason
 		}
 	}
 
-	return post, reason
+	return ""
 }
 
 func (g *githubIssues) createPostRequest(
@@ -311,8 +318,8 @@ func (g *githubIssues) createPostRequest(
 func (g *githubIssues) MaybePost(
 	t *testImpl, l *logger.Logger, message string,
 ) (*issues.TestFailureIssue, error) {
-	doPost, skipReason := g.shouldPost(t)
-	if !doPost {
+	skipReason := g.shouldPost(t)
+	if skipReason != "" {
 		l.Printf("skipping GitHub issue posting (%s)", skipReason)
 		return nil, nil
 	}
diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go
index 4c7d645b372e..44e7fa4d82d3 100644
--- a/pkg/cmd/roachtest/github_test.go
+++ b/pkg/cmd/roachtest/github_test.go
@@ -67,19 +67,18 @@ func TestShouldPost(t *testing.T) {
 		nodeCount         int
 		envGithubAPIToken string
 		envTcBuildBranch  string
-		expectedPost      bool
 		expectedReason    string
 	}{
 		/* Cases 1 - 4 verify that issues are not posted if any of the relevant criteria checks fail */
 		// disable
-		{true, 1, "token", "master", false, "issue posting was disabled via command line flag"},
+		{true, 1, "token", "master", "issue posting was disabled via command line flag"},
 		// nodeCount
-		{false, 0, "token", "master", false, "Cluster.NodeCount is zero"},
+		{false, 0, "token", "master", "Cluster.NodeCount is zero"},
 		// apiToken
-		{false, 1, "", "master", false, "GitHub API token not set"},
+		{false, 1, "", "master", "GitHub API token not set"},
 		// branch
-		{false, 1, "token", "", false, `not a release branch: "branch-not-found-in-env"`},
-		{false, 1, "token", "master", true, ""},
+		{false, 1, "token", "", `not a release branch: "branch-not-found-in-env"`},
+		{false, 1, "token", "master", ""},
 	}
 
 	reg := makeTestRegistry()
@@ -100,8 +99,7 @@ func TestShouldPost(t *testing.T) {
 		ti := &testImpl{spec: testSpec}
 		github := &githubIssues{disable: c.disableIssues}
 
-		doPost, skipReason := github.shouldPost(ti)
-		require.Equal(t, c.expectedPost, doPost)
+		skipReason := github.shouldPost(ti)
 		require.Equal(t, c.expectedReason, skipReason)
 	}
 }

From ae1963011e83fc5a3ad46a8265c39996539adabe Mon Sep 17 00:00:00 2001
From: Renato Costa <renato@cockroachlabs.com>
Date: Wed, 18 Sep 2024 20:54:02 +0000
Subject: [PATCH 2/2] roachtest: introduce registry.NonReportableError

`registry.NonReportableError` can be used in situations where an error
happens in a test but we do not want to create (or update) a GitHub
issue for it. The mechanism introduced here can be used either in the
test runner, or in tests themselves. For now, they are only used when
a VM is preempted.

We have enabled spot VMs with a higher probability and backported this
behaviour to several release branches. It's not really useful or
actionable to have a GitHub issue for every preemption. Instead, this
kind of data is better analyzed at a higher level, on an analytical
database, using dashboards.

Epic: none

Release note: None
---
 pkg/cmd/roachtest/github.go          | 16 ++++++++++++++++
 pkg/cmd/roachtest/github_test.go     | 17 ++++++++++++-----
 pkg/cmd/roachtest/registry/errors.go | 27 +++++++++++++++++++++++++++
 pkg/cmd/roachtest/test_runner.go     | 16 +++++++---------
 4 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go
index d96bea3a9783..5370cdd1e6b5 100644
--- a/pkg/cmd/roachtest/github.go
+++ b/pkg/cmd/roachtest/github.go
@@ -102,6 +102,15 @@ func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnersh
 	return nil
 }
 
+func failuresAsNonReportableError(failures []failure) *registry.NonReportableError {
+	var nonReportable registry.NonReportableError
+	if failuresMatchingError(failures, &nonReportable) {
+		return &nonReportable
+	}
+
+	return nil
+}
+
 // postIssueCondition is a condition that causes issue posting to be
 // skipped. If it returns a non-empty string, posting is skipped for
 // the returned reason.
@@ -131,6 +140,13 @@ var skipConditions = []postIssueCondition{
 
 		return fmt.Sprintf("not a release branch: %q", defaultOpts.Branch)
 	},
+	func(_ *githubIssues, t test.Test) string {
+		if nonReportable := failuresAsNonReportableError(t.(*testImpl).failures()); nonReportable != nil {
+			return nonReportable.Error()
+		}
+
+		return ""
+	},
 	func(_ *githubIssues, t test.Test) string {
 		if t.Spec().(*registry.TestSpec).Run == nil {
 			return "TestSpec.Run is nil"
diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go
index 44e7fa4d82d3..95bf4e618431 100644
--- a/pkg/cmd/roachtest/github_test.go
+++ b/pkg/cmd/roachtest/github_test.go
@@ -62,23 +62,29 @@ func prefixAll(params map[string]string) map[string]string {
 }
 
 func TestShouldPost(t *testing.T) {
+	preemptionFailure := []failure{
+		{errors: []error{vmPreemptionError("vm1")}},
+	}
 	testCases := []struct {
 		disableIssues     bool
 		nodeCount         int
 		envGithubAPIToken string
 		envTcBuildBranch  string
+		failures          []failure
 		expectedReason    string
 	}{
 		/* Cases 1 - 4 verify that issues are not posted if any of the relevant criteria checks fail */
 		// disable
-		{true, 1, "token", "master", "issue posting was disabled via command line flag"},
+		{true, 1, "token", "master", nil, "issue posting was disabled via command line flag"},
 		// nodeCount
-		{false, 0, "token", "master", "Cluster.NodeCount is zero"},
+		{false, 0, "token", "master", nil, "Cluster.NodeCount is zero"},
 		// apiToken
-		{false, 1, "", "master", "GitHub API token not set"},
+		{false, 1, "", "master", nil, "GitHub API token not set"},
 		// branch
-		{false, 1, "token", "", `not a release branch: "branch-not-found-in-env"`},
-		{false, 1, "token", "master", ""},
+		{false, 1, "token", "", nil, `not a release branch: "branch-not-found-in-env"`},
+		// VM preemtion while test ran
+		{false, 1, "token", "master", preemptionFailure, "non-reportable: preempted VMs: vm1 [owner=test-eng]"},
+		{false, 1, "token", "master", nil, ""},
 	}
 
 	reg := makeTestRegistry()
@@ -97,6 +103,7 @@ func TestShouldPost(t *testing.T) {
 		}
 
 		ti := &testImpl{spec: testSpec}
+		ti.mu.failures = c.failures
 		github := &githubIssues{disable: c.disableIssues}
 
 		skipReason := github.shouldPost(ti)
diff --git a/pkg/cmd/roachtest/registry/errors.go b/pkg/cmd/roachtest/registry/errors.go
index cc22098e39d9..7484ab3d28ee 100644
--- a/pkg/cmd/roachtest/registry/errors.go
+++ b/pkg/cmd/roachtest/registry/errors.go
@@ -28,6 +28,10 @@ type (
 		Err        error
 	}
 
+	NonReportableError struct {
+		Err error
+	}
+
 	errorOption func(*ErrorWithOwnership)
 )
 
@@ -53,6 +57,22 @@ func InfraFlake(ewo *ErrorWithOwnership) {
 	ewo.InfraFlake = true
 }
 
+func (nre NonReportableError) Error() string {
+	return fmt.Sprintf("non-reportable: %s", nre.Err)
+}
+
+func (nre NonReportableError) Is(target error) bool {
+	return errors.Is(nre.Err, target)
+}
+
+func (nre NonReportableError) Unwrap() error {
+	return nre.Err
+}
+
+func (nre NonReportableError) As(reference interface{}) bool {
+	return errors.As(nre.Err, reference)
+}
+
 // ErrorWithOwner allows the caller to associate `err` with
 // `owner`. When `t.Fatal` is called with an error of this type, the
 // resulting GitHub issue is created and assigned to the team
@@ -65,3 +85,10 @@ func ErrorWithOwner(owner Owner, err error, opts ...errorOption) ErrorWithOwners
 
 	return result
 }
+
+// NonReportable wraps the given error and makes it non-reportable --
+// i.e., if it happens during a run, the error is logged in the runner
+// logs, but not reported in a GitHub issue.
+func NonReportable(err error) NonReportableError {
+	return NonReportableError{err}
+}
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 5dbc7cde9b99..b93ec80e2656 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -79,11 +79,17 @@ var (
 	// *and* VMs were preempted. These errors are directed to Test Eng
 	// instead of owning teams.
 	vmPreemptionError = func(preemptedVMs string) error {
-		return registry.ErrorWithOwner(
+		infraFlakeErr := registry.ErrorWithOwner(
 			registry.OwnerTestEng, fmt.Errorf("preempted VMs: %s", preemptedVMs),
 			registry.WithTitleOverride("vm_preemption"),
 			registry.InfraFlake,
 		)
+
+		// The returned error is marked as non-reportable to avoid the
+		// noise, as we get dozens of preemptions on each nightly run.  We
+		// have dashboards that can be used to check how often we get
+		// preemptions in test runs.
+		return registry.NonReportable(infraFlakeErr)
 	}
 
 	// vmHostError is the error that indicates that a test failed
@@ -1105,14 +1111,6 @@ func (r *testRunner) runTest(
 					// Note that this error message is referred for test selection in
 					// pkg/cmd/roachtest/testselector/snowflake_query.sql.
 					failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
-					// Reset failures in the test so that the VM preemption
-					// error is the one that is taken into account when
-					// reporting the failure. Note any other failures that
-					// happened during the test will be present in the
-					// `failureMsg` used when reporting the issue. In addition,
-					// `failure_N.log` files should also already exist at this
-					// point.
-					t.resetFailures()
 					t.Error(vmPreemptionError(preemptedVMNames))
 				}
 				hostErrorVMNames := getHostErrorVMNames(ctx, c, l)