Skip to content

Commit

Permalink
roachtest: introduce registry.NonReportableError
Browse files Browse the repository at this point in the history
`registry.NonReportableError` can be used in situations where an error
happens in a test but we do not want to create (or update) a GitHub
issue for it. The mechanism introduced here can be used either in the
test runner, or in tests themselves. For now, they are only used when
a VM is preempted.

We have enabled spot VMs with a higher probability and backported this
behaviour to several release branches. It's not really useful or
actionable to have a GitHub issue for every preemption. Instead, this
kind of data is better analyzed at a higher level, on an analytical
database, using dashboards.

Epic: none

Release note: None
  • Loading branch information
renatolabs committed Sep 18, 2024
1 parent 86081e8 commit ae19630
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 14 deletions.
16 changes: 16 additions & 0 deletions pkg/cmd/roachtest/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,15 @@ func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnersh
return nil
}

func failuresAsNonReportableError(failures []failure) *registry.NonReportableError {
var nonReportable registry.NonReportableError
if failuresMatchingError(failures, &nonReportable) {
return &nonReportable
}

return nil
}

// postIssueCondition is a condition that causes issue posting to be
// skipped. If it returns a non-empty string, posting is skipped for
// the returned reason.
Expand Down Expand Up @@ -131,6 +140,13 @@ var skipConditions = []postIssueCondition{

return fmt.Sprintf("not a release branch: %q", defaultOpts.Branch)
},
func(_ *githubIssues, t test.Test) string {
if nonReportable := failuresAsNonReportableError(t.(*testImpl).failures()); nonReportable != nil {
return nonReportable.Error()
}

return ""
},
func(_ *githubIssues, t test.Test) string {
if t.Spec().(*registry.TestSpec).Run == nil {
return "TestSpec.Run is nil"
Expand Down
17 changes: 12 additions & 5 deletions pkg/cmd/roachtest/github_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,23 +62,29 @@ func prefixAll(params map[string]string) map[string]string {
}

func TestShouldPost(t *testing.T) {
preemptionFailure := []failure{
{errors: []error{vmPreemptionError("vm1")}},
}
testCases := []struct {
disableIssues bool
nodeCount int
envGithubAPIToken string
envTcBuildBranch string
failures []failure
expectedReason string
}{
/* Cases 1 - 4 verify that issues are not posted if any of the relevant criteria checks fail */
// disable
{true, 1, "token", "master", "issue posting was disabled via command line flag"},
{true, 1, "token", "master", nil, "issue posting was disabled via command line flag"},
// nodeCount
{false, 0, "token", "master", "Cluster.NodeCount is zero"},
{false, 0, "token", "master", nil, "Cluster.NodeCount is zero"},
// apiToken
{false, 1, "", "master", "GitHub API token not set"},
{false, 1, "", "master", nil, "GitHub API token not set"},
// branch
{false, 1, "token", "", `not a release branch: "branch-not-found-in-env"`},
{false, 1, "token", "master", ""},
{false, 1, "token", "", nil, `not a release branch: "branch-not-found-in-env"`},
// VM preemtion while test ran
{false, 1, "token", "master", preemptionFailure, "non-reportable: preempted VMs: vm1 [owner=test-eng]"},
{false, 1, "token", "master", nil, ""},
}

reg := makeTestRegistry()
Expand All @@ -97,6 +103,7 @@ func TestShouldPost(t *testing.T) {
}

ti := &testImpl{spec: testSpec}
ti.mu.failures = c.failures
github := &githubIssues{disable: c.disableIssues}

skipReason := github.shouldPost(ti)
Expand Down
27 changes: 27 additions & 0 deletions pkg/cmd/roachtest/registry/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ type (
Err error
}

NonReportableError struct {
Err error
}

errorOption func(*ErrorWithOwnership)
)

Expand All @@ -53,6 +57,22 @@ func InfraFlake(ewo *ErrorWithOwnership) {
ewo.InfraFlake = true
}

func (nre NonReportableError) Error() string {
return fmt.Sprintf("non-reportable: %s", nre.Err)
}

func (nre NonReportableError) Is(target error) bool {
return errors.Is(nre.Err, target)
}

func (nre NonReportableError) Unwrap() error {
return nre.Err
}

func (nre NonReportableError) As(reference interface{}) bool {
return errors.As(nre.Err, reference)
}

// ErrorWithOwner allows the caller to associate `err` with
// `owner`. When `t.Fatal` is called with an error of this type, the
// resulting GitHub issue is created and assigned to the team
Expand All @@ -65,3 +85,10 @@ func ErrorWithOwner(owner Owner, err error, opts ...errorOption) ErrorWithOwners

return result
}

// NonReportable wraps the given error and makes it non-reportable --
// i.e., if it happens during a run, the error is logged in the runner
// logs, but not reported in a GitHub issue.
func NonReportable(err error) NonReportableError {
return NonReportableError{err}
}
16 changes: 7 additions & 9 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,17 @@ var (
// *and* VMs were preempted. These errors are directed to Test Eng
// instead of owning teams.
vmPreemptionError = func(preemptedVMs string) error {
return registry.ErrorWithOwner(
infraFlakeErr := registry.ErrorWithOwner(
registry.OwnerTestEng, fmt.Errorf("preempted VMs: %s", preemptedVMs),
registry.WithTitleOverride("vm_preemption"),
registry.InfraFlake,
)

// The returned error is marked as non-reportable to avoid the
// noise, as we get dozens of preemptions on each nightly run. We
// have dashboards that can be used to check how often we get
// preemptions in test runs.
return registry.NonReportable(infraFlakeErr)
}

// vmHostError is the error that indicates that a test failed
Expand Down Expand Up @@ -1105,14 +1111,6 @@ func (r *testRunner) runTest(
// Note that this error message is referred for test selection in
// pkg/cmd/roachtest/testselector/snowflake_query.sql.
failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
// Reset failures in the test so that the VM preemption
// error is the one that is taken into account when
// reporting the failure. Note any other failures that
// happened during the test will be present in the
// `failureMsg` used when reporting the issue. In addition,
// `failure_N.log` files should also already exist at this
// point.
t.resetFailures()
t.Error(vmPreemptionError(preemptedVMNames))
}
hostErrorVMNames := getHostErrorVMNames(ctx, c, l)
Expand Down

0 comments on commit ae19630

Please sign in to comment.