Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-24.1: roachtest: don't report VM preemptions on github #131092

Merged
merged 2 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 58 additions & 35 deletions pkg/cmd/roachtest/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,55 +102,78 @@ func failuresAsErrorWithOwnership(failures []failure) *registry.ErrorWithOwnersh
return nil
}

// postIssueCondition encapsulates a condition that causes issue
// posting to be skipped. The `reason` field contains a textual
// description as to why issue posting was skipped.
type postIssueCondition struct {
cond func(g *githubIssues, t test.Test) bool
reason string
func failuresAsNonReportableError(failures []failure) *registry.NonReportableError {
var nonReportable registry.NonReportableError
if failuresMatchingError(failures, &nonReportable) {
return &nonReportable
}

return nil
}

// postIssueCondition is a condition that causes issue posting to be
// skipped. If it returns a non-empty string, posting is skipped for
// the returned reason.
type postIssueCondition func(g *githubIssues, t test.Test) string

var defaultOpts = issues.DefaultOptionsFromEnv()

var skipConditions = []postIssueCondition{
{
cond: func(g *githubIssues, _ test.Test) bool { return g.disable },
reason: "issue posting was disabled via command line flag",
func(g *githubIssues, _ test.Test) string {
if g.disable {
return "issue posting was disabled via command line flag"
}

return ""
},
func(g *githubIssues, _ test.Test) string {
if defaultOpts.CanPost() {
return ""
}

return "GitHub API token not set"
},
{
cond: func(g *githubIssues, _ test.Test) bool { return !defaultOpts.CanPost() },
reason: "GitHub API token not set",
func(g *githubIssues, _ test.Test) string {
if defaultOpts.IsReleaseBranch() {
return ""
}

return fmt.Sprintf("not a release branch: %q", defaultOpts.Branch)
},
{
cond: func(g *githubIssues, _ test.Test) bool { return !defaultOpts.IsReleaseBranch() },
reason: fmt.Sprintf("not a release branch: %q", defaultOpts.Branch),
func(_ *githubIssues, t test.Test) string {
if nonReportable := failuresAsNonReportableError(t.(*testImpl).failures()); nonReportable != nil {
return nonReportable.Error()
}

return ""
},
{
cond: func(_ *githubIssues, t test.Test) bool { return t.Spec().(*registry.TestSpec).Run == nil },
reason: "TestSpec.Run is nil",
func(_ *githubIssues, t test.Test) string {
if t.Spec().(*registry.TestSpec).Run == nil {
return "TestSpec.Run is nil"
}

return ""
},
{
cond: func(_ *githubIssues, t test.Test) bool { return t.Spec().(*registry.TestSpec).Cluster.NodeCount == 0 },
reason: "Cluster.NodeCount is zero",
func(_ *githubIssues, t test.Test) string {
if t.Spec().(*registry.TestSpec).Cluster.NodeCount == 0 {
return "Cluster.NodeCount is zero"
}

return ""
},
}

// shouldPost two values: whether GitHub posting should happen, and a
// reason for skipping (non-empty only when posting should *not*
// happen).
func (g *githubIssues) shouldPost(t test.Test) (bool, string) {
post := true
var reason string

// shouldPost checks whether we should post a GitHub issue: if we do,
// the return value will be the empty string. Otherwise, this function
// returns the reason for not posting.
func (g *githubIssues) shouldPost(t test.Test) string {
for _, sc := range skipConditions {
if sc.cond(g, t) {
post = false
reason = sc.reason
break
if skipReason := sc(g, t); skipReason != "" {
return skipReason
}
}

return post, reason
return ""
}

func (g *githubIssues) createPostRequest(
Expand Down Expand Up @@ -311,8 +334,8 @@ func (g *githubIssues) createPostRequest(
func (g *githubIssues) MaybePost(
t *testImpl, l *logger.Logger, message string,
) (*issues.TestFailureIssue, error) {
doPost, skipReason := g.shouldPost(t)
if !doPost {
skipReason := g.shouldPost(t)
if skipReason != "" {
l.Printf("skipping GitHub issue posting (%s)", skipReason)
return nil, nil
}
Expand Down
21 changes: 13 additions & 8 deletions pkg/cmd/roachtest/github_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,24 +62,29 @@ func prefixAll(params map[string]string) map[string]string {
}

func TestShouldPost(t *testing.T) {
preemptionFailure := []failure{
{errors: []error{vmPreemptionError("vm1")}},
}
testCases := []struct {
disableIssues bool
nodeCount int
envGithubAPIToken string
envTcBuildBranch string
expectedPost bool
failures []failure
expectedReason string
}{
/* Cases 1 - 4 verify that issues are not posted if any of the relevant criteria checks fail */
// disable
{true, 1, "token", "master", false, "issue posting was disabled via command line flag"},
{true, 1, "token", "master", nil, "issue posting was disabled via command line flag"},
// nodeCount
{false, 0, "token", "master", false, "Cluster.NodeCount is zero"},
{false, 0, "token", "master", nil, "Cluster.NodeCount is zero"},
// apiToken
{false, 1, "", "master", false, "GitHub API token not set"},
{false, 1, "", "master", nil, "GitHub API token not set"},
// branch
{false, 1, "token", "", false, `not a release branch: "branch-not-found-in-env"`},
{false, 1, "token", "master", true, ""},
{false, 1, "token", "", nil, `not a release branch: "branch-not-found-in-env"`},
// VM preemtion while test ran
{false, 1, "token", "master", preemptionFailure, "non-reportable: preempted VMs: vm1 [owner=test-eng]"},
{false, 1, "token", "master", nil, ""},
}

reg := makeTestRegistry()
Expand All @@ -98,10 +103,10 @@ func TestShouldPost(t *testing.T) {
}

ti := &testImpl{spec: testSpec}
ti.mu.failures = c.failures
github := &githubIssues{disable: c.disableIssues}

doPost, skipReason := github.shouldPost(ti)
require.Equal(t, c.expectedPost, doPost)
skipReason := github.shouldPost(ti)
require.Equal(t, c.expectedReason, skipReason)
}
}
Expand Down
27 changes: 27 additions & 0 deletions pkg/cmd/roachtest/registry/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ type (
Err error
}

NonReportableError struct {
Err error
}

errorOption func(*ErrorWithOwnership)
)

Expand All @@ -53,6 +57,22 @@ func InfraFlake(ewo *ErrorWithOwnership) {
ewo.InfraFlake = true
}

func (nre NonReportableError) Error() string {
return fmt.Sprintf("non-reportable: %s", nre.Err)
}

func (nre NonReportableError) Is(target error) bool {
return errors.Is(nre.Err, target)
}

func (nre NonReportableError) Unwrap() error {
return nre.Err
}

func (nre NonReportableError) As(reference interface{}) bool {
return errors.As(nre.Err, reference)
}

// ErrorWithOwner allows the caller to associate `err` with
// `owner`. When `t.Fatal` is called with an error of this type, the
// resulting GitHub issue is created and assigned to the team
Expand All @@ -65,3 +85,10 @@ func ErrorWithOwner(owner Owner, err error, opts ...errorOption) ErrorWithOwners

return result
}

// NonReportable wraps the given error and makes it non-reportable --
// i.e., if it happens during a run, the error is logged in the runner
// logs, but not reported in a GitHub issue.
func NonReportable(err error) NonReportableError {
return NonReportableError{err}
}
16 changes: 7 additions & 9 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,17 @@ var (
// *and* VMs were preempted. These errors are directed to Test Eng
// instead of owning teams.
vmPreemptionError = func(preemptedVMs string) error {
return registry.ErrorWithOwner(
infraFlakeErr := registry.ErrorWithOwner(
registry.OwnerTestEng, fmt.Errorf("preempted VMs: %s", preemptedVMs),
registry.WithTitleOverride("vm_preemption"),
registry.InfraFlake,
)

// The returned error is marked as non-reportable to avoid the
// noise, as we get dozens of preemptions on each nightly run. We
// have dashboards that can be used to check how often we get
// preemptions in test runs.
return registry.NonReportable(infraFlakeErr)
}

// vmHostError is the error that indicates that a test failed
Expand Down Expand Up @@ -1074,14 +1080,6 @@ func (r *testRunner) runTest(
// Note that this error message is referred for test selection in
// pkg/cmd/roachtest/testselector/snowflake_query.sql.
failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
// Reset failures in the test so that the VM preemption
// error is the one that is taken into account when
// reporting the failure. Note any other failures that
// happened during the test will be present in the
// `failureMsg` used when reporting the issue. In addition,
// `failure_N.log` files should also already exist at this
// point.
t.resetFailures()
t.Error(vmPreemptionError(preemptedVMNames))
}
hostErrorVMNames := getHostErrorVMNames(ctx, c, l)
Expand Down