Skip to content

Commit

Permalink
roachtest: split perturbation/* tests
Browse files Browse the repository at this point in the history
This commit splits all the individual perturbations into separate files.
This will make it easier to find and update a perturbation.

Epic: none

Release note: None
  • Loading branch information
andrewbaptist committed Nov 12, 2024
1 parent 47e8ae3 commit 0cbc4e7
Show file tree
Hide file tree
Showing 10 changed files with 632 additions and 476 deletions.
12 changes: 11 additions & 1 deletion pkg/cmd/roachtest/tests/perturbation/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")

go_library(
name = "perturbation",
srcs = ["framework.go"],
srcs = [
"add_node.go",
"decommission.go",
"elastic_workload.go",
"framework.go",
"index_backfill.go",
"kv_workload.go",
"network_partition.go",
"restart_node.go",
"slow_disk.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/tests/perturbation",
visibility = ["//visibility:public"],
deps = [
Expand Down
57 changes: 57 additions & 0 deletions pkg/cmd/roachtest/tests/perturbation/add_node.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package perturbation

import (
"context"
"math/rand"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// addNode will add a node during the start phase and wait for it to complete.
// It doesn't do anything during the stop phase.
type addNode struct{}

var _ perturbation = addNode{}

func (a addNode) setup() variations {
return setup(a, 5.0)
}

func (a addNode) setupMetamorphic(rng *rand.Rand) variations {
v := a.setup()
v = v.randomize(rng)
//TODO(#133606): With high vcpu and large writes, the test can fail due to
//the disk becoming saturated leading to 1-2s of fsync stall.
if v.vcpu >= 16 && v.maxBlockBytes == 4096 {
v.maxBlockBytes = 1024
}
return v
}

func (addNode) startTargetNode(ctx context.Context, t test.Test, v variations) {
}

func (a addNode) startPerturbation(ctx context.Context, t test.Test, v variations) time.Duration {
startTime := timeutil.Now()
v.startNoBackup(ctx, t, v.targetNodes())
// Wait out the time until the store is no longer suspect. The 31s is based
// on the 30s default server.time_after_store_suspect setting plus 1 sec for
// the store to propagate its gossip information.
waitDuration(ctx, 31*time.Second)
v.waitForRebalanceToStop(ctx, t)
return timeutil.Since(startTime)
}

// endPerturbation already waited for completion as part of start, so it doesn't
// need to wait again here.
func (addNode) endPerturbation(ctx context.Context, t test.Test, v variations) time.Duration {
waitDuration(ctx, v.validationDuration)
return v.validationDuration
}
93 changes: 93 additions & 0 deletions pkg/cmd/roachtest/tests/perturbation/decommission.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package perturbation

import (
"context"
"fmt"
"math/rand"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// decommission will decommission the target node during the start phase. It
// allows optionally calling drain first. Draining first is the best practice
// recommendation, however it should not cause a latency impact either way.
type decommission struct {
drain bool
}

var _ perturbation = decommission{}

func (d decommission) setup() variations {
d.drain = true
return setup(d, 5.0)
}

func (d decommission) setupMetamorphic(rng *rand.Rand) variations {
v := d.setup()
d.drain = rng.Intn(2) == 0
v = v.randomize(rng)
v.perturbation = d
//TODO(#133606): With high vcpu and large writes, the test can fail due to
//the disk becoming saturated leading to 1-2s of fsync stall.
if v.vcpu >= 16 && v.maxBlockBytes == 4096 {
v.maxBlockBytes = 1024
}
return v
}

func (d decommission) startTargetNode(ctx context.Context, t test.Test, v variations) {
v.startNoBackup(ctx, t, v.targetNodes())
}

func (d decommission) startPerturbation(
ctx context.Context, t test.Test, v variations,
) time.Duration {
startTime := timeutil.Now()
// TODO(baptist): If we want to support multiple decommissions in parallel,
// run drain and decommission in separate goroutine.
if d.drain {
t.L().Printf("draining target nodes")
for _, node := range v.targetNodes() {
drainCmd := fmt.Sprintf(
"./cockroach node drain --self --certs-dir=%s --port={pgport:%d}",
install.CockroachNodeCertsDir,
node,
)
v.Run(ctx, option.WithNodes(v.Node(node)), drainCmd)
}
// Wait for all the other nodes to see the drain over gossip.
time.Sleep(10 * time.Second)
}

t.L().Printf("decommissioning nodes")
for _, node := range v.targetNodes() {
decommissionCmd := fmt.Sprintf(
"./cockroach node decommission --self --certs-dir=%s --port={pgport:%d}",
install.CockroachNodeCertsDir,
node,
)
v.Run(ctx, option.WithNodes(v.Node(node)), decommissionCmd)
}

t.L().Printf("stopping decommissioned nodes")
v.Stop(ctx, t.L(), option.DefaultStopOpts(), v.targetNodes())
return timeutil.Since(startTime)
}

// endPerturbation already waited for completion as part of start, so it doesn't
// need to wait again here.
func (d decommission) endPerturbation(
ctx context.Context, t test.Test, v variations,
) time.Duration {
waitDuration(ctx, v.validationDuration)
return v.validationDuration
}
71 changes: 71 additions & 0 deletions pkg/cmd/roachtest/tests/perturbation/elastic_workload.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package perturbation

import (
"context"
"fmt"
"math/rand"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// elasticWorkload will start a workload with elastic priority. It uses the same
// characteristics as the normal workload. However since the normal workload
// runs at 50% CPU this adds another 2x the stable rate so it will be slowed
// down by AC.
// TODO(baptist): Run against the same database to hit transaction conflicts and
// priority inversions.
type elasticWorkload struct{}

var _ perturbation = elasticWorkload{}

func (e elasticWorkload) setup() variations {
return setup(e, 5.0)
}

func (e elasticWorkload) setupMetamorphic(rng *rand.Rand) variations {
v := e.setup()
// NB: Running an elastic workload can sometimes increase the latency of
// almost all regular requests. To prevent this, we set the min latency to
// 100ms instead of the default.
v.profileOptions = append(v.profileOptions, roachtestutil.ProfMinimumLatency(100*time.Millisecond))
return v.randomize(rng)
}

func (e elasticWorkload) startTargetNode(ctx context.Context, t test.Test, v variations) {
v.startNoBackup(ctx, t, v.targetNodes())
initCmd := fmt.Sprintf("./cockroach workload init kv --db elastic --splits %d {pgurl:1}", v.splits)
v.Run(ctx, option.WithNodes(v.Node(1)), initCmd)
}

func (e elasticWorkload) startPerturbation(
ctx context.Context, t test.Test, v variations,
) time.Duration {
startTime := timeutil.Now()
runCmd := fmt.Sprintf(
"./cockroach workload run kv --db elastic --txn-qos=background --duration=%s --max-block-bytes=%d --min-block-bytes=%d --concurrency=500 {pgurl%s}",
v.perturbationDuration, v.maxBlockBytes, v.maxBlockBytes, v.stableNodes())
v.Run(ctx, option.WithNodes(v.workloadNodes()), runCmd)

// Wait a few seconds to allow the latency to resume after stopping the
// workload. This makes it easier to separate the perturbation from the
// validation phases.
waitDuration(ctx, 5*time.Second)
return timeutil.Since(startTime)
}

// endPerturbation implements perturbation.
func (e elasticWorkload) endPerturbation(
ctx context.Context, t test.Test, v variations,
) time.Duration {
waitDuration(ctx, v.validationDuration)
return v.validationDuration
}
Loading

0 comments on commit 0cbc4e7

Please sign in to comment.