Skip to content

Commit

Permalink
roachtest: admission-control/disk-bandwidth-limiter test improvements
Browse files Browse the repository at this point in the history
This patch fixes a few things in this test:
- Runs the first step longer so that the LSM is fuller, which induces block
  and page cache misses and therefore generates some disk reads.
- Reduces the throughput of the foreground workload since it was causing
  saturation on its own.
- Asserts on total (read + write) bandwidth, since the disk bandwidth limiter
  should be accounting for reads when determining tokens.

Fixes #129534.

Release note: None
  • Loading branch information
aadityasondhi committed Sep 26, 2024
1 parent cbc681e commit 58e396c
Showing 1 changed file with 17 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func registerDiskBandwidthOverload(r registry.Registry) {
r.Add(registry.TestSpec{
Name: "admission-control/disk-bandwidth-limiter",
Owner: registry.OwnerAdmissionControl,
Timeout: time.Hour,
Timeout: 3 * time.Hour,
Benchmark: true,
CompatibleClouds: registry.AllExceptAzure,
// TODO(aaditya): change to weekly once the test stabilizes.
Expand Down Expand Up @@ -92,21 +92,21 @@ func registerDiskBandwidthOverload(r registry.Registry) {

c.Run(ctx, option.WithNodes(c.WorkloadNode()),
"./cockroach workload init kv --drop --insert-count=400 "+
"--max-block-bytes=4096 --min-block-bytes=4096"+foregroundDB+url)
"--max-block-bytes=1024 --min-block-bytes=1024"+foregroundDB+url)

c.Run(ctx, option.WithNodes(c.WorkloadNode()),
"./cockroach workload init kv --drop --insert-count=400 "+
"--max-block-bytes=4096 --min-block-bytes=4096"+backgroundDB+url)

// Run foreground kv workload, QoS="regular".
duration := 40 * time.Minute
duration := 90 * time.Minute
m := c.NewMonitor(ctx, c.CRDBNodes())
m.Go(func(ctx context.Context) error {
t.Status(fmt.Sprintf("starting foreground kv workload thread (<%s)", time.Minute))
dur := " --duration=" + duration.String()
url := fmt.Sprintf(" {pgurl%s}", c.CRDBNodes())
cmd := "./cockroach workload run kv --histograms=perf/stats.json --concurrency=2 " +
"--splits=1000 --read-percent=50 --min-block-bytes=4096 --max-block-bytes=4096 " +
"--splits=1000 --read-percent=50 --min-block-bytes=1024 --max-block-bytes=1024 " +
"--txn-qos='regular' --tolerate-errors" + foregroundDB + dur + url
c.Run(ctx, option.WithNodes(c.WorkloadNode()), cmd)
return nil
Expand All @@ -124,8 +124,8 @@ func registerDiskBandwidthOverload(r registry.Registry) {
return nil
})

t.Status(fmt.Sprintf("waiting for workload to start and ramp up (<%s)", 10*time.Minute))
time.Sleep(10 * time.Minute)
t.Status(fmt.Sprintf("waiting for workload to start and ramp up (<%s)", 30*time.Minute))
time.Sleep(60 * time.Minute)

db := c.Conn(ctx, t.L(), len(c.CRDBNodes()))
defer db.Close()
Expand All @@ -139,11 +139,12 @@ func registerDiskBandwidthOverload(r registry.Registry) {
}

t.Status(fmt.Sprintf("setting bandwidth limit, and waiting for it to take effect. (<%s)", 2*time.Minute))
time.Sleep(2 * time.Minute)
time.Sleep(5 * time.Minute)

m.Go(func(ctx context.Context) error {
t.Status(fmt.Sprintf("starting monitoring thread (<%s)", time.Minute))
writeBWMetric := divQuery("rate(sys_host_disk_write_bytes[1m])", 1<<20 /* 1MiB */)
readBWMetric := divQuery("rate(sys_host_disk_read_bytes[1m])", 1<<20 /* 1MiB */)
getMetricVal := func(query string, label string) (float64, error) {
point, err := statCollector.CollectPoint(ctx, t.L(), timeutil.Now(), query)
if err != nil {
Expand Down Expand Up @@ -174,13 +175,19 @@ func registerDiskBandwidthOverload(r registry.Registry) {
numSuccesses := 0
for i := 0; i < numIterations; i++ {
time.Sleep(collectionIntervalSeconds * time.Second)
val, err := getMetricVal(writeBWMetric, "node")
writeVal, err := getMetricVal(writeBWMetric, "node")
if err != nil {
numErrors++
continue
}
if val > bandwidthThreshold {
t.Fatalf("write bandwidth %f over last exceeded threshold", val)
readVal, err := getMetricVal(readBWMetric, "node")
if err != nil {
numErrors++
continue
}
totalBW := writeVal + readVal
if totalBW > bandwidthThreshold {
t.Fatalf("write + read bandwidth %f (%f + %f) exceeded threshold of %f", totalBW, writeVal, readVal, bandwidthThreshold)
}
numSuccesses++
}
Expand Down

0 comments on commit 58e396c

Please sign in to comment.