Skip to content

Commit

Permalink
Merge #134698
Browse files Browse the repository at this point in the history
134698: roachtestutil: limit GetDiskUsageInBytes retries r=srosenberg a=DarrylWong

Previously, this method would infinitely retry on non context cancelled errors. However, the disk usage runner uses a done channel to signal completion, rather than cancelling the context.

This change limits the amount of retries attempted, so the runner can check the done channel and exit.

Fixes: #134611
Epic: none
Release note: none

Co-authored-by: DarrylWong <[email protected]>
  • Loading branch information
craig[bot] and DarrylWong committed Nov 13, 2024
2 parents 8dd01a7 + 40a664f commit d19aa3e
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 32 deletions.
7 changes: 5 additions & 2 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1242,8 +1242,11 @@ func saveDiskUsageToLogsDir(ctx context.Context, c cluster.Cluster) error {

// Don't hang forever.
return timeutil.RunWithTimeout(ctx, "disk usage", 20*time.Second, func(ctx context.Context) error {
return c.RunE(ctx, option.WithNodes(c.All()),
"du -c /mnt/data1 --exclude lost+found >> logs/diskusage.txt")
return c.RunE(
ctx,
option.WithNodes(c.All()),
"du {store-dir} -c --exclude lost+found >> logs/diskusage.txt",
)
})
}

Expand Down
42 changes: 13 additions & 29 deletions pkg/cmd/roachtest/roachtestutil/disk_usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,39 +128,23 @@ func NewDiskUsageTracker(
return &DiskUsageTracker{c: c, l: diskLogger}, nil
}

// GetDiskUsageInBytes does what's on the tin. nodeIdx starts at one.
// GetDiskUsageInBytes executes the command `du {store-dir}` on node `nodeIdx` and
// parses the result to bytes. nodeIdx starts at one.
func GetDiskUsageInBytes(
ctx context.Context, c cluster.Cluster, logger *logger.Logger, nodeIdx int,
) (int, error) {
var result install.RunResultDetails
for {
var err error
// `du` can warn if files get removed out from under it (which
// happens during RocksDB compactions, for example). Discard its
// stderr to avoid breaking Atoi later.
// TODO(bdarnell): Refactor this stack to not combine stdout and
// stderr so we don't need to do this (and the Warning check
// below).
result, err = c.RunWithDetailsSingleNode(
ctx,
logger,
option.WithNodes(c.Node(nodeIdx)),
"du -sk {store-dir} 2>/dev/null | grep -oE '^[0-9]+'",
)
if err != nil {
if ctx.Err() != nil {
return 0, ctx.Err()
}
// If `du` fails, retry.
// TODO(bdarnell): is this worth doing? It was originally added
// because of the "files removed out from under it" problem, but
// that doesn't result in a command failure, just a stderr
// message.
logger.Printf("retrying disk usage computation after spurious error: %s", err)
continue
}

break
var err error
// Retry a few times since `du` can warn if files get removed out from under
// it (which happens during pebble compactions, for example).
result, err = c.RunWithDetailsSingleNode(
ctx,
logger,
option.WithNodes(c.Node(nodeIdx)).WithShouldRetryFn(install.AlwaysTrue),
"du {store-dir} -sk 2>/dev/null | grep -oE '^[0-9]+'",
)
if err != nil {
return 0, err
}

// We need this check because sometimes the first line of the roachprod output is a warning
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1666,7 +1666,7 @@ func (r *testRunner) collectArtifacts(
// Do this before collecting logs to make sure the file gets
// downloaded below.
if err := saveDiskUsageToLogsDir(ctx, c); err != nil {
t.L().Printf("failed to fetch disk uage summary: %s", err)
t.L().Printf("failed to fetch disk usage summary: %s", err)
}
if err := c.FetchLogs(ctx, t.L()); err != nil {
t.L().Printf("failed to download logs: %s", err)
Expand Down

0 comments on commit d19aa3e

Please sign in to comment.