Skip to content

Commit

Permalink
Update all GPU tests to use the ioctl sniffer.
Browse files (browse the repository at this point in the history)
Fixes issue #10885.

PiperOrigin-RevId: 686295865
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Oct 17, 2024
1 parent abe38d8 commit 35ac5b2
Show file tree
Hide file tree
Showing 16 changed files with 62 additions and 69 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ simple-tests: unit-tests # Compatibility target.
.PHONY: simple-tests

# Images needed for GPU smoke tests.
gpu-smoke-images: load-basic_cuda-vector-add load-gpu_cuda-tests
gpu-smoke-images: load-gpu_cuda-tests
.PHONY: gpu-smoke-images

gpu-smoke-tests: gpu-smoke-images $(RUNTIME_BIN)
Expand Down
1 change: 0 additions & 1 deletion images/basic/cuda-vector-add/Dockerfile

This file was deleted.

4 changes: 2 additions & 2 deletions images/gpu/pytorch/Dockerfile.x86_64
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu20.04
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

RUN apt-get update && apt-get install --yes \
python3 \
Expand All @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install --yes \
git

RUN python3 -m pip install --ignore-installed \
"clang~=$(clang --version | grep -oP '10\.[^-]+')" \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
torch \
torchvision \
lightning \
Expand Down
1 change: 1 addition & 0 deletions images/gpu/stable-diffusion-xl/Dockerfile.x86_64
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --yes \
golang

RUN python3 -m pip install --ignore-installed \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
diffusers \
transformers \
accelerate \
Expand Down
4 changes: 1 addition & 3 deletions test/gpu/cuda_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,9 +396,7 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm

// getContainerOpts returns the container run options to run CUDA tests.
func getContainerOpts() (dockerutil.RunOpts, error) {
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err)
}
Expand Down
4 changes: 2 additions & 2 deletions test/gpu/ffmpeg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func TestFffmpegEncodeGPU(t *testing.T) {
defer container.CleanUp(ctx)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
Capabilities: "NVIDIA_DRIVER_CAPABILITIES=video",
AllowIncompatibleIoctl: true,
AllowIncompatibleIoctl: true, // TODO(gvisor.dev/issue/9452): Remove once supported in gVisor.
})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
Expand All @@ -61,7 +61,7 @@ func TestFffmpegDecodeGPU(t *testing.T) {
defer container.CleanUp(ctx)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
Capabilities: "NVIDIA_DRIVER_CAPABILITIES=video",
AllowIncompatibleIoctl: true,
AllowIncompatibleIoctl: true, // TODO(gvisor.dev/issue/9452): Remove once supported in gVisor.
})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
Expand Down
4 changes: 1 addition & 3 deletions test/gpu/nccl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ import (
func runNCCL(ctx context.Context, t *testing.T, testName string) {
t.Helper()
c := dockerutil.MakeContainer(ctx, t)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
}
Expand Down
4 changes: 1 addition & 3 deletions test/gpu/ollama/ollama.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,7 @@ type dockerServer struct {
// NewDocker returns a new Ollama client talking to an Ollama server that runs
// in a local Docker container.
func NewDocker(ctx context.Context, cont *dockerutil.Container, logger testutil.Logger) (*Ollama, error) {
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
return nil, fmt.Errorf("failed to get GPU run options: %w", err)
}
Expand Down
4 changes: 1 addition & 3 deletions test/gpu/pytorch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ import (
func runPytorch(ctx context.Context, t *testing.T, scriptPath string, args ...string) {
t.Helper()
c := dockerutil.MakeContainer(ctx, t)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
}
Expand Down
12 changes: 5 additions & 7 deletions test/gpu/smoke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,15 @@ func TestGPUHello(t *testing.T) {
c := dockerutil.MakeContainer(ctx, t)
defer c.CleanUp(ctx)

opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "image has too old version of libc vs sniffer",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
t.Fatalf("failed to get GPU run options: %v", err)
}
opts.Image = "basic/cuda-vector-add"
out, err := c.Run(ctx, opts)
t.Logf("cuda-vector-add output: %s", string(out))
opts.Image = "gpu/cuda-tests"
out, err := c.Run(ctx, opts, "/run_sample", "--timeout=20s", "0_Introduction/vectorAdd")
t.Logf("0_Introduction/vectorAdd output: %s", string(out))
if err != nil {
t.Fatalf("could not run cuda-vector-add: %v", err)
t.Fatalf("could not run 0_Introduction/vectorAdd: %v", err)
}
}

Expand Down
22 changes: 14 additions & 8 deletions test/gpu/sr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package sr_test

import (
"strings"
"testing"
"time"

Expand All @@ -34,21 +35,26 @@ func TestGPUCheckpointRestore(t *testing.T) {
c := dockerutil.MakeContainer(ctx, t)
defer c.CleanUp(ctx)

opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
t.Fatalf("failed to get GPU run options: %v", err)
}
opts.Image = "basic/cuda-vector-add"
opts.Image = "gpu/cuda-tests"
if err := c.Spawn(ctx, opts, "sleep", "infinity"); err != nil {
t.Fatalf("could not run cuda-vector-add: %v", err)
t.Fatalf("could not start cuda-tests container: %v", err)
}
defer func() {
logs, err := c.Logs(ctx)
if err != nil {
t.Errorf("Could not get container logs: %v", err)
}
t.Logf("Container logs:\n%v", logs)
}()

// Run the vector add program.
vectorAddCmd := []string{"/bin/sh", "-c", "./vectorAdd"}
if _, err := c.Exec(ctx, dockerutil.ExecOpts{}, vectorAddCmd...); err != nil {
t.Fatalf("docker exec failed: %v", err)
vectorAddCmd := []string{"/run_sample", "--timeout=20s", "0_Introduction/vectorAdd"}
if output, err := c.Exec(ctx, dockerutil.ExecOpts{}, vectorAddCmd...); err != nil {
t.Fatalf("docker exec failed: %v; output: %v", err, strings.TrimSpace(output))
}

// Create a snapshot.
Expand Down
34 changes: 16 additions & 18 deletions test/gpu/stablediffusion/stablediffusion.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ import (
// ContainerRunner is an interface to run containers.
type ContainerRunner interface {
// Run runs a container with the given image and arguments to completion,
// and returns its combined output as a byte string.
Run(ctx context.Context, image string, argv []string) ([]byte, error)
// and returns its stdout/stderr streams as two byte strings.
Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error)
}

// dockerRunner runs Docker containers on the local machine.
Expand All @@ -44,31 +44,29 @@ type dockerRunner struct {
}

// Run implements `ContainerRunner.Run`.
func (dr *dockerRunner) Run(ctx context.Context, image string, argv []string) ([]byte, error) {
func (dr *dockerRunner) Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error) {
cont := dockerutil.MakeContainer(ctx, dr.logger)
defer cont.CleanUp(ctx)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
return nil, fmt.Errorf("failed to get GPU run options: %w", err)
return nil, nil, fmt.Errorf("failed to get GPU run options: %w", err)
}
opts.Image = image
if err := cont.Spawn(ctx, opts, argv...); err != nil {
return nil, fmt.Errorf("could not start Stable Diffusion container: %v", err)
return nil, nil, fmt.Errorf("could not start Stable Diffusion container: %v", err)
}
waitErr := cont.Wait(ctx)
logs, logsErr := cont.Logs(ctx)
stdout, stderr, streamsErr := cont.OutputStreams(ctx)
if waitErr != nil {
if logsErr == nil {
return nil, fmt.Errorf("container exited with error: %v; logs: %v", waitErr, logs)
if streamsErr == nil {
return nil, nil, fmt.Errorf("container exited with error: %v; stderr: %v", waitErr, stderr)
}
return nil, fmt.Errorf("container exited with error: %v (cannot get logs: %v)", waitErr, logsErr)
return nil, nil, fmt.Errorf("container exited with error: %v (cannot get output streams: %v)", waitErr, streamsErr)
}
if logsErr != nil {
return nil, fmt.Errorf("could not get container logs: %v", logsErr)
if streamsErr != nil {
return nil, nil, fmt.Errorf("could not get container output streams: %v", streamsErr)
}
return []byte(logs), nil
return []byte(stdout), []byte(stderr), nil
}

// XL generates images using Stable Diffusion XL.
Expand Down Expand Up @@ -209,13 +207,13 @@ func (xl *XL) Generate(ctx context.Context, prompt *XLPrompt) (*XLImage, error)
argv = append(argv, "--warm")
}
argv = append(argv, prompt.Query)
output, err := xl.runner.Run(ctx, xl.image, argv)
stdout, stderr, err := xl.runner.Run(ctx, xl.image, argv)
if err != nil {
return nil, err
}
xlImage := &XLImage{Prompt: prompt}
if err := json.Unmarshal(output, &xlImage.data); err != nil {
return nil, fmt.Errorf("malformed JSON output %q: %w", string(output), err)
if err := json.Unmarshal(stdout, &xlImage.data); err != nil {
return nil, fmt.Errorf("malformed JSON output %q: %w; stderr: %v", string(stdout), err, string(stderr))
}
return xlImage, nil
}
4 changes: 1 addition & 3 deletions test/gpu/vllm/vllm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ func doVLLMTest(b *testing.B) {
}

// Run vllm.
runOpts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
runOpts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
b.Fatalf("failed to get GPU run options: %v", err)
}
Expand Down
18 changes: 9 additions & 9 deletions test/kubernetes/benchmarks/stablediffusion_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ type kubernetesPodRunner struct {
}

// Run implements `stablediffusion.ContainerRunner.Run`.
func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []string) ([]byte, error) {
func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error) {
// Build pod spec.
const stableDiffusionXLPodName = "stable-diffusion-xl"
stableDiffusionXLPod := &v13.Pod{
Expand All @@ -81,11 +81,11 @@ func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []stri
}
stableDiffusionXLPod, err := r.cluster.ConfigurePodForRuntimeTestNodepool(stableDiffusionXLPod)
if err != nil {
return nil, fmt.Errorf("failed to configure pod: %v", err)
return nil, nil, fmt.Errorf("failed to configure pod: %v", err)
}
stableDiffusionXLPod, err = testcluster.MaybeSetContainerResources(stableDiffusionXLPod, stableDiffusionXLPod.ObjectMeta.Name, testcluster.ContainerResourcesRequest{GPU: true})
if err != nil {
return nil, fmt.Errorf("failed to set container resources: %v", err)
return nil, nil, fmt.Errorf("failed to set container resources: %v", err)
}

// Delete pod that may possibly exist from a previous iteration.
Expand All @@ -95,27 +95,27 @@ func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []stri
// Start new client pod and wait for it.
stableDiffusionXLPod, err = r.cluster.CreatePod(ctx, stableDiffusionXLPod)
if err != nil {
return nil, fmt.Errorf("failed to create stable diffusion XL pod: %v", err)
return nil, nil, fmt.Errorf("failed to create stable diffusion XL pod: %v", err)
}
defer r.cluster.DeletePod(ctx, stableDiffusionXLPod)
if err := r.cluster.WaitForPodCompleted(ctx, stableDiffusionXLPod); err != nil {
logs, logsErr := r.cluster.ReadPodLogs(ctx, stableDiffusionXLPod)
logs = strings.TrimSpace(logs)
if logsErr != nil {
return nil, fmt.Errorf("failed to run Stable Diffusion XL (%w) and to read logs from the pod: %v", err, logsErr)
return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL (%w) and to read logs from the pod: %v", err, logsErr)
}
if logs == "" {
return nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs are empty)", err)
return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs are empty)", err)
}
return nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs: %v)", err, logs)
return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs: %v)", err, logs)
}

// All good, get logs.
logs, err := r.cluster.ReadPodLogs(ctx, stableDiffusionXLPod)
if err != nil {
return nil, fmt.Errorf("failed to read logs from pod %q: %v", stableDiffusionXLPod.GetName(), err)
return nil, nil, fmt.Errorf("failed to read logs from pod %q: %v", stableDiffusionXLPod.GetName(), err)
}
return []byte(logs), nil
return []byte(logs), nil, nil
}

// doStableDiffusionXLTest runs Stable Diffusion XL benchmarks for a single cluster.
Expand Down
11 changes: 6 additions & 5 deletions tools/ioctl_sniffer/run_sniffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,14 @@ func Main(ctx context.Context) error {

// Merge results from each connection.
finalResults := server.AllResults()
if *enforceCompatibility != "" && finalResults.HasUnsupportedIoctl() {
return fmt.Errorf("unsupported ioctls found: %v", finalResults)
if finalResults.HasUnsupportedIoctl() {
if *enforceCompatibility != "" {
return fmt.Errorf("unsupported ioctls found: %v", finalResults)
}
log.Infof("============== Unsupported ioctls ==============")
log.Infof("%v", finalResults)
}

log.Infof("============== Unsupported ioctls ==============")
log.Infof("%v", finalResults)

if cmdErr != nil {
return fmt.Errorf("command exited with error: %w", cmdErr)
}
Expand Down
2 changes: 1 addition & 1 deletion tools/ioctl_sniffer/sniffer/sniffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ func Init() error {
return fmt.Errorf("failed to parse host driver version: %w", err)
}

log.Infof("Host driver version: %v", driverVer)
log.Debugf("Host driver version: %v", driverVer)

suppFrontendIoctls, suppUvmIoctls, suppControlCmds, suppAllocClasses, ok := nvproxy.SupportedIoctls(driverVer)
if !ok {
Expand Down

0 comments on commit 35ac5b2

Please sign in to comment.