From e13cf36ad71b72c7237f53024bf021acf4ab6b20 Mon Sep 17 00:00:00 2001 From: Etienne Perot Date: Tue, 22 Oct 2024 16:18:44 -0700 Subject: [PATCH] Update all GPU tests to use the ioctl sniffer. Fixes issue #10885. PiperOrigin-RevId: 688728104 --- Makefile | 2 +- images/basic/cuda-vector-add/Dockerfile | 1 - images/gpu/pytorch/Dockerfile.x86_64 | 4 +-- .../gpu/stable-diffusion-xl/Dockerfile.x86_64 | 1 + test/gpu/cuda_test.go | 4 +-- test/gpu/ffmpeg_test.go | 4 +-- test/gpu/nccl_test.go | 4 +-- test/gpu/ollama/ollama.go | 4 +-- test/gpu/pytorch_test.go | 4 +-- test/gpu/smoke_test.go | 12 +++---- test/gpu/sr_test.go | 22 +++++++----- test/gpu/stablediffusion/stablediffusion.go | 34 +++++++++---------- test/gpu/vllm/vllm_test.go | 4 +-- .../benchmarks/stablediffusion_test.go | 18 +++++----- tools/ioctl_sniffer/run_sniffer.go | 11 +++--- tools/ioctl_sniffer/sniffer/sniffer.go | 2 +- 16 files changed, 62 insertions(+), 69 deletions(-) delete mode 100644 images/basic/cuda-vector-add/Dockerfile diff --git a/Makefile b/Makefile index a7e379f864..1c6bb7957b 100644 --- a/Makefile +++ b/Makefile @@ -285,7 +285,7 @@ simple-tests: unit-tests # Compatibility target. .PHONY: simple-tests # Images needed for GPU smoke tests. -gpu-smoke-images: load-basic_cuda-vector-add load-gpu_cuda-tests +gpu-smoke-images: load-gpu_cuda-tests .PHONY: gpu-smoke-images gpu-smoke-tests: gpu-smoke-images $(RUNTIME_BIN) diff --git a/images/basic/cuda-vector-add/Dockerfile b/images/basic/cuda-vector-add/Dockerfile deleted file mode 100644 index 112d045e10..0000000000 --- a/images/basic/cuda-vector-add/Dockerfile +++ /dev/null @@ -1 +0,0 @@ -FROM gcr.io/google_containers/cuda-vector-add:v0.1 \ No newline at end of file diff --git a/images/gpu/pytorch/Dockerfile.x86_64 b/images/gpu/pytorch/Dockerfile.x86_64 index ea29b768ea..fed65ef7e5 100644 --- a/images/gpu/pytorch/Dockerfile.x86_64 +++ b/images/gpu/pytorch/Dockerfile.x86_64 @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.2.0-devel-ubuntu20.04 +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 RUN apt-get update && apt-get install --yes \ python3 \ @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install --yes \ git RUN python3 -m pip install --ignore-installed \ - "clang~=$(clang --version | grep -oP '10\.[^-]+')" \ + "clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \ torch \ torchvision \ lightning \ diff --git a/images/gpu/stable-diffusion-xl/Dockerfile.x86_64 b/images/gpu/stable-diffusion-xl/Dockerfile.x86_64 index 9660a4c03e..e7404d487c 100644 --- a/images/gpu/stable-diffusion-xl/Dockerfile.x86_64 +++ b/images/gpu/stable-diffusion-xl/Dockerfile.x86_64 @@ -14,6 +14,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --yes \ golang RUN python3 -m pip install --ignore-installed \ + "clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \ diffusers \ transformers \ accelerate \ diff --git a/test/gpu/cuda_test.go b/test/gpu/cuda_test.go index 2fb25f431e..dec522b0fa 100644 --- a/test/gpu/cuda_test.go +++ b/test/gpu/cuda_test.go @@ -396,9 +396,7 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm // getContainerOpts returns the container run options to run CUDA tests. func getContainerOpts() (dockerutil.RunOpts, error) { - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ - DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works", - }) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) if err != nil { return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err) } diff --git a/test/gpu/ffmpeg_test.go b/test/gpu/ffmpeg_test.go index abf5548fec..9c819791fe 100644 --- a/test/gpu/ffmpeg_test.go +++ b/test/gpu/ffmpeg_test.go @@ -35,7 +35,7 @@ func TestFffmpegEncodeGPU(t *testing.T) { defer container.CleanUp(ctx) opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ Capabilities: "NVIDIA_DRIVER_CAPABILITIES=video", - AllowIncompatibleIoctl: true, + AllowIncompatibleIoctl: true, // TODO(gvisor.dev/issue/9452): Remove once supported in gVisor. }) if err != nil { t.Fatalf("Failed to get GPU run options: %v", err) @@ -61,7 +61,7 @@ func TestFffmpegDecodeGPU(t *testing.T) { defer container.CleanUp(ctx) opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ Capabilities: "NVIDIA_DRIVER_CAPABILITIES=video", - AllowIncompatibleIoctl: true, + AllowIncompatibleIoctl: true, // TODO(gvisor.dev/issue/9452): Remove once supported in gVisor. }) if err != nil { t.Fatalf("Failed to get GPU run options: %v", err) diff --git a/test/gpu/nccl_test.go b/test/gpu/nccl_test.go index b82282711f..866cb31eb5 100644 --- a/test/gpu/nccl_test.go +++ b/test/gpu/nccl_test.go @@ -27,9 +27,7 @@ import ( func runNCCL(ctx context.Context, t *testing.T, testName string) { t.Helper() c := dockerutil.MakeContainer(ctx, t) - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ - DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works", - }) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) if err != nil { t.Fatalf("Failed to get GPU run options: %v", err) } diff --git a/test/gpu/ollama/ollama.go b/test/gpu/ollama/ollama.go index 7443d8682b..6dcd06540e 100644 --- a/test/gpu/ollama/ollama.go +++ b/test/gpu/ollama/ollama.go @@ -149,9 +149,7 @@ type dockerServer struct { // NewDocker returns a new Ollama client talking to an Ollama server that runs // in a local Docker container. func NewDocker(ctx context.Context, cont *dockerutil.Container, logger testutil.Logger) (*Ollama, error) { - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ - DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works", - }) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) if err != nil { return nil, fmt.Errorf("failed to get GPU run options: %w", err) } diff --git a/test/gpu/pytorch_test.go b/test/gpu/pytorch_test.go index 05d74a53a2..bf9ca9f403 100644 --- a/test/gpu/pytorch_test.go +++ b/test/gpu/pytorch_test.go @@ -26,9 +26,7 @@ import ( func runPytorch(ctx context.Context, t *testing.T, scriptPath string, args ...string) { t.Helper() c := dockerutil.MakeContainer(ctx, t) - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ - DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works", - }) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) if err != nil { t.Fatalf("Failed to get GPU run options: %v", err) } diff --git a/test/gpu/smoke_test.go b/test/gpu/smoke_test.go index 67d5406a60..d4137c39db 100644 --- a/test/gpu/smoke_test.go +++ b/test/gpu/smoke_test.go @@ -27,17 +27,15 @@ func TestGPUHello(t *testing.T) { c := dockerutil.MakeContainer(ctx, t) defer c.CleanUp(ctx) - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ - DisableSnifferReason: "image has too old version of libc vs sniffer", - }) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) if err != nil { t.Fatalf("failed to get GPU run options: %v", err) } - opts.Image = "basic/cuda-vector-add" - out, err := c.Run(ctx, opts) - t.Logf("cuda-vector-add output: %s", string(out)) + opts.Image = "gpu/cuda-tests" + out, err := c.Run(ctx, opts, "/run_sample", "--timeout=120s", "0_Introduction/vectorAdd") + t.Logf("0_Introduction/vectorAdd output: %s", string(out)) if err != nil { - t.Fatalf("could not run cuda-vector-add: %v", err) + t.Fatalf("could not run 0_Introduction/vectorAdd: %v", err) } } diff --git a/test/gpu/sr_test.go b/test/gpu/sr_test.go index e8ca6cfc8e..9fd9b5db61 100644 --- a/test/gpu/sr_test.go +++ b/test/gpu/sr_test.go @@ -16,6 +16,7 @@ package sr_test import ( + "strings" "testing" "time" @@ -34,21 +35,26 @@ func TestGPUCheckpointRestore(t *testing.T) { c := dockerutil.MakeContainer(ctx, t) defer c.CleanUp(ctx) - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ - DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works", - }) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) if err != nil { t.Fatalf("failed to get GPU run options: %v", err) } - opts.Image = "basic/cuda-vector-add" + opts.Image = "gpu/cuda-tests" if err := c.Spawn(ctx, opts, "sleep", "infinity"); err != nil { - t.Fatalf("could not run cuda-vector-add: %v", err) + t.Fatalf("could not start cuda-tests container: %v", err) } + defer func() { + logs, err := c.Logs(ctx) + if err != nil { + t.Errorf("Could not get container logs: %v", err) + } + t.Logf("Container logs:\n%v", logs) + }() // Run the vector add program. - vectorAddCmd := []string{"/bin/sh", "-c", "./vectorAdd"} - if _, err := c.Exec(ctx, dockerutil.ExecOpts{}, vectorAddCmd...); err != nil { - t.Fatalf("docker exec failed: %v", err) + vectorAddCmd := []string{"/run_sample", "--timeout=120s", "0_Introduction/vectorAdd"} + if output, err := c.Exec(ctx, dockerutil.ExecOpts{}, vectorAddCmd...); err != nil { + t.Fatalf("docker exec failed: %v; output: %v", err, strings.TrimSpace(output)) } // Create a snapshot. diff --git a/test/gpu/stablediffusion/stablediffusion.go b/test/gpu/stablediffusion/stablediffusion.go index 44499221a7..be1407b1f7 100644 --- a/test/gpu/stablediffusion/stablediffusion.go +++ b/test/gpu/stablediffusion/stablediffusion.go @@ -34,8 +34,8 @@ import ( // ContainerRunner is an interface to run containers. type ContainerRunner interface { // Run runs a container with the given image and arguments to completion, - // and returns its combined output as a byte string. - Run(ctx context.Context, image string, argv []string) ([]byte, error) + // and returns its stdout/stderr streams as two byte strings. + Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error) } // dockerRunner runs Docker containers on the local machine. @@ -44,31 +44,29 @@ type dockerRunner struct { } // Run implements `ContainerRunner.Run`. -func (dr *dockerRunner) Run(ctx context.Context, image string, argv []string) ([]byte, error) { +func (dr *dockerRunner) Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error) { cont := dockerutil.MakeContainer(ctx, dr.logger) defer cont.CleanUp(ctx) - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ - DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works", - }) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) if err != nil { - return nil, fmt.Errorf("failed to get GPU run options: %w", err) + return nil, nil, fmt.Errorf("failed to get GPU run options: %w", err) } opts.Image = image if err := cont.Spawn(ctx, opts, argv...); err != nil { - return nil, fmt.Errorf("could not start Stable Diffusion container: %v", err) + return nil, nil, fmt.Errorf("could not start Stable Diffusion container: %v", err) } waitErr := cont.Wait(ctx) - logs, logsErr := cont.Logs(ctx) + stdout, stderr, streamsErr := cont.OutputStreams(ctx) if waitErr != nil { - if logsErr == nil { - return nil, fmt.Errorf("container exited with error: %v; logs: %v", waitErr, logs) + if streamsErr == nil { + return nil, nil, fmt.Errorf("container exited with error: %v; stderr: %v", waitErr, stderr) } - return nil, fmt.Errorf("container exited with error: %v (cannot get logs: %v)", waitErr, logsErr) + return nil, nil, fmt.Errorf("container exited with error: %v (cannot get output streams: %v)", waitErr, streamsErr) } - if logsErr != nil { - return nil, fmt.Errorf("could not get container logs: %v", logsErr) + if streamsErr != nil { + return nil, nil, fmt.Errorf("could not get container output streams: %v", streamsErr) } - return []byte(logs), nil + return []byte(stdout), []byte(stderr), nil } // XL generates images using Stable Diffusion XL. @@ -209,13 +207,13 @@ func (xl *XL) Generate(ctx context.Context, prompt *XLPrompt) (*XLImage, error) argv = append(argv, "--warm") } argv = append(argv, prompt.Query) - output, err := xl.runner.Run(ctx, xl.image, argv) + stdout, stderr, err := xl.runner.Run(ctx, xl.image, argv) if err != nil { return nil, err } xlImage := &XLImage{Prompt: prompt} - if err := json.Unmarshal(output, &xlImage.data); err != nil { - return nil, fmt.Errorf("malformed JSON output %q: %w", string(output), err) + if err := json.Unmarshal(stdout, &xlImage.data); err != nil { + return nil, fmt.Errorf("malformed JSON output %q: %w; stderr: %v", string(stdout), err, string(stderr)) } return xlImage, nil } diff --git a/test/gpu/vllm/vllm_test.go b/test/gpu/vllm/vllm_test.go index 2c84ce7505..5ee98a2228 100644 --- a/test/gpu/vllm/vllm_test.go +++ b/test/gpu/vllm/vllm_test.go @@ -51,9 +51,7 @@ func doVLLMTest(b *testing.B) { } // Run vllm. - runOpts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ - DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works", - }) + runOpts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) if err != nil { b.Fatalf("failed to get GPU run options: %v", err) } diff --git a/test/kubernetes/benchmarks/stablediffusion_test.go b/test/kubernetes/benchmarks/stablediffusion_test.go index 8b288d15bd..ce03612e9e 100644 --- a/test/kubernetes/benchmarks/stablediffusion_test.go +++ b/test/kubernetes/benchmarks/stablediffusion_test.go @@ -56,7 +56,7 @@ type kubernetesPodRunner struct { } // Run implements `stablediffusion.ContainerRunner.Run`. -func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []string) ([]byte, error) { +func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error) { // Build pod spec. const stableDiffusionXLPodName = "stable-diffusion-xl" stableDiffusionXLPod := &v13.Pod{ @@ -81,11 +81,11 @@ func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []stri } stableDiffusionXLPod, err := r.cluster.ConfigurePodForRuntimeTestNodepool(stableDiffusionXLPod) if err != nil { - return nil, fmt.Errorf("failed to configure pod: %v", err) + return nil, nil, fmt.Errorf("failed to configure pod: %v", err) } stableDiffusionXLPod, err = testcluster.MaybeSetContainerResources(stableDiffusionXLPod, stableDiffusionXLPod.ObjectMeta.Name, testcluster.ContainerResourcesRequest{GPU: true}) if err != nil { - return nil, fmt.Errorf("failed to set container resources: %v", err) + return nil, nil, fmt.Errorf("failed to set container resources: %v", err) } // Delete pod that may possibly exist from a previous iteration. @@ -95,27 +95,27 @@ func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []stri // Start new client pod and wait for it. stableDiffusionXLPod, err = r.cluster.CreatePod(ctx, stableDiffusionXLPod) if err != nil { - return nil, fmt.Errorf("failed to create stable diffusion XL pod: %v", err) + return nil, nil, fmt.Errorf("failed to create stable diffusion XL pod: %v", err) } defer r.cluster.DeletePod(ctx, stableDiffusionXLPod) if err := r.cluster.WaitForPodCompleted(ctx, stableDiffusionXLPod); err != nil { logs, logsErr := r.cluster.ReadPodLogs(ctx, stableDiffusionXLPod) logs = strings.TrimSpace(logs) if logsErr != nil { - return nil, fmt.Errorf("failed to run Stable Diffusion XL (%w) and to read logs from the pod: %v", err, logsErr) + return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL (%w) and to read logs from the pod: %v", err, logsErr) } if logs == "" { - return nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs are empty)", err) + return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs are empty)", err) } - return nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs: %v)", err, logs) + return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs: %v)", err, logs) } // All good, get logs. logs, err := r.cluster.ReadPodLogs(ctx, stableDiffusionXLPod) if err != nil { - return nil, fmt.Errorf("failed to read logs from pod %q: %v", stableDiffusionXLPod.GetName(), err) + return nil, nil, fmt.Errorf("failed to read logs from pod %q: %v", stableDiffusionXLPod.GetName(), err) } - return []byte(logs), nil + return []byte(logs), nil, nil } // doStableDiffusionXLTest runs Stable Diffusion XL benchmarks for a single cluster. diff --git a/tools/ioctl_sniffer/run_sniffer.go b/tools/ioctl_sniffer/run_sniffer.go index d0dd56692c..1a0327b89d 100644 --- a/tools/ioctl_sniffer/run_sniffer.go +++ b/tools/ioctl_sniffer/run_sniffer.go @@ -121,13 +121,14 @@ func Main(ctx context.Context) error { // Merge results from each connection. finalResults := server.AllResults() - if *enforceCompatibility != "" && finalResults.HasUnsupportedIoctl() { - return fmt.Errorf("unsupported ioctls found: %v", finalResults) + if finalResults.HasUnsupportedIoctl() { + if *enforceCompatibility != "" { + return fmt.Errorf("unsupported ioctls found: %v", finalResults) + } + log.Infof("============== Unsupported ioctls ==============") + log.Infof("%v", finalResults) } - log.Infof("============== Unsupported ioctls ==============") - log.Infof("%v", finalResults) - if cmdErr != nil { return fmt.Errorf("command exited with error: %w", cmdErr) } diff --git a/tools/ioctl_sniffer/sniffer/sniffer.go b/tools/ioctl_sniffer/sniffer/sniffer.go index a99e6dc0ad..a4c212328c 100644 --- a/tools/ioctl_sniffer/sniffer/sniffer.go +++ b/tools/ioctl_sniffer/sniffer/sniffer.go @@ -188,7 +188,7 @@ func Init() error { return fmt.Errorf("failed to parse host driver version: %w", err) } - log.Infof("Host driver version: %v", driverVer) + log.Debugf("Host driver version: %v", driverVer) suppFrontendIoctls, suppUvmIoctls, suppControlCmds, suppAllocClasses, ok := nvproxy.SupportedIoctls(driverVer) if !ok {