Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Update all GPU tests to use the ioctl sniffer. #11045

Merged
merged 1 commit into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ simple-tests: unit-tests # Compatibility target.
.PHONY: simple-tests

# Images needed for GPU smoke tests.
gpu-smoke-images: load-basic_cuda-vector-add load-gpu_cuda-tests
gpu-smoke-images: load-gpu_cuda-tests
.PHONY: gpu-smoke-images

gpu-smoke-tests: gpu-smoke-images $(RUNTIME_BIN)
Expand Down
1 change: 0 additions & 1 deletion images/basic/cuda-vector-add/Dockerfile

This file was deleted.

4 changes: 2 additions & 2 deletions images/gpu/pytorch/Dockerfile.x86_64
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu20.04
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

RUN apt-get update && apt-get install --yes \
python3 \
Expand All @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install --yes \
git

RUN python3 -m pip install --ignore-installed \
"clang~=$(clang --version | grep -oP '10\.[^-]+')" \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
torch \
torchvision \
lightning \
Expand Down
1 change: 1 addition & 0 deletions images/gpu/stable-diffusion-xl/Dockerfile.x86_64
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --yes \
golang

RUN python3 -m pip install --ignore-installed \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
diffusers \
transformers \
accelerate \
Expand Down
4 changes: 1 addition & 3 deletions test/gpu/cuda_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,9 +396,7 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm

// getContainerOpts returns the container run options to run CUDA tests.
func getContainerOpts() (dockerutil.RunOpts, error) {
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err)
}
Expand Down
4 changes: 2 additions & 2 deletions test/gpu/ffmpeg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func TestFffmpegEncodeGPU(t *testing.T) {
defer container.CleanUp(ctx)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
Capabilities: "NVIDIA_DRIVER_CAPABILITIES=video",
AllowIncompatibleIoctl: true,
AllowIncompatibleIoctl: true, // TODO(gvisor.dev/issue/9452): Remove once supported in gVisor.
})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
Expand All @@ -61,7 +61,7 @@ func TestFffmpegDecodeGPU(t *testing.T) {
defer container.CleanUp(ctx)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
Capabilities: "NVIDIA_DRIVER_CAPABILITIES=video",
AllowIncompatibleIoctl: true,
AllowIncompatibleIoctl: true, // TODO(gvisor.dev/issue/9452): Remove once supported in gVisor.
})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
Expand Down
4 changes: 1 addition & 3 deletions test/gpu/nccl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ import (
func runNCCL(ctx context.Context, t *testing.T, testName string) {
t.Helper()
c := dockerutil.MakeContainer(ctx, t)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
}
Expand Down
4 changes: 1 addition & 3 deletions test/gpu/ollama/ollama.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,7 @@ type dockerServer struct {
// NewDocker returns a new Ollama client talking to an Ollama server that runs
// in a local Docker container.
func NewDocker(ctx context.Context, cont *dockerutil.Container, logger testutil.Logger) (*Ollama, error) {
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
return nil, fmt.Errorf("failed to get GPU run options: %w", err)
}
Expand Down
4 changes: 1 addition & 3 deletions test/gpu/pytorch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ import (
func runPytorch(ctx context.Context, t *testing.T, scriptPath string, args ...string) {
t.Helper()
c := dockerutil.MakeContainer(ctx, t)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
t.Fatalf("Failed to get GPU run options: %v", err)
}
Expand Down
12 changes: 5 additions & 7 deletions test/gpu/smoke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,15 @@ func TestGPUHello(t *testing.T) {
c := dockerutil.MakeContainer(ctx, t)
defer c.CleanUp(ctx)

opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "image has too old version of libc vs sniffer",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
t.Fatalf("failed to get GPU run options: %v", err)
}
opts.Image = "basic/cuda-vector-add"
out, err := c.Run(ctx, opts)
t.Logf("cuda-vector-add output: %s", string(out))
opts.Image = "gpu/cuda-tests"
out, err := c.Run(ctx, opts, "/run_sample", "--timeout=120s", "0_Introduction/vectorAdd")
t.Logf("0_Introduction/vectorAdd output: %s", string(out))
if err != nil {
t.Fatalf("could not run cuda-vector-add: %v", err)
t.Fatalf("could not run 0_Introduction/vectorAdd: %v", err)
}
}

Expand Down
22 changes: 14 additions & 8 deletions test/gpu/sr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package sr_test

import (
"strings"
"testing"
"time"

Expand All @@ -34,21 +35,26 @@ func TestGPUCheckpointRestore(t *testing.T) {
c := dockerutil.MakeContainer(ctx, t)
defer c.CleanUp(ctx)

opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
t.Fatalf("failed to get GPU run options: %v", err)
}
opts.Image = "basic/cuda-vector-add"
opts.Image = "gpu/cuda-tests"
if err := c.Spawn(ctx, opts, "sleep", "infinity"); err != nil {
t.Fatalf("could not run cuda-vector-add: %v", err)
t.Fatalf("could not start cuda-tests container: %v", err)
}
defer func() {
logs, err := c.Logs(ctx)
if err != nil {
t.Errorf("Could not get container logs: %v", err)
}
t.Logf("Container logs:\n%v", logs)
}()

// Run the vector add program.
vectorAddCmd := []string{"/bin/sh", "-c", "./vectorAdd"}
if _, err := c.Exec(ctx, dockerutil.ExecOpts{}, vectorAddCmd...); err != nil {
t.Fatalf("docker exec failed: %v", err)
vectorAddCmd := []string{"/run_sample", "--timeout=120s", "0_Introduction/vectorAdd"}
if output, err := c.Exec(ctx, dockerutil.ExecOpts{}, vectorAddCmd...); err != nil {
t.Fatalf("docker exec failed: %v; output: %v", err, strings.TrimSpace(output))
}

// Create a snapshot.
Expand Down
34 changes: 16 additions & 18 deletions test/gpu/stablediffusion/stablediffusion.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ import (
// ContainerRunner is an interface to run containers.
type ContainerRunner interface {
// Run runs a container with the given image and arguments to completion,
// and returns its combined output as a byte string.
Run(ctx context.Context, image string, argv []string) ([]byte, error)
// and returns its stdout/stderr streams as two byte strings.
Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error)
}

// dockerRunner runs Docker containers on the local machine.
Expand All @@ -44,31 +44,29 @@ type dockerRunner struct {
}

// Run implements `ContainerRunner.Run`.
func (dr *dockerRunner) Run(ctx context.Context, image string, argv []string) ([]byte, error) {
func (dr *dockerRunner) Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error) {
cont := dockerutil.MakeContainer(ctx, dr.logger)
defer cont.CleanUp(ctx)
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
return nil, fmt.Errorf("failed to get GPU run options: %w", err)
return nil, nil, fmt.Errorf("failed to get GPU run options: %w", err)
}
opts.Image = image
if err := cont.Spawn(ctx, opts, argv...); err != nil {
return nil, fmt.Errorf("could not start Stable Diffusion container: %v", err)
return nil, nil, fmt.Errorf("could not start Stable Diffusion container: %v", err)
}
waitErr := cont.Wait(ctx)
logs, logsErr := cont.Logs(ctx)
stdout, stderr, streamsErr := cont.OutputStreams(ctx)
if waitErr != nil {
if logsErr == nil {
return nil, fmt.Errorf("container exited with error: %v; logs: %v", waitErr, logs)
if streamsErr == nil {
return nil, nil, fmt.Errorf("container exited with error: %v; stderr: %v", waitErr, stderr)
}
return nil, fmt.Errorf("container exited with error: %v (cannot get logs: %v)", waitErr, logsErr)
return nil, nil, fmt.Errorf("container exited with error: %v (cannot get output streams: %v)", waitErr, streamsErr)
}
if logsErr != nil {
return nil, fmt.Errorf("could not get container logs: %v", logsErr)
if streamsErr != nil {
return nil, nil, fmt.Errorf("could not get container output streams: %v", streamsErr)
}
return []byte(logs), nil
return []byte(stdout), []byte(stderr), nil
}

// XL generates images using Stable Diffusion XL.
Expand Down Expand Up @@ -209,13 +207,13 @@ func (xl *XL) Generate(ctx context.Context, prompt *XLPrompt) (*XLImage, error)
argv = append(argv, "--warm")
}
argv = append(argv, prompt.Query)
output, err := xl.runner.Run(ctx, xl.image, argv)
stdout, stderr, err := xl.runner.Run(ctx, xl.image, argv)
if err != nil {
return nil, err
}
xlImage := &XLImage{Prompt: prompt}
if err := json.Unmarshal(output, &xlImage.data); err != nil {
return nil, fmt.Errorf("malformed JSON output %q: %w", string(output), err)
if err := json.Unmarshal(stdout, &xlImage.data); err != nil {
return nil, fmt.Errorf("malformed JSON output %q: %w; stderr: %v", string(stdout), err, string(stderr))
}
return xlImage, nil
}
4 changes: 1 addition & 3 deletions test/gpu/vllm/vllm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ func doVLLMTest(b *testing.B) {
}

// Run vllm.
runOpts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
DisableSnifferReason: "TODO(gvisor.dev/issue/10885): Verify that this test works",
})
runOpts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
if err != nil {
b.Fatalf("failed to get GPU run options: %v", err)
}
Expand Down
18 changes: 9 additions & 9 deletions test/kubernetes/benchmarks/stablediffusion_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ type kubernetesPodRunner struct {
}

// Run implements `stablediffusion.ContainerRunner.Run`.
func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []string) ([]byte, error) {
func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []string) ([]byte, []byte, error) {
// Build pod spec.
const stableDiffusionXLPodName = "stable-diffusion-xl"
stableDiffusionXLPod := &v13.Pod{
Expand All @@ -81,11 +81,11 @@ func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []stri
}
stableDiffusionXLPod, err := r.cluster.ConfigurePodForRuntimeTestNodepool(stableDiffusionXLPod)
if err != nil {
return nil, fmt.Errorf("failed to configure pod: %v", err)
return nil, nil, fmt.Errorf("failed to configure pod: %v", err)
}
stableDiffusionXLPod, err = testcluster.MaybeSetContainerResources(stableDiffusionXLPod, stableDiffusionXLPod.ObjectMeta.Name, testcluster.ContainerResourcesRequest{GPU: true})
if err != nil {
return nil, fmt.Errorf("failed to set container resources: %v", err)
return nil, nil, fmt.Errorf("failed to set container resources: %v", err)
}

// Delete any pod that may exist from a previous iteration.
Expand All @@ -95,27 +95,27 @@ func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []stri
// Start new client pod and wait for it.
stableDiffusionXLPod, err = r.cluster.CreatePod(ctx, stableDiffusionXLPod)
if err != nil {
return nil, fmt.Errorf("failed to create stable diffusion XL pod: %v", err)
return nil, nil, fmt.Errorf("failed to create stable diffusion XL pod: %v", err)
}
defer r.cluster.DeletePod(ctx, stableDiffusionXLPod)
if err := r.cluster.WaitForPodCompleted(ctx, stableDiffusionXLPod); err != nil {
logs, logsErr := r.cluster.ReadPodLogs(ctx, stableDiffusionXLPod)
logs = strings.TrimSpace(logs)
if logsErr != nil {
return nil, fmt.Errorf("failed to run Stable Diffusion XL (%w) and to read logs from the pod: %v", err, logsErr)
return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL (%w) and to read logs from the pod: %v", err, logsErr)
}
if logs == "" {
return nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs are empty)", err)
return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs are empty)", err)
}
return nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs: %v)", err, logs)
return nil, nil, fmt.Errorf("failed to run Stable Diffusion XL: %w (pod logs: %v)", err, logs)
}

// All good, get logs.
logs, err := r.cluster.ReadPodLogs(ctx, stableDiffusionXLPod)
if err != nil {
return nil, fmt.Errorf("failed to read logs from pod %q: %v", stableDiffusionXLPod.GetName(), err)
return nil, nil, fmt.Errorf("failed to read logs from pod %q: %v", stableDiffusionXLPod.GetName(), err)
}
return []byte(logs), nil
return []byte(logs), nil, nil
}

// doStableDiffusionXLTest runs Stable Diffusion XL benchmarks for a single cluster.
Expand Down
11 changes: 6 additions & 5 deletions tools/ioctl_sniffer/run_sniffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,14 @@ func Main(ctx context.Context) error {

// Merge results from each connection.
finalResults := server.AllResults()
if *enforceCompatibility != "" && finalResults.HasUnsupportedIoctl() {
return fmt.Errorf("unsupported ioctls found: %v", finalResults)
if finalResults.HasUnsupportedIoctl() {
if *enforceCompatibility != "" {
return fmt.Errorf("unsupported ioctls found: %v", finalResults)
}
log.Infof("============== Unsupported ioctls ==============")
log.Infof("%v", finalResults)
}

log.Infof("============== Unsupported ioctls ==============")
log.Infof("%v", finalResults)

if cmdErr != nil {
return fmt.Errorf("command exited with error: %w", cmdErr)
}
Expand Down
2 changes: 1 addition & 1 deletion tools/ioctl_sniffer/sniffer/sniffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ func Init() error {
return fmt.Errorf("failed to parse host driver version: %w", err)
}

log.Infof("Host driver version: %v", driverVer)
log.Debugf("Host driver version: %v", driverVer)

suppFrontendIoctls, suppUvmIoctls, suppControlCmds, suppAllocClasses, ok := nvproxy.SupportedIoctls(driverVer)
if !ok {
Expand Down
Loading