Update CUDA test compatibility to keep up with added gVisor support.
Several CUDA sample tests that were initially marked as broken in gVisor now appear to pass, so their compatibility entries are updated.

The test now also verifies that all NVIDIA driver capabilities are enabled when running.

PiperOrigin-RevId: 711880073
EtiennePerot authored and gvisor-bot committed Jan 7, 2025
1 parent 7af7cb0 commit ebf9577
Showing 8 changed files with 144 additions and 47 deletions.
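For reference, the updated CI configuration boils down to the following local invocation (a sketch that assumes a gVisor checkout, GNU Make, and an NVIDIA GPU on the host):

# Install the latest NVIDIA driver using the same helper CI uses; dump the
# installer log if installation fails.
make sudo TARGETS=//tools/gpu:main ARGS="install --latest" || cat /var/log/nvidia-installer.log
# Run the CUDA sample tests and verify the compatibility table against reality.
make cuda-tests ARGS="--cuda_verify_compatibility=true"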
8 changes: 4 additions & 4 deletions .buildkite/pipeline.yaml
@@ -198,16 +198,16 @@ steps:
agents:
queue: cos-canary-gpu
- <<: *source_test_continuous
label: ":fish: CUDA tests"
label: ":fish: CUDA tests (NOSUBMIT)"
# This is its own test rather than being part of the GPU tests,
# because it takes around 30 minutes to run.
parallelism: 8
timeout_in_minutes: 60
parallelism: 32
timeout_in_minutes: 120
retry:
<<: *retry_settings
commands:
- make sudo TARGETS=//tools/gpu:main ARGS="install --latest" || cat /var/log/nvidia-installer.log
- make cuda-tests
- make cuda-tests ARGS="--cuda_verify_compatibility=true"
agents:
queue: gpu
- <<: *common
25 changes: 19 additions & 6 deletions .buildkite/release.yaml
@@ -1,15 +1,16 @@
agents:
queue: release
_templates:
retry_settings: &retry_settings
automatic:
- exit_status: -1
limit: 10
- exit_status: "*"
limit: 2
common: &common
timeout_in_minutes: 180
retry:
automatic:
- exit_status: -1
limit: 10
- exit_status: "*"
limit: 2

<<: *retry_settings
notify:
- email: "[email protected]"
if: build.state == "failed"
@@ -75,6 +76,18 @@ steps:
- make gpu-all-tests
agents:
queue: gpu
- label: ":fish: CUDA tests"
# This is its own test rather than being part of the GPU tests,
# because it takes around 30 minutes to run.
parallelism: 32
timeout_in_minutes: 120
retry:
<<: *retry_settings
commands:
- make sudo TARGETS=//tools/gpu:main ARGS="install --latest" || cat /var/log/nvidia-installer.log
- make cuda-tests ARGS="--cuda_verify_compatibility=true"
agents:
queue: gpu
- <<: *common
label: ":screwdriver: All GPU Drivers Test"
parallelism: 8
4 changes: 2 additions & 2 deletions Makefile
@@ -331,8 +331,8 @@ cos-gpu-all-tests: gpu-images cos-gpu-smoke-tests $(RUNTIME_BIN)
@$(call sudo,test/gpu:sniffer_test,--runtime=$(RUNTIME) -test.v --cos-gpu $(ARGS))
.PHONY: cos-gpu-all-tests

cuda-tests: load-gpu_cuda-tests $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true)
cuda-tests: load-basic_alpine load-gpu_cuda-tests $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all)
@$(call sudo,test/gpu:cuda_test,--runtime=$(RUNTIME) -test.v $(ARGS))
.PHONY: cuda-tests

11 changes: 11 additions & 0 deletions images/gpu/cuda-tests/list_features.cu
@@ -49,4 +49,15 @@ int main(int argc, char *argv[]) {
CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED,
cuda_device));
printFeature("COMPRESSIBLE_MEMORY", isCompressionAvailable != 0);
bool p2pAvailable = false;
int gpuCount = -1;
CHECK_CUDA(cudaGetDeviceCount(&gpuCount));
if (gpuCount >= 2) {
int canAccessAToB = -1;
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessAToB, 0, 1));
int canAccessBToA = -1;
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessBToA, 1, 0));
p2pAvailable = canAccessAToB > 0 && canAccessBToA > 0;
}
printFeature("P2P", p2pAvailable);
}
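The new probe above reports P2P as available only when the first two GPUs can access each other in both directions. As a rough host-side sanity check (standard NVIDIA tooling, not part of this commit), the GPU interconnect topology can be inspected with:

# Print the GPU-to-GPU interconnect matrix; NVLink or PIX/PXB links between
# device 0 and device 1 generally indicate peer access is possible.
nvidia-smi topo -m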
7 changes: 7 additions & 0 deletions images/gpu/cuda-tests/list_features.sh
@@ -20,6 +20,13 @@

set -euo pipefail

if [[ "${NVIDIA_DRIVER_CAPABILITIES:-}" != "all" ]]; then
echo "NVIDIA_DRIVER_CAPABILITIES is not set to 'all'." >&2
echo "It is set to: '${NVIDIA_DRIVER_CAPABILITIES:-}'" >&2
echo "Please set it to 'all' and try again." >&2
exit 1
fi

cd /
nvcc list_features.cu -lcuda -o list_features
./list_features
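The capability check added above means the feature-listing container must be started with all NVIDIA driver capabilities exposed. A minimal sketch of such an invocation using the standard NVIDIA container toolkit (the image name is a placeholder, not taken from this commit):

docker run --rm --gpus all \
  -e NVIDIA_DRIVER_CAPABILITIES=all \
  <cuda-tests-image> /list_features.sh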
28 changes: 18 additions & 10 deletions images/gpu/cuda-tests/run_sample.go
@@ -997,6 +997,9 @@ func (st *SampleTest) RunLibNVVMTest(ctx context.Context) error {
// Main is the main method of this program.
func Main(ctx context.Context) (int, error) {
flag.Parse()
if nvCaps := os.Getenv("NVIDIA_DRIVER_CAPABILITIES"); nvCaps != "all" {
return 1, fmt.Errorf("NVIDIA_DRIVER_CAPABILITIES is not set to 'all' (got %q); please set it to 'all' and try again", nvCaps)
}
cleanupCtx, cleanupCancel := context.WithTimeout(ctx, *timeoutFlag)
defer cleanupCancel()
deadline, _ := cleanupCtx.Deadline()
@@ -1007,15 +1010,15 @@ func Main(ctx context.Context) (int, error) {
defer x.Shutdown(cleanupCtx)
testsCtx, testsCancel := context.WithDeadline(cleanupCtx, deadline.Add(-10*time.Second))
defer testsCancel()
failed := false
numTests := 0
exitCode := 1
var lastErr error
for _, testName := range flag.Args() {
numTests++
st, err := NewSampleTest(testName, x)
if err != nil {
log("> Invalid test %q: %s", testName, err)
failed = true
lastErr = fmt.Errorf("invalid test %q: %w", testName, err)
continue
}
log("> Running test: %s", testName)
@@ -1024,7 +1027,7 @@ func Main(ctx context.Context) (int, error) {
testCancel()
if err != nil {
log("> Test failed: %s (%s)", testName, err)
failed = true
lastErr = fmt.Errorf("test %q failed: %w", testName, err)
if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) && exitErr.ExitCode() > 0 {
exitCode = exitErr.ExitCode()
}
@@ -1035,14 +1038,19 @@ func Main(ctx context.Context) (int, error) {
if numTests == 0 {
return 1, fmt.Errorf("no tests to run, failing vacuously; specify test names as positional arguments")
}
if failed {
if numTests == 1 {
// If there was a single test to run, pass along its error code.
return exitCode, fmt.Errorf("test failed")
}
return 1, errors.New("one or more tests failed")
if lastErr == nil {
return 0, nil
}
if numTests != 1 {
return 1, fmt.Errorf("one or more tests failed (last error: %w)", lastErr)
}
// If there was a single test to run, pass along its error code if it
// had one. (It may not have had one in case the test failed for another
// reason, e.g. error setting up the test prior to running it.)
if exitCode == 0 {
exitCode = 1
}
return 0, nil
return exitCode, fmt.Errorf("test failed: %w", lastErr)
}

func main() {
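With the changes above, Main refuses to run unless NVIDIA_DRIVER_CAPABILITIES is set to "all" and at least one test name is passed as a positional argument. A sketch of an invocation inside the test container (the binary path and sample name are illustrative, not taken from this commit):

NVIDIA_DRIVER_CAPABILITIES=all /run_sample 0_Introduction/vectorAdd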
3 changes: 2 additions & 1 deletion pkg/test/dockerutil/container.go
@@ -1000,7 +1000,8 @@ func (cp *ContainerPool) String() string {
sb.WriteString(", ")
}
status := cp.statuses[container]
sb.WriteString(container.Name)
sb.WriteString("#")
sb.WriteString(strconv.Itoa(i))
sb.WriteString("[")
sb.WriteString(status.state.String())
sb.WriteString("]")
105 changes: 81 additions & 24 deletions test/gpu/cuda_test.go
@@ -23,6 +23,7 @@ import (
"math"
"os"
"runtime"
"slices"
"strconv"
"strings"
"sync"
@@ -73,9 +74,7 @@
var testCompatibility = map[string]Compatibility{
"0_Introduction/simpleAttributes": RequiresFeatures(FeaturePersistentL2Caching),
"0_Introduction/simpleCUDA2GL": RequiresFeatures(FeatureGL),
"0_Introduction/simpleIPC": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"0_Introduction/simpleP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"0_Introduction/vectorAddMMAP": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"0_Introduction/simpleP2P": &RequiresP2P{},
"2_Concepts_and_Techniques/cuHook": &BrokenEverywhere{
Reason: "Requires ancient version of glibc (<=2.33)",
},
@@ -90,12 +89,12 @@
),
"2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop": &OnlyOnWindows{},
"2_Concepts_and_Techniques/streamOrderedAllocationIPC": &BrokenInGVisor{},
"2_Concepts_and_Techniques/streamOrderedAllocationP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"2_Concepts_and_Techniques/streamOrderedAllocationP2P": &RequiresP2P{},
"3_CUDA_Features/bf16TensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"3_CUDA_Features/cdpAdvancedQuicksort": RequiresFeatures(FeatureDynamicParallelism),
"3_CUDA_Features/cudaCompressibleMemory": RequiresFeatures(FeatureCompressibleMemory),
"3_CUDA_Features/dmmaTensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"3_CUDA_Features/memMapIPCDrv": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"3_CUDA_Features/memMapIPCDrv": &RequiresMultiGPU{},
"3_CUDA_Features/tf32TensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"4_CUDA_Libraries/conjugateGradientMultiDeviceCG": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/cudaNvSci": &RequiresNvSci{},
@@ -105,14 +104,14 @@
"4_CUDA_Libraries/cuDLAStandaloneMode": &OnlyOnWindows{},
"4_CUDA_Libraries/cuDLALayerwiseStatsHybrid": &OnlyOnWindows{},
"4_CUDA_Libraries/cuDLALayerwiseStatsStandalone": &OnlyOnWindows{},
"4_CUDA_Libraries/simpleCUFFT_2d_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/simpleCUFFT_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/simpleCUFFT_2d_MGPU": &RequiresMultiGPU{},
"4_CUDA_Libraries/simpleCUFFT_MGPU": &RequiresMultiGPU{},
"5_Domain_Specific/fluidsD3D9": &OnlyOnWindows{},
"5_Domain_Specific/fluidsGL": RequiresFeatures(FeatureGL),
"5_Domain_Specific/fluidsGLES": &OnlyOnWindows{},
"5_Domain_Specific/nbody_opengles": &OnlyOnWindows{},
"5_Domain_Specific/nbody_screen": &OnlyOnWindows{},
"5_Domain_Specific/p2pBandwidthLatencyTest": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"5_Domain_Specific/p2pBandwidthLatencyTest": &RequiresP2P{},
"5_Domain_Specific/postProcessGL": RequiresFeatures(FeatureGL),
"5_Domain_Specific/simpleD3D10": &OnlyOnWindows{},
"5_Domain_Specific/simpleD3D10RenderTarget": &OnlyOnWindows{},
@@ -133,8 +132,11 @@
}

// flakyTests is a list of tests that are flaky.
// These will be retried up to 3 times in parallel before running serially.
var flakyTests = map[string]struct{}{}
// These will be retried up to 3 times in parallel before running 3 times
// serially.
var flakyTests = map[string]struct{}{
"3_CUDA_Features/cdpAdvancedQuicksort": {},
}

// exclusiveTests is a list of tests that must run exclusively (i.e. with
// no other test running on the machine at the same time), or they will
@@ -145,6 +147,13 @@ var flakyTests = map[string]struct{}{}
// causing spurious failures for the tests that happen to be running in
// parallel with them.
var exclusiveTests = map[string]struct{}{
// Can fail due to
// "launch failed because launch would exceed cudaLimitDevRuntimePendingLaunchCount"
// when running in parallel with other tests.
"3_CUDA_Features/cdpAdvancedQuicksort": {},

// Performance-intensive tests that tend to make other concurrent tests
// flake due to their high resource usage.
"6_Performance/alignedTypes": {},
"6_Performance/transpose": {},
"6_Performance/UnifiedMemoryPerf": {},
@@ -153,12 +162,7 @@ var exclusiveTests = map[string]struct{}{
// alwaysSkippedTests don't run at all, ever, and are not verified when
// --cuda_verify_compatibility is set.
// Each test is mapped to a reason why it should be skipped.
var alwaysSkippedTests = map[string]string{
// These tests seem to flake in gVisor, but consistently within the same
// run of the overall test, so they cannot be included in `flakyTests`.
"0_Introduction/simpleAssert": "Flaky in gVisor",
"0_Introduction/simpleAssert_nvrtc": "Flaky in gVisor",
}
var alwaysSkippedTests = map[string]string{}

// Feature is a feature as listed by /list_features.sh.
type Feature string
@@ -170,6 +174,7 @@ const (
FeatureGL Feature = "GL"
FeatureTensorCores Feature = "TENSOR_CORES"
FeatureCompressibleMemory Feature = "COMPRESSIBLE_MEMORY"
FeatureP2P Feature = "P2P"
)

// allFeatures is a list of all CUDA features above.
@@ -179,6 +184,7 @@ var allFeatures = []Feature{
FeatureGL,
FeatureTensorCores,
FeatureCompressibleMemory,
FeatureP2P,
}

// TestEnvironment represents the environment in which a sample test runs.
@@ -228,10 +234,6 @@ type BrokenInGVisor struct {
// This is for tests that can run on a single or multiple GPUs alike,
// but specifically fail in gVisor when run with multiple GPUs.
OnlyWhenMultipleGPU bool

// KnownToHang may be set to true for short tests which can hang instead
// of failing. This avoids waiting ~forever for them to finish.
KnownToHang bool
}

// WillFail implements `Compatibility.WillFail`.
@@ -273,6 +275,34 @@ func (*RequiresMultiGPU) IsExpectedFailure(ctx context.Context, env *TestEnviron
return nil
}

// RequiresP2P implements `Compatibility` for tests that require
// peer-to-peer communication between GPUs.
// Implies RequiresMultiGPU, so tests do not need to specify both.
type RequiresP2P struct{}

// WillFail implements `Compatibility.WillFail`.
func (*RequiresP2P) WillFail(ctx context.Context, env *TestEnvironment) string {
if notEnoughGPUs := (&RequiresMultiGPU{}).WillFail(ctx, env); notEnoughGPUs != "" {
return notEnoughGPUs
}
if hasP2P := env.Features[FeatureP2P]; !hasP2P {
return "Requires P2P support"
}
return ""
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
func (*RequiresP2P) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
if err := (&RequiresMultiGPU{}).IsExpectedFailure(ctx, env, logs, exitCode); err == nil {
return nil
}
const wantLog = "Peer to Peer access is not available amongst GPUs in the system, waiving test"
if strings.Contains(logs, wantLog) {
return nil
}
return fmt.Errorf("exit code %d and logs %q, expected EXIT_WAIVED (%d) or log message %q", exitCode, logs, exitCodeWaived, wantLog)
}

// requiresFeatures implements `Compatibility` for tests that require
// specific features.
type requiresFeatures struct {
@@ -294,7 +324,13 @@ func (r *requiresFeatures) WillFail(ctx context.Context, env *TestEnvironment) s
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
func (*requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
func (r *requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
if slices.Contains(r.features, FeatureGL) && !env.Features[FeatureGL] && strings.Contains(logs, `code=999(cudaErrorUnknown) "cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)"`) {
// Some GL-requiring tests such as `5_Domain_Specific/postProcessGL`
// and `5_Domain_Specific/fluidsGL` will incorrectly detect that GL
// is supported, and fail with this error message rather than waiving.
return nil
}
if exitCode != exitCodeWaived {
return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
}
@@ -396,7 +432,9 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm

// getContainerOpts returns the container run options to run CUDA tests.
func getContainerOpts() (dockerutil.RunOpts, error) {
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
Capabilities: dockerutil.AllGPUCapabilities,
})
if err != nil {
return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err)
}
@@ -444,8 +482,27 @@ func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error)
}
if runtimeIsGVisor {
testLog(t, "Runtime is detected as gVisor")
runtimeArgs, err := dockerutil.RuntimeArgs()
if err != nil {
t.Fatalf("Failed to get runtime arguments: %v", err)
}
foundNVCaps := ""
const nvCapsPrefixFlag = "--nvproxy-allowed-driver-capabilities"
for i, arg := range runtimeArgs {
if strings.HasPrefix(arg, nvCapsPrefixFlag+"=") {
foundNVCaps = strings.TrimPrefix(arg, nvCapsPrefixFlag+"=")
} else if arg == "--nvproxy-allowed-driver-capabilities" && i < len(runtimeArgs)-1 {
foundNVCaps = runtimeArgs[i+1]
}
}
if foundNVCaps == "" {
return nil, fmt.Errorf("did not find --nvproxy-allowed-driver-capabilities=all flag in gVisor runtime arguments, please specify it for this test")
}
if foundNVCaps != "all" {
return nil, fmt.Errorf("found --nvproxy-allowed-driver-capabilities=%q flag in gVisor runtime arguments, please specify --nvproxy-allowed-driver-capabilities=all for this test", foundNVCaps)
}
} else {
testLog(t, "Runtime is detected as not gVisor")
testLog(t, "Runtime is detected as non-gVisor")
}
featuresContainer := dockerutil.MakeContainer(ctx, t)
defer featuresContainer.CleanUp(ctx)
@@ -825,7 +882,7 @@ func TestCUDA(t *testing.T) {
)
}
} else if poolUtilization := cp.Utilization(); poolUtilization < 0.6 {
testLog(t, "WARNING: Pool utilization was only %.1f%%.", poolUtilization*100.0)
testLog(t, "WARNING: Container pool utilization was only %.1f%% during the test.", poolUtilization*100.0)
testLog(t, "This test can be made faster and more efficient with proper test categorization,")
testLog(t, "by identifying flaky tests and exclusive-requiring tests.")
testLog(t, "Consider going over the logs to identify such tests and categorize them accordingly.")
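GetEnvironment now fails unless the gVisor runtime used by Docker was registered with --nvproxy-allowed-driver-capabilities=all, mirroring the Makefile's install_runtime call. A sketch of registering such a runtime by hand, assuming runsc is on PATH (the runtime name and exact install mechanism are assumptions, not taken from this commit):

# Register a runsc runtime in /etc/docker/daemon.json with the flags the test expects.
sudo runsc install --runtime=runsc \
  -- --nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all
sudo systemctl restart docker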
