Update CUDA test compatibility to keep up with added gVisor support.
Several CUDA sample tests that were initially marked as broken in gVisor now appear to pass, so their compatibility entries are updated.

The test now also verifies that all NVIDIA driver capabilities are enabled when running.

PiperOrigin-RevId: 711880073
EtiennePerot authored and gvisor-bot committed Jan 7, 2025
1 parent 7af7cb0 commit ebf9577
Showing 8 changed files with 144 additions and 47 deletions.
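For reference, the updated CI configuration boils down to the following local invocation (a sketch that assumes a gVisor checkout, GNU Make, and an NVIDIA GPU on the host):

# Install the latest NVIDIA driver using the same helper CI uses; dump the
# installer log if installation fails.
make sudo TARGETS=//tools/gpu:main ARGS="install --latest" || cat /var/log/nvidia-installer.log
# Run the CUDA sample tests and verify the compatibility table against reality.
make cuda-tests ARGS="--cuda_verify_compatibility=true"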
8 changes: 4 additions & 4 deletions .buildkite/pipeline.yaml
@@ -198,16 +198,16 @@ steps:
agents:
queue: cos-canary-gpu
- <<: *source_test_continuous
label: ":fish: CUDA tests"
label: ":fish: CUDA tests (NOSUBMIT)"
# This is its own test rather than being part of the GPU tests,
# because it takes around 30 minutes to run.
parallelism: 8
timeout_in_minutes: 60
parallelism: 32
timeout_in_minutes: 120
retry:
<<: *retry_settings
commands:
- make sudo TARGETS=//tools/gpu:main ARGS="install --latest" || cat /var/log/nvidia-installer.log
- make cuda-tests
- make cuda-tests ARGS="--cuda_verify_compatibility=true"
agents:
queue: gpu
- <<: *common
25 changes: 19 additions & 6 deletions .buildkite/release.yaml
@@ -1,15 +1,16 @@
agents:
queue: release
_templates:
retry_settings: &retry_settings
automatic:
- exit_status: -1
limit: 10
- exit_status: "*"
limit: 2
common: &common
timeout_in_minutes: 180
retry:
automatic:
- exit_status: -1
limit: 10
- exit_status: "*"
limit: 2

<<: *retry_settings
notify:
- email: "[email protected]"
if: build.state == "failed"
@@ -75,6 +76,18 @@ steps:
- make gpu-all-tests
agents:
queue: gpu
- label: ":fish: CUDA tests"
# This is its own test rather than being part of the GPU tests,
# because it takes around 30 minutes to run.
parallelism: 32
timeout_in_minutes: 120
retry:
<<: *retry_settings
commands:
- make sudo TARGETS=//tools/gpu:main ARGS="install --latest" || cat /var/log/nvidia-installer.log
- make cuda-tests ARGS="--cuda_verify_compatibility=true"
agents:
queue: gpu
- <<: *common
label: ":screwdriver: All GPU Drivers Test"
parallelism: 8
4 changes: 2 additions & 2 deletions Makefile
@@ -331,8 +331,8 @@ cos-gpu-all-tests: gpu-images cos-gpu-smoke-tests $(RUNTIME_BIN)
@$(call sudo,test/gpu:sniffer_test,--runtime=$(RUNTIME) -test.v --cos-gpu $(ARGS))
.PHONY: cos-gpu-all-tests

cuda-tests: load-gpu_cuda-tests $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true)
cuda-tests: load-basic_alpine load-gpu_cuda-tests $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all)
@$(call sudo,test/gpu:cuda_test,--runtime=$(RUNTIME) -test.v $(ARGS))
.PHONY: cuda-tests

11 changes: 11 additions & 0 deletions images/gpu/cuda-tests/list_features.cu
@@ -49,4 +49,15 @@ int main(int argc, char *argv[]) {
CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED,
cuda_device));
printFeature("COMPRESSIBLE_MEMORY", isCompressionAvailable != 0);
bool p2pAvailable = false;
int gpuCount = -1;
CHECK_CUDA(cudaGetDeviceCount(&gpuCount));
if (gpuCount >= 2) {
int canAccessAToB = -1;
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessAToB, 0, 1));
int canAccessBToA = -1;
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessBToA, 1, 0));
p2pAvailable = canAccessAToB > 0 && canAccessBToA > 0;
}
printFeature("P2P", p2pAvailable);
}
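The new probe above reports P2P as available only when the first two GPUs can access each other in both directions. As a rough host-side sanity check (standard NVIDIA tooling, not part of this commit), the GPU interconnect topology can be inspected with:

# Print the GPU-to-GPU interconnect matrix; NVLink or PIX/PXB links between
# device 0 and device 1 generally indicate peer access is possible.
nvidia-smi topo -m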
7 changes: 7 additions & 0 deletions images/gpu/cuda-tests/list_features.sh
@@ -20,6 +20,13 @@

set -euo pipefail

if [[ "${NVIDIA_DRIVER_CAPABILITIES:-}" != "all" ]]; then
echo "NVIDIA_DRIVER_CAPABILITIES is not set to 'all'." >&2
echo "It is set to: '${NVIDIA_DRIVER_CAPABILITIES:-}'" >&2
echo "Please set it to 'all' and try again." >&2
exit 1
fi

cd /
nvcc list_features.cu -lcuda -o list_features
./list_features
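The capability check added above means the feature-listing container must be started with all NVIDIA driver capabilities exposed. A minimal sketch of such an invocation using the standard NVIDIA container toolkit (the image name is a placeholder, not taken from this commit):

docker run --rm --gpus all \
  -e NVIDIA_DRIVER_CAPABILITIES=all \
  <cuda-tests-image> /list_features.sh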
28 changes: 18 additions & 10 deletions images/gpu/cuda-tests/run_sample.go
@@ -997,6 +997,9 @@ func (st *SampleTest) RunLibNVVMTest(ctx context.Context) error {
// Main is the main method of this program.
func Main(ctx context.Context) (int, error) {
flag.Parse()
if nvCaps := os.Getenv("NVIDIA_DRIVER_CAPABILITIES"); nvCaps != "all" {
return 1, fmt.Errorf("NVIDIA_DRIVER_CAPABILITIES is not set to 'all' (got %q); please set it to 'all' and try again", nvCaps)
}
cleanupCtx, cleanupCancel := context.WithTimeout(ctx, *timeoutFlag)
defer cleanupCancel()
deadline, _ := cleanupCtx.Deadline()
@@ -1007,15 +1010,15 @@ func Main(ctx context.Context) (int, error) {
defer x.Shutdown(cleanupCtx)
testsCtx, testsCancel := context.WithDeadline(cleanupCtx, deadline.Add(-10*time.Second))
defer testsCancel()
failed := false
numTests := 0
exitCode := 1
var lastErr error
for _, testName := range flag.Args() {
numTests++
st, err := NewSampleTest(testName, x)
if err != nil {
log("> Invalid test %q: %s", testName, err)
failed = true
lastErr = fmt.Errorf("invalid test %q: %w", testName, err)
continue
}
log("> Running test: %s", testName)
@@ -1024,7 +1027,7 @@ func Main(ctx context.Context) (int, error) {
testCancel()
if err != nil {
log("> Test failed: %s (%s)", testName, err)
failed = true
lastErr = fmt.Errorf("test %q failed: %w", testName, err)
if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) && exitErr.ExitCode() > 0 {
exitCode = exitErr.ExitCode()
}
@@ -1035,14 +1038,19 @@ func Main(ctx context.Context) (int, error) {
if numTests == 0 {
return 1, fmt.Errorf("no tests to run, failing vacuously; specify test names as positional arguments")
}
if failed {
if numTests == 1 {
// If there was a single test to run, pass along its error code.
return exitCode, fmt.Errorf("test failed")
}
return 1, errors.New("one or more tests failed")
if lastErr == nil {
return 0, nil
}
if numTests != 1 {
return 1, fmt.Errorf("one or more tests failed (last error: %w)", lastErr)
}
// If there was a single test to run, pass along its error code if it
// had one. (It may not have had one in case the test failed for another
// reason, e.g. error setting up the test prior to running it.)
if exitCode == 0 {
exitCode = 1
}
return 0, nil
return exitCode, fmt.Errorf("test failed: %w", lastErr)
}

func main() {
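With the changes above, Main refuses to run unless NVIDIA_DRIVER_CAPABILITIES is set to "all" and at least one test name is passed as a positional argument. A sketch of an invocation inside the test container (the binary path and sample name are illustrative, not taken from this commit):

NVIDIA_DRIVER_CAPABILITIES=all /run_sample 0_Introduction/vectorAdd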
3 changes: 2 additions & 1 deletion pkg/test/dockerutil/container.go
@@ -1000,7 +1000,8 @@ func (cp *ContainerPool) String() string {
sb.WriteString(", ")
}
status := cp.statuses[container]
sb.WriteString(container.Name)
sb.WriteString("#")
sb.WriteString(strconv.Itoa(i))
sb.WriteString("[")
sb.WriteString(status.state.String())
sb.WriteString("]")
105 changes: 81 additions & 24 deletions test/gpu/cuda_test.go
@@ -23,6 +23,7 @@ import (
"math"
"os"
"runtime"
"slices"
"strconv"
"strings"
"sync"
@@ -73,9 +74,7 @@
var testCompatibility = map[string]Compatibility{
"0_Introduction/simpleAttributes": RequiresFeatures(FeaturePersistentL2Caching),
"0_Introduction/simpleCUDA2GL": RequiresFeatures(FeatureGL),
"0_Introduction/simpleIPC": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"0_Introduction/simpleP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"0_Introduction/vectorAddMMAP": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"0_Introduction/simpleP2P": &RequiresP2P{},
"2_Concepts_and_Techniques/cuHook": &BrokenEverywhere{
Reason: "Requires ancient version of glibc (<=2.33)",
},
@@ -90,12 +89,12 @@
),
"2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop": &OnlyOnWindows{},
"2_Concepts_and_Techniques/streamOrderedAllocationIPC": &BrokenInGVisor{},
"2_Concepts_and_Techniques/streamOrderedAllocationP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"2_Concepts_and_Techniques/streamOrderedAllocationP2P": &RequiresP2P{},
"3_CUDA_Features/bf16TensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"3_CUDA_Features/cdpAdvancedQuicksort": RequiresFeatures(FeatureDynamicParallelism),
"3_CUDA_Features/cudaCompressibleMemory": RequiresFeatures(FeatureCompressibleMemory),
"3_CUDA_Features/dmmaTensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"3_CUDA_Features/memMapIPCDrv": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"3_CUDA_Features/memMapIPCDrv": &RequiresMultiGPU{},
"3_CUDA_Features/tf32TensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"4_CUDA_Libraries/conjugateGradientMultiDeviceCG": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/cudaNvSci": &RequiresNvSci{},
@@ -105,14 +104,14 @@
"4_CUDA_Libraries/cuDLAStandaloneMode": &OnlyOnWindows{},
"4_CUDA_Libraries/cuDLALayerwiseStatsHybrid": &OnlyOnWindows{},
"4_CUDA_Libraries/cuDLALayerwiseStatsStandalone": &OnlyOnWindows{},
"4_CUDA_Libraries/simpleCUFFT_2d_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/simpleCUFFT_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/simpleCUFFT_2d_MGPU": &RequiresMultiGPU{},
"4_CUDA_Libraries/simpleCUFFT_MGPU": &RequiresMultiGPU{},
"5_Domain_Specific/fluidsD3D9": &OnlyOnWindows{},
"5_Domain_Specific/fluidsGL": RequiresFeatures(FeatureGL),
"5_Domain_Specific/fluidsGLES": &OnlyOnWindows{},
"5_Domain_Specific/nbody_opengles": &OnlyOnWindows{},
"5_Domain_Specific/nbody_screen": &OnlyOnWindows{},
"5_Domain_Specific/p2pBandwidthLatencyTest": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"5_Domain_Specific/p2pBandwidthLatencyTest": &RequiresP2P{},
"5_Domain_Specific/postProcessGL": RequiresFeatures(FeatureGL),
"5_Domain_Specific/simpleD3D10": &OnlyOnWindows{},
"5_Domain_Specific/simpleD3D10RenderTarget": &OnlyOnWindows{},
@@ -133,8 +132,11 @@
}

// flakyTests is a list of tests that are flaky.
// These will be retried up to 3 times in parallel before running serially.
var flakyTests = map[string]struct{}{}
// These will be retried up to 3 times in parallel before running 3 times
// serially.
var flakyTests = map[string]struct{}{
"3_CUDA_Features/cdpAdvancedQuicksort": {},
}

// exclusiveTests is a list of tests that must run exclusively (i.e. with
// no other test running on the machine at the same time), or they will
@@ -145,6 +147,13 @@ var flakyTests = map[string]struct{}{}
// causing spurious failures for the tests that happen to be running in
// parallel with them.
var exclusiveTests = map[string]struct{}{
// Can fail due to
// "launch failed because launch would exceed cudaLimitDevRuntimePendingLaunchCount"
// when running in parallel with other tests.
"3_CUDA_Features/cdpAdvancedQuicksort": {},

// Performance-intensive tests that tend to make other concurrent tests
// flake due to their high resource usage.
"6_Performance/alignedTypes": {},
"6_Performance/transpose": {},
"6_Performance/UnifiedMemoryPerf": {},
@@ -153,12 +162,7 @@ var exclusiveTests = map[string]struct{}{
// alwaysSkippedTests don't run at all, ever, and are not verified when
// --cuda_verify_compatibility is set.
// Each test is mapped to a reason why it should be skipped.
var alwaysSkippedTests = map[string]string{
// These tests seem to flake in gVisor, but consistently within the same
// run of the overall test, so they cannot be included in `flakyTests`.
"0_Introduction/simpleAssert": "Flaky in gVisor",
"0_Introduction/simpleAssert_nvrtc": "Flaky in gVisor",
}
var alwaysSkippedTests = map[string]string{}

// Feature is a feature as listed by /list_features.sh.
type Feature string
@@ -170,6 +174,7 @@ const (
FeatureGL Feature = "GL"
FeatureTensorCores Feature = "TENSOR_CORES"
FeatureCompressibleMemory Feature = "COMPRESSIBLE_MEMORY"
FeatureP2P Feature = "P2P"
)

// allFeatures is a list of all CUDA features above.
@@ -179,6 +184,7 @@ var allFeatures = []Feature{
FeatureGL,
FeatureTensorCores,
FeatureCompressibleMemory,
FeatureP2P,
}

// TestEnvironment represents the environment in which a sample test runs.
@@ -228,10 +234,6 @@ type BrokenInGVisor struct {
// This is for tests that can run on a single or multiple GPUs alike,
// but specifically fail in gVisor when run with multiple GPUs.
OnlyWhenMultipleGPU bool

// KnownToHang may be set to true for short tests which can hang instead
// of failing. This avoids waiting ~forever for them to finish.
KnownToHang bool
}

// WillFail implements `Compatibility.WillFail`.
@@ -273,6 +275,34 @@ func (*RequiresMultiGPU) IsExpectedFailure(ctx context.Context, env *TestEnviron
return nil
}

// RequiresP2P implements `Compatibility` for tests that require
// peer-to-peer communication between GPUs.
// Implies RequiresMultiGPU, so tests do not need to specify both.
type RequiresP2P struct{}

// WillFail implements `Compatibility.WillFail`.
func (*RequiresP2P) WillFail(ctx context.Context, env *TestEnvironment) string {
if notEnoughGPUs := (&RequiresMultiGPU{}).WillFail(ctx, env); notEnoughGPUs != "" {
return notEnoughGPUs
}
if hasP2P := env.Features[FeatureP2P]; !hasP2P {
return "Requires P2P support"
}
return ""
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
func (*RequiresP2P) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
if err := (&RequiresMultiGPU{}).IsExpectedFailure(ctx, env, logs, exitCode); err == nil {
return nil
}
const wantLog = "Peer to Peer access is not available amongst GPUs in the system, waiving test"
if strings.Contains(logs, wantLog) {
return nil
}
return fmt.Errorf("exit code %d and logs %q, expected EXIT_WAIVED (%d) or log message %q", exitCode, logs, exitCodeWaived, wantLog)
}

// requiresFeatures implements `Compatibility` for tests that require
// specific features.
type requiresFeatures struct {
@@ -294,7 +324,13 @@ func (r *requiresFeatures) WillFail(ctx context.Context, env *TestEnvironment) s
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
func (*requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
func (r *requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
if slices.Contains(r.features, FeatureGL) && !env.Features[FeatureGL] && strings.Contains(logs, `code=999(cudaErrorUnknown) "cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)"`) {
// Some GL-requiring tests such as `5_Domain_Specific/postProcessGL`
// and `5_Domain_Specific/fluidsGL` will incorrectly detect that GL
// is supported, and fail with this error message rather than waiving.
return nil
}
if exitCode != exitCodeWaived {
return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
}
@@ -396,7 +432,9 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm

// getContainerOpts returns the container run options to run CUDA tests.
func getContainerOpts() (dockerutil.RunOpts, error) {
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
Capabilities: dockerutil.AllGPUCapabilities,
})
if err != nil {
return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err)
}
@@ -444,8 +482,27 @@ func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error)
}
if runtimeIsGVisor {
testLog(t, "Runtime is detected as gVisor")
runtimeArgs, err := dockerutil.RuntimeArgs()
if err != nil {
t.Fatalf("Failed to get runtime arguments: %v", err)
}
foundNVCaps := ""
const nvCapsPrefixFlag = "--nvproxy-allowed-driver-capabilities"
for i, arg := range runtimeArgs {
if strings.HasPrefix(arg, nvCapsPrefixFlag+"=") {
foundNVCaps = strings.TrimPrefix(arg, nvCapsPrefixFlag+"=")
} else if arg == "--nvproxy-allowed-driver-capabilities" && i < len(runtimeArgs)-1 {
foundNVCaps = runtimeArgs[i+1]
}
}
if foundNVCaps == "" {
return nil, fmt.Errorf("did not find --nvproxy-allowed-driver-capabilities=all flag in gVisor runtime arguments, please specify it for this test")
}
if foundNVCaps != "all" {
return nil, fmt.Errorf("found --nvproxy-allowed-driver-capabilities=%q flag in gVisor runtime arguments, please specify --nvproxy-allowed-driver-capabilities=all for this test", foundNVCaps)
}
} else {
testLog(t, "Runtime is detected as not gVisor")
testLog(t, "Runtime is detected as non-gVisor")
}
featuresContainer := dockerutil.MakeContainer(ctx, t)
defer featuresContainer.CleanUp(ctx)
@@ -825,7 +882,7 @@ func TestCUDA(t *testing.T) {
)
}
} else if poolUtilization := cp.Utilization(); poolUtilization < 0.6 {
testLog(t, "WARNING: Pool utilization was only %.1f%%.", poolUtilization*100.0)
testLog(t, "WARNING: Container pool utilization was only %.1f%% during the test.", poolUtilization*100.0)
testLog(t, "This test can be made faster and more efficient with proper test categorization,")
testLog(t, "by identifying flaky tests and exclusive-requiring tests.")
testLog(t, "Consider going over the logs to identify such tests and categorize them accordingly.")
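GetEnvironment now fails unless the gVisor runtime used by Docker was registered with --nvproxy-allowed-driver-capabilities=all, mirroring the Makefile's install_runtime call. A sketch of registering such a runtime by hand, assuming runsc is on PATH (the runtime name and exact install mechanism are assumptions, not taken from this commit):

# Register a runsc runtime in /etc/docker/daemon.json with the flags the test expects.
sudo runsc install --runtime=runsc \
  -- --nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all
sudo systemctl restart docker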
