Add GPU driver installation (#497)

google · Oct 11, 2024 · 796705b · 796705b
1 parent 6405dbb
commit 796705b
Show file tree

Hide file tree

Showing 17 changed files with 611 additions and 4 deletions.
diff --git a/launcher/cloudbuild.yaml b/launcher/cloudbuild.yaml
@@ -242,6 +242,37 @@ steps:
     gcloud builds submit --config=test_oda_with_signed_container.yaml --region us-west1 \
       --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
     exit
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: GpuDriverInstallationHardenedImageTests
+  waitFor: ['HardenedImageBuild']
+  env:
+  - 'OUTPUT_IMAGE_PREFIX=$_OUTPUT_IMAGE_PREFIX'
+  - 'OUTPUT_IMAGE_SUFFIX=$_OUTPUT_IMAGE_SUFFIX'
+  - 'PROJECT_ID=$PROJECT_ID'
+  script: |
+    #!/usr/bin/env bash
+
+    cd launcher/image/test
+    echo "running gpu driver installation image tests on ${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX}"
+    gcloud builds submit --config=test_gpu_driver_installation_cloudbuild.yaml --region us-west1 \
+      --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
+    exit
+# TODO: Enable these tests for debug image once gpu quota is set up for the build project.
+# - name: 'gcr.io/cloud-builders/gcloud'
+#   id: GpuDriverInstallationDebugImageTests
+#   waitFor: ['DebugImageBuild']
+#   env:
+#   - 'OUTPUT_IMAGE_PREFIX=$_OUTPUT_IMAGE_PREFIX'
+#   - 'OUTPUT_IMAGE_SUFFIX=$_OUTPUT_IMAGE_SUFFIX'
+#   - 'PROJECT_ID=$PROJECT_ID'
+#   script: |
+#     #!/usr/bin/env bash
+
+#     cd launcher/image/test
+#     echo "running gpu driver installation image tests on ${OUTPUT_IMAGE_PREFIX}-debug-${OUTPUT_IMAGE_SUFFIX}"
+#     gcloud builds submit --config=test_gpu_driver_installation_cloudbuild.yaml --region us-west1 \
+#       --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-debug-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
+#     exit
 - name: 'gcr.io/cloud-builders/gcloud'
   id: MountTests
   waitFor: ['HardenedImageBuild']

diff --git a/launcher/container_runner.go b/launcher/container_runner.go
@@ -29,6 +29,7 @@ import (
 	"github.com/google/go-tpm-tools/cel"
 	"github.com/google/go-tpm-tools/client"
 	"github.com/google/go-tpm-tools/launcher/agent"
+	"github.com/google/go-tpm-tools/launcher/internal/gpu"
 	"github.com/google/go-tpm-tools/launcher/internal/signaturediscovery"
 	"github.com/google/go-tpm-tools/launcher/internal/systemctl"
 	"github.com/google/go-tpm-tools/launcher/launcherfile"
@@ -159,6 +160,33 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
 		specOpts = append(specOpts, oci.WithDevShmSize(launchSpec.DevShmSize))
 	}
 
+	if launchSpec.Experiments.EnableGpuDriverInstallation && launchSpec.InstallGpuDriver {
+		gpuMounts := []specs.Mount{
+			{
+				Type:        "volume",
+				Source:      fmt.Sprintf("%s/lib64", gpu.InstallationHostDir),
+				Destination: fmt.Sprintf("%s/lib64", gpu.InstallationContainerDir),
+				Options:     []string{"rbind", "rw"},
+			}, {
+				Type:        "volume",
+				Source:      fmt.Sprintf("%s/bin", gpu.InstallationHostDir),
+				Destination: fmt.Sprintf("%s/bin", gpu.InstallationContainerDir),
+				Options:     []string{"rbind", "rw"},
+			},
+		}
+		specOpts = append(specOpts, oci.WithMounts(gpuMounts))
+
+		gpuDeviceFiles, err := listFilesWithPrefix("/dev", "nvidia")
+		if err != nil {
+			return nil, fmt.Errorf("failed to list gpu device files: [%w]", err)
+		}
+
+		for _, deviceFile := range gpuDeviceFiles {
+			logger.Printf("device file : %s", deviceFile)
+			specOpts = append(specOpts, oci.WithDevices(deviceFile, deviceFile, "crw-rw-rw-"))
+		}
+	}
+
 	container, err = cdClient.NewContainer(
 		ctx,
 		containerID,

diff --git a/launcher/go.mod b/launcher/go.mod
@@ -4,6 +4,7 @@ go 1.21
 
 require (
 	cloud.google.com/go/compute/metadata v0.5.0
+	cos.googlesource.com/cos/tools.git v0.0.0-20241008015903-8431fe581b1f
 	github.com/cenkalti/backoff/v4 v4.2.1
 	github.com/containerd/containerd v1.7.16
 	github.com/coreos/go-systemd/v22 v22.5.0

diff --git a/launcher/go.sum b/launcher/go.sum
@@ -69,6 +69,8 @@ contrib.go.opencensus.io/exporter/stackdriver v0.13.5/go.mod h1:aXENhDJ1Y4lIg4EU
 contrib.go.opencensus.io/exporter/stackdriver v0.13.8/go.mod h1:huNtlWx75MwO7qMs0KrMxPZXzNNWebav1Sq/pm02JdQ=
 contrib.go.opencensus.io/integrations/ocsql v0.1.4/go.mod h1:8DsSdjz3F+APR+0z0WkU1aRorQCFfRxvqjUUPMbF3fE=
 contrib.go.opencensus.io/resource v0.1.1/go.mod h1:F361eGI91LCmW1I/Saf+rX0+OFcigGlFvXwEGEnkRLA=
+cos.googlesource.com/cos/tools.git v0.0.0-20241008015903-8431fe581b1f h1:ZW+Ej7pWHJiyWX5HEUAtI//+WnwWTO5ar8O50BHzq7A=
+cos.googlesource.com/cos/tools.git v0.0.0-20241008015903-8431fe581b1f/go.mod h1:kR1xqosfojOExETz3TTtfhKKruTPF9W5EJmcn3a4JVI=
 dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
 github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 h1:bvDV9vkmnHYOMsOr4WLk+Vo07yKIzd94sVoIqshQ4bU=
 github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8=

diff --git a/launcher/image/test/create_gpu_vm.sh b/launcher/image/test/create_gpu_vm.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+local OPTIND
+set -euxo pipefail
+
+# Prints the supported command-line flags and exits with status 1.
+print_usage() {
+    echo "usage: create_gpu_vm.sh [-i imageName] [-p imageProject] [-m metadata] [-f metadataFromFile] [-n instanceName] [-z instanceZone] [-v machineType] [-g gpuType] [-c gpuCount]"
+    echo "  -i <imageName>: which image name to use for the VM"
+    echo "  -p <imageProject>: which image project to use for the VM"
+    echo "  -m <metadata>: metadata variables on VM creation; passed directly into gcloud"
+    echo "  -f <metadataFromFile>: read a metadata value from a file; specified in format key=filePath"
+    echo "  -n <instanceName>: instance name"
+    echo "  -z <instanceZone>: instance zone"
+    echo "  -v <machineType>: type of machine for VM"
+    echo "  -g <gpuType>: type of GPU to use for the VM"
+    echo "  -c <gpuCount>: number of GPU(s) to use for the VM"
+    exit 1
+}
+
+create_vm() {
+  if [ -z "$IMAGE_NAME" ]; then
+    echo "Empty image name supplied."
+    exit 1
+  fi
+
+  if [ -z "$GPU_TYPE" ]; then
+    echo "Empty gpu type supplied."
+    exit 1
+  fi
+
+  if [ -z "$GPU_COUNT" ]; then
+    echo "Empty gpu count supplied."
+    exit 1
+  fi
+
+  if [ -z "$MACHINE_TYPE" ]; then
+    echo "Empty machine type supplied."
+    exit 1
+  fi
+
+  APPEND_METADATA=''
+  if ! [ -z "$METADATA" ]; then
+    APPEND_METADATA="--metadata ${METADATA}"
+  fi
+
+  APPEND_METADATA_FILE=''
+  if ! [ -z "$METADATA_FILE" ]; then
+    APPEND_METADATA_FILE="--metadata-from-file ${METADATA_FILE}"
+  fi
+
+  echo 'Creating VM' ${VM_NAME} 'with image' $IMAGE_NAME
+
+  # check the active account
+  gcloud auth list
+
+  gcloud compute instances create $VM_NAME \
+    --maintenance-policy=TERMINATE \
+    --machine-type=$MACHINE_TYPE \
+    --boot-disk-size=$DISK_SIZE_GB \
+    --accelerator=count=$GPU_COUNT,type=$GPU_TYPE \
+    --scopes=cloud-platform \
+    --zone=$ZONE \
+    --image=$IMAGE_NAME \
+    --image-project=$PROJECT_NAME \
+    --shielded-secure-boot $APPEND_METADATA \
+    $APPEND_METADATA_FILE
+}
+
+# Option defaults: everything starts empty except the boot disk size, which
+# is passed to gcloud as --boot-disk-size.
+DISK_SIZE_GB=100
+IMAGE_NAME=""
+PROJECT_NAME=""
+METADATA=""
+METADATA_FILE=""
+VM_NAME=""
+ZONE=""
+MACHINE_TYPE=""
+GPU_TYPE=""
+GPU_COUNT=""
+
+
+# Parse the command line. Every letter in the optstring is followed by ':',
+# so each flag consumes one argument, which getopts exposes as OPTARG.
+# Anything unrecognised falls through to print_usage.
+while getopts 'i:f:m:p:n:z:v:g:c:' opt; do
+  case "$opt" in
+    i) IMAGE_NAME="$OPTARG" ;;
+    p) PROJECT_NAME="$OPTARG" ;;
+    m) METADATA="$OPTARG" ;;
+    f) METADATA_FILE="$OPTARG" ;;
+    n) VM_NAME="$OPTARG" ;;
+    z) ZONE="$OPTARG" ;;
+    v) MACHINE_TYPE="$OPTARG" ;;
+    g) GPU_TYPE="$OPTARG" ;;
+    c) GPU_COUNT="$OPTARG" ;;
+    *) print_usage ;;
+  esac
+done
+
+create_vm
diff --git a/launcher/image/test/scripts/gpu/test_gpu_nogpu.sh b/launcher/image/test/scripts/gpu/test_gpu_nogpu.sh
@@ -0,0 +1,17 @@
+ #!/bin/bash
+set -euo pipefail
+source util/read_serial.sh
+
+# This test requires the workload to run and printing
+# corresponding messages to the serial console.
+SERIAL_OUTPUT=$(read_serial $1 $2) 
+print_serial=false
+
+if echo $SERIAL_OUTPUT | grep -q 'failed to get the gpu type info'
+then
+    echo "- no gpu verified"
+else
+    echo "FAILED: gpu not detected"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_serial=true
+fi
diff --git a/launcher/image/test/scripts/gpu/test_gpu_unsupported_gputype.sh b/launcher/image/test/scripts/gpu/test_gpu_unsupported_gputype.sh
@@ -0,0 +1,17 @@
+ #!/bin/bash
+set -euo pipefail
+source util/read_serial.sh
+
+# This test requires the workload to run and printing
+# corresponding messages to the serial console.
+SERIAL_OUTPUT=$(read_serial $1 $2) 
+print_serial=false
+
+if echo $SERIAL_OUTPUT | grep -q 'unsupported gpu type'
+then
+    echo "- unsupported gpu types verified"
+else
+    echo "FAILED: gpu type is not supported"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_serial=true
+fi
diff --git a/launcher/image/test/scripts/gpu/test_gpu_workload.sh b/launcher/image/test/scripts/gpu/test_gpu_workload.sh
@@ -0,0 +1,17 @@
+ #!/bin/bash
+set -euo pipefail
+source util/read_serial.sh
+
+# This test requires the workload to run and printing
+# corresponding messages to the serial console.
+SERIAL_OUTPUT=$(read_serial $1 $2) 
+print_serial=false
+
+if echo $SERIAL_OUTPUT | grep -q 'Test PASSED'
+then
+    echo "- gpu workload running verified"
+else
+    echo "FAILED: gpu workload not running"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_serial=true
+fi
diff --git a/launcher/image/test/test_gpu_driver_installation_cloudbuild.yaml b/launcher/image/test/test_gpu_driver_installation_cloudbuild.yaml
@@ -0,0 +1,112 @@
+substitutions:
+  '_IMAGE_NAME': ''
+  '_IMAGE_PROJECT': ''
+  '_CLEANUP': 'true'
+  '_VM_NAME_PREFIX': 'cs-gpu-test'
+  '_ZONE': 'us-central1-f'
+  '_WORKLOAD_IMAGE': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/gpu/cuda-vector-add:latest'
+steps:
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: CreateShieldedVMWithSingleGPU
+  entrypoint: 'bash'
+  env:
+  - 'BUILD_ID=$BUILD_ID'
+  args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
+          '-p', '${_IMAGE_PROJECT}',
+          '-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
+          '-n', '${_VM_NAME_PREFIX}-${BUILD_ID}',
+          '-z', '${_ZONE}',
+          '-v', 'n1-standard-4',
+          '-g', 'nvidia-tesla-t4',
+          '-c', '1'
+        ]
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: CreateShieldedVMWithMultipleGPU
+  entrypoint: 'bash'
+  env:
+  - 'BUILD_ID=$BUILD_ID'
+  args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
+          '-p', '${_IMAGE_PROJECT}',
+          '-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
+          '-n', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul',
+          '-z', '${_ZONE}',
+          '-v', 'n1-standard-4',
+          '-g', 'nvidia-tesla-t4',
+          '-c', '2'
+        ]
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: CreateShieldedVMWithUnsupportedGPU
+  entrypoint: 'bash'
+  env:
+  - 'BUILD_ID=$BUILD_ID'
+  args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
+          '-p', '${_IMAGE_PROJECT}',
+          '-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
+          '-n', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup',
+          '-z', '${_ZONE}',
+          '-v', 'n1-standard-4',
+          '-g', 'nvidia-tesla-p100',
+          '-c', '1'
+        ]
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: CreateShieldedVMWithNoGPU
+  entrypoint: 'bash'
+  env:
+  - 'BUILD_ID=$BUILD_ID'
+  args: ['create_vm.sh','-i', '${_IMAGE_NAME}',
+          '-p', '${_IMAGE_PROJECT}',
+          '-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
+          '-n', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu',
+          '-z', '${_ZONE}',
+        ]
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: SingleGpuWorkloadTest
+  entrypoint: 'bash'
+  args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: MultipleGpuWorkloadTest
+  entrypoint: 'bash'
+  args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}']
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: UnsupportedGpuWorkloadTest
+  entrypoint: 'bash'
+  args: ['scripts/gpu/test_gpu_unsupported_gputype.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup', '${_ZONE}']
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: NoGpuWorkloadTest
+  entrypoint: 'bash'
+  args: ['scripts/gpu/test_gpu_nogpu.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu', '${_ZONE}']
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: SingleGpuCleanUp
+  entrypoint: 'bash'
+  env:
+  - 'CLEANUP=$_CLEANUP'
+  args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: MultipleGpuCleanUp
+  entrypoint: 'bash'
+  env:
+  - 'CLEANUP=$_CLEANUP'
+  args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}']
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: UnsupportedGpuVmCleanUp
+  entrypoint: 'bash'
+  env:
+  - 'CLEANUP=$_CLEANUP'
+  args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup', '${_ZONE}']
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: NoGpuVmCleanUp
+  entrypoint: 'bash'
+  env:
+  - 'CLEANUP=$_CLEANUP'
+  args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu', '${_ZONE}']
+# Must come after cleanup.
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: NoGpuVmCheckFailure
+  entrypoint: 'bash'
+  env:
+  - 'BUILD_ID=$BUILD_ID'
+  args: ['check_failure.sh']
+
+# options:
+#   pool:
+#     name: 'projects/confidential-space-images-dev/locations/us-west1/workerPools/cs-image-build-vpc'
diff --git a/launcher/image/testworkloads/gpu/cuda-vector-add/Dockerfile b/launcher/image/testworkloads/gpu/cuda-vector-add/Dockerfile
@@ -0,0 +1,7 @@
+# From current directory:
+# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/gpu/cuda-vector-add:latest --project confidential-space-images-dev
+FROM gcr.io/google_containers/cuda-vector-add:v0.1
+
+LABEL "tee.launch_policy.allow_env_override"="ALLOWED_OVERRIDE"
+LABEL "tee.launch_policy.allow_cmd_override"="true"
+LABEL "tee.launch_policy.log_redirect"="always"
diff --git a/launcher/internal/experiments/experiments.go b/launcher/internal/experiments/experiments.go
@@ -11,8 +11,9 @@ import (
 // Failure to unmarshal the experiment JSON data will result in an empty object being returned
 // to treat experiment flags as their default value. The error should still be checked.
 type Experiments struct {
-	EnableTestFeatureForImage bool
-	EnableTempFSMount         bool
+	EnableTestFeatureForImage   bool
+	EnableTempFSMount           bool
+	// EnableGpuDriverInstallation gates the GPU setup in NewRunner: the GPU
+	// mounts and /dev/nvidia* devices are added only when this flag and the
+	// launch spec's InstallGpuDriver are both true.
+	EnableGpuDriverInstallation bool
 }
 
 // New takes a filepath, opens the file, and calls ReadJsonInput with the contents

diff --git a/launcher/internal/gpu/config.go b/launcher/internal/gpu/config.go
@@ -0,0 +1,8 @@
+package gpu
+
+const (
+	// InstallationHostDir is the directory where gpu drivers will be installed on the host machine.
+	// NewRunner mounts its lib64 and bin subdirectories (rbind, rw) into the workload container.
+	InstallationHostDir = "/var/lib/nvidia"
+	// InstallationContainerDir is the directory where gpu drivers will be available in the workload container.
+	// It is the mount destination for the lib64 and bin subdirectories of InstallationHostDir.
+	InstallationContainerDir = "/usr/local/nvidia"
+)