Skip to content

Commit

Permalink
Add GPU driver installation (#497)
Browse files Browse the repository at this point in the history
  • Loading branch information
meetrajvala authored Oct 11, 2024
1 parent 6405dbb commit 796705b
Show file tree
Hide file tree
Showing 17 changed files with 611 additions and 4 deletions.
31 changes: 31 additions & 0 deletions launcher/cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,37 @@ steps:
gcloud builds submit --config=test_oda_with_signed_container.yaml --region us-west1 \
--substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
exit
- name: 'gcr.io/cloud-builders/gcloud'
id: GpuDriverInstallationHardenedImageTests
waitFor: ['HardenedImageBuild']
env:
- 'OUTPUT_IMAGE_PREFIX=$_OUTPUT_IMAGE_PREFIX'
- 'OUTPUT_IMAGE_SUFFIX=$_OUTPUT_IMAGE_SUFFIX'
- 'PROJECT_ID=$PROJECT_ID'
script: |
#!/usr/bin/env bash
cd launcher/image/test
echo "running gpu driver installation image tests on ${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX}"
gcloud builds submit --config=test_gpu_driver_installation_cloudbuild.yaml --region us-west1 \
--substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
exit
# TODO: Enable these tests for debug image once gpu quota is set up for the build project.
# - name: 'gcr.io/cloud-builders/gcloud'
# id: GpuDriverInstallationDebugImageTests
# waitFor: ['DebugImageBuild']
# env:
# - 'OUTPUT_IMAGE_PREFIX=$_OUTPUT_IMAGE_PREFIX'
# - 'OUTPUT_IMAGE_SUFFIX=$_OUTPUT_IMAGE_SUFFIX'
# - 'PROJECT_ID=$PROJECT_ID'
# script: |
# #!/usr/bin/env bash

# cd launcher/image/test
# echo "running gpu driver installation image tests on ${OUTPUT_IMAGE_PREFIX}-debug-${OUTPUT_IMAGE_SUFFIX}"
# gcloud builds submit --config=test_gpu_driver_installation_cloudbuild.yaml --region us-west1 \
# --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-debug-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
# exit
- name: 'gcr.io/cloud-builders/gcloud'
id: MountTests
waitFor: ['HardenedImageBuild']
Expand Down
28 changes: 28 additions & 0 deletions launcher/container_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/google/go-tpm-tools/cel"
"github.com/google/go-tpm-tools/client"
"github.com/google/go-tpm-tools/launcher/agent"
"github.com/google/go-tpm-tools/launcher/internal/gpu"
"github.com/google/go-tpm-tools/launcher/internal/signaturediscovery"
"github.com/google/go-tpm-tools/launcher/internal/systemctl"
"github.com/google/go-tpm-tools/launcher/launcherfile"
Expand Down Expand Up @@ -159,6 +160,33 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
specOpts = append(specOpts, oci.WithDevShmSize(launchSpec.DevShmSize))
}

if launchSpec.Experiments.EnableGpuDriverInstallation && launchSpec.InstallGpuDriver {
gpuMounts := []specs.Mount{
{
Type: "volume",
Source: fmt.Sprintf("%s/lib64", gpu.InstallationHostDir),
Destination: fmt.Sprintf("%s/lib64", gpu.InstallationContainerDir),
Options: []string{"rbind", "rw"},
}, {
Type: "volume",
Source: fmt.Sprintf("%s/bin", gpu.InstallationHostDir),
Destination: fmt.Sprintf("%s/bin", gpu.InstallationContainerDir),
Options: []string{"rbind", "rw"},
},
}
specOpts = append(specOpts, oci.WithMounts(gpuMounts))

gpuDeviceFiles, err := listFilesWithPrefix("/dev", "nvidia")
if err != nil {
return nil, fmt.Errorf("failed to list gpu device files: [%w]", err)
}

for _, deviceFile := range gpuDeviceFiles {
logger.Printf("device file : %s", deviceFile)
specOpts = append(specOpts, oci.WithDevices(deviceFile, deviceFile, "crw-rw-rw-"))
}
}

container, err = cdClient.NewContainer(
ctx,
containerID,
Expand Down
1 change: 1 addition & 0 deletions launcher/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.21

require (
cloud.google.com/go/compute/metadata v0.5.0
cos.googlesource.com/cos/tools.git v0.0.0-20241008015903-8431fe581b1f
github.com/cenkalti/backoff/v4 v4.2.1
github.com/containerd/containerd v1.7.16
github.com/coreos/go-systemd/v22 v22.5.0
Expand Down
2 changes: 2 additions & 0 deletions launcher/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ contrib.go.opencensus.io/exporter/stackdriver v0.13.5/go.mod h1:aXENhDJ1Y4lIg4EU
contrib.go.opencensus.io/exporter/stackdriver v0.13.8/go.mod h1:huNtlWx75MwO7qMs0KrMxPZXzNNWebav1Sq/pm02JdQ=
contrib.go.opencensus.io/integrations/ocsql v0.1.4/go.mod h1:8DsSdjz3F+APR+0z0WkU1aRorQCFfRxvqjUUPMbF3fE=
contrib.go.opencensus.io/resource v0.1.1/go.mod h1:F361eGI91LCmW1I/Saf+rX0+OFcigGlFvXwEGEnkRLA=
cos.googlesource.com/cos/tools.git v0.0.0-20241008015903-8431fe581b1f h1:ZW+Ej7pWHJiyWX5HEUAtI//+WnwWTO5ar8O50BHzq7A=
cos.googlesource.com/cos/tools.git v0.0.0-20241008015903-8431fe581b1f/go.mod h1:kR1xqosfojOExETz3TTtfhKKruTPF9W5EJmcn3a4JVI=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 h1:bvDV9vkmnHYOMsOr4WLk+Vo07yKIzd94sVoIqshQ4bU=
github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8=
Expand Down
97 changes: 97 additions & 0 deletions launcher/image/test/create_gpu_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/bin/bash
# Creates a Shielded VM with attached GPU(s) through gcloud.
#
# OPTIND is intentionally not declared with `local` here: `local` is only
# valid inside a function, and bash already initialises OPTIND to 1 each time
# a script is invoked, so getopts starts from the first argument.
set -euxo pipefail

# Prints every flag accepted by this script and exits with status 1.
# Invoked by the getopts loop when an unrecognised flag is passed.
print_usage() {
  echo "usage: create_gpu_vm.sh [-i imageName] [-p imageProject] [-m metadata] [-f metadataFromFile] [-n instanceName] [-z instanceZone] [-v machineType] [-g gpuType] [-c gpuCount]"
  echo " -i <imageName>: which image name to use for the VM"
  echo " -p <imageProject>: which image project to use for the VM"
  echo " -m <metadata>: metadata variables on VM creation; passed directly into gcloud"
  echo " -f <metadataFromFile>: read a metadata value from a file; specified in format key=filePath"
  echo " -n <instanceName>: instance name"
  echo " -z <instanceZone>: instance zone"
  echo " -v <machineType>: type of machine for VM"
  echo " -g <gpuType>: type of GPU to use for the VM"
  echo " -c <gpuCount>: number of GPU(s) to use for the VM"
  exit 1
}

# Validates the required settings and creates the GPU VM with gcloud.
#
# Reads the script-level variables filled in by the flag parsing:
#   required: IMAGE_NAME, GPU_TYPE, GPU_COUNT, MACHINE_TYPE, VM_NAME
#   also used: ZONE, PROJECT_NAME, DISK_SIZE_GB
#   optional: METADATA, METADATA_FILE (only forwarded when non-empty)
# Exits with status 1 when a required value is empty.
create_vm() {
  if [ -z "$IMAGE_NAME" ]; then
    echo "Empty image name supplied."
    exit 1
  fi

  if [ -z "$GPU_TYPE" ]; then
    echo "Empty gpu type supplied."
    exit 1
  fi

  if [ -z "$GPU_COUNT" ]; then
    echo "Empty gpu count supplied."
    exit 1
  fi

  if [ -z "$MACHINE_TYPE" ]; then
    echo "Empty machine type supplied."
    exit 1
  fi

  # Without a name gcloud would treat the first flag as the instance name
  # position and fail with a confusing error, so reject it up front.
  if [ -z "$VM_NAME" ]; then
    echo "Empty instance name supplied."
    exit 1
  fi

  echo "Creating VM ${VM_NAME} with image ${IMAGE_NAME}"

  # check the active account
  gcloud auth list

  # Build the argument list as an array so each value stays a single word
  # (no word-splitting or globbing) and optional flags are simply appended.
  local create_args=(
    compute instances create "$VM_NAME"
    --maintenance-policy=TERMINATE
    "--machine-type=${MACHINE_TYPE}"
    "--boot-disk-size=${DISK_SIZE_GB}"
    "--accelerator=count=${GPU_COUNT},type=${GPU_TYPE}"
    --scopes=cloud-platform
    "--zone=${ZONE}"
    "--image=${IMAGE_NAME}"
    "--image-project=${PROJECT_NAME}"
    --shielded-secure-boot
  )

  if [ -n "$METADATA" ]; then
    create_args+=(--metadata "$METADATA")
  fi

  if [ -n "$METADATA_FILE" ]; then
    create_args+=(--metadata-from-file "$METADATA_FILE")
  fi

  gcloud "${create_args[@]}"
}

# Defaults. Only the boot disk size has a fixed value; everything else is
# expected to arrive through a command-line flag.
DISK_SIZE_GB=100
IMAGE_NAME=''
PROJECT_NAME=''
METADATA=''
METADATA_FILE=''
VM_NAME=''
ZONE=''
MACHINE_TYPE=''
GPU_TYPE=''
GPU_COUNT=''

# getopts optstring: a ':' after a letter marks a flag that takes an argument
# (for example "i:" means -i expects a value, delivered in OPTARG).
while getopts 'i:f:m:p:n:z:v:g:c:' opt; do
  case "${opt}" in
    i) IMAGE_NAME="${OPTARG}" ;;
    p) PROJECT_NAME="${OPTARG}" ;;
    m) METADATA="${OPTARG}" ;;
    f) METADATA_FILE="${OPTARG}" ;;
    n) VM_NAME="${OPTARG}" ;;
    z) ZONE="${OPTARG}" ;;
    v) MACHINE_TYPE="${OPTARG}" ;;
    g) GPU_TYPE="${OPTARG}" ;;
    c) GPU_COUNT="${OPTARG}" ;;
    *) print_usage ;;
  esac
done

create_vm
17 changes: 17 additions & 0 deletions launcher/image/test/scripts/gpu/test_gpu_nogpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
set -euo pipefail
source util/read_serial.sh

# Passes when the serial console of a VM created without a GPU contains the
# 'failed to get the gpu type info' message.
# Arguments $1 and $2 are forwarded to read_serial; the Cloud Build config
# passes the VM name and the zone.
SERIAL_OUTPUT=$(read_serial "$1" "$2")
print_serial=false

# Left unquoted: word splitting flattens the console output onto one line
# before grep sees it, so a phrase broken across lines still matches.
if echo $SERIAL_OUTPUT | grep -q 'failed to get the gpu type info'
then
  echo "- no gpu verified"
else
  echo "FAILED: expected 'failed to get the gpu type info' message not found in serial output"
  echo 'TEST FAILED.' > /workspace/status.txt
  print_serial=true
fi

# Dump the serial console on failure so the build log shows what happened.
if [ "$print_serial" = true ]; then
  echo "$SERIAL_OUTPUT"
fi
17 changes: 17 additions & 0 deletions launcher/image/test/scripts/gpu/test_gpu_unsupported_gputype.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
set -euo pipefail
source util/read_serial.sh

# Passes when the serial console of a VM created with an unsupported GPU type
# contains the 'unsupported gpu type' message.
# Arguments $1 and $2 are forwarded to read_serial; the Cloud Build config
# passes the VM name and the zone.
SERIAL_OUTPUT=$(read_serial "$1" "$2")
print_serial=false

# Left unquoted: word splitting flattens the console output onto one line
# before grep sees it, so a phrase broken across lines still matches.
if echo $SERIAL_OUTPUT | grep -q 'unsupported gpu type'
then
  echo "- unsupported gpu types verified"
else
  echo "FAILED: expected 'unsupported gpu type' message not found in serial output"
  echo 'TEST FAILED.' > /workspace/status.txt
  print_serial=true
fi

# Dump the serial console on failure so the build log shows what happened.
if [ "$print_serial" = true ]; then
  echo "$SERIAL_OUTPUT"
fi
17 changes: 17 additions & 0 deletions launcher/image/test/scripts/gpu/test_gpu_workload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
set -euo pipefail
source util/read_serial.sh

# Passes when the serial console of the GPU VM contains 'Test PASSED',
# i.e. the workload ran and printed its success message.
# Arguments $1 and $2 are forwarded to read_serial; the Cloud Build config
# passes the VM name and the zone.
SERIAL_OUTPUT=$(read_serial "$1" "$2")
print_serial=false

# Left unquoted: word splitting flattens the console output onto one line
# before grep sees it, so a phrase broken across lines still matches.
if echo $SERIAL_OUTPUT | grep -q 'Test PASSED'
then
  echo "- gpu workload running verified"
else
  echo "FAILED: gpu workload not running"
  echo 'TEST FAILED.' > /workspace/status.txt
  print_serial=true
fi

# Dump the serial console on failure so the build log shows what happened.
if [ "$print_serial" = true ]; then
  echo "$SERIAL_OUTPUT"
fi
112 changes: 112 additions & 0 deletions launcher/image/test/test_gpu_driver_installation_cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
substitutions:
'_IMAGE_NAME': ''
'_IMAGE_PROJECT': ''
'_CLEANUP': 'true'
'_VM_NAME_PREFIX': 'cs-gpu-test'
'_ZONE': 'us-central1-f'
'_WORKLOAD_IMAGE': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/gpu/cuda-vector-add:latest'
steps:
- name: 'gcr.io/cloud-builders/gcloud'
id: CreateShieldedVMWithSingleGPU
entrypoint: 'bash'
env:
- 'BUILD_ID=$BUILD_ID'
args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
'-p', '${_IMAGE_PROJECT}',
'-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
'-n', '${_VM_NAME_PREFIX}-${BUILD_ID}',
'-z', '${_ZONE}',
'-v', 'n1-standard-4',
'-g', 'nvidia-tesla-t4',
'-c', '1'
]
- name: 'gcr.io/cloud-builders/gcloud'
id: CreateShieldedVMWithMultipleGPU
entrypoint: 'bash'
env:
- 'BUILD_ID=$BUILD_ID'
args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
'-p', '${_IMAGE_PROJECT}',
'-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
'-n', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul',
'-z', '${_ZONE}',
'-v', 'n1-standard-4',
'-g', 'nvidia-tesla-t4',
'-c', '2'
]
- name: 'gcr.io/cloud-builders/gcloud'
id: CreateShieldedVMWithUnsupportedGPU
entrypoint: 'bash'
env:
- 'BUILD_ID=$BUILD_ID'
args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
'-p', '${_IMAGE_PROJECT}',
'-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
'-n', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup',
'-z', '${_ZONE}',
'-v', 'n1-standard-4',
'-g', 'nvidia-tesla-p100',
'-c', '1'
]
- name: 'gcr.io/cloud-builders/gcloud'
id: CreateShieldedVMWithNoGPU
entrypoint: 'bash'
env:
- 'BUILD_ID=$BUILD_ID'
args: ['create_vm.sh','-i', '${_IMAGE_NAME}',
'-p', '${_IMAGE_PROJECT}',
'-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
'-n', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu',
'-z', '${_ZONE}',
]
- name: 'gcr.io/cloud-builders/gcloud'
id: SingleGpuWorkloadTest
entrypoint: 'bash'
args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: MultipleGpuWorkloadTest
entrypoint: 'bash'
args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: UnsupportedGpuWorkloadTest
entrypoint: 'bash'
args: ['scripts/gpu/test_gpu_unsupported_gputype.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: NoGpuWorkloadTest
entrypoint: 'bash'
args: ['scripts/gpu/test_gpu_nogpu.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: SingleGpuCleanUp
entrypoint: 'bash'
env:
- 'CLEANUP=$_CLEANUP'
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: MultipleGpuCleanUp
entrypoint: 'bash'
env:
- 'CLEANUP=$_CLEANUP'
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: UnsupportedGpuVmCleanUp
entrypoint: 'bash'
env:
- 'CLEANUP=$_CLEANUP'
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: NoGpuVmCleanUp
entrypoint: 'bash'
env:
- 'CLEANUP=$_CLEANUP'
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu', '${_ZONE}']
# Must come after cleanup.
- name: 'gcr.io/cloud-builders/gcloud'
id: NoGpuVmCheckFailure
entrypoint: 'bash'
env:
- 'BUILD_ID=$BUILD_ID'
args: ['check_failure.sh']

# options:
# pool:
# name: 'projects/confidential-space-images-dev/locations/us-west1/workerPools/cs-image-build-vpc'
7 changes: 7 additions & 0 deletions launcher/image/testworkloads/gpu/cuda-vector-add/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# From current directory:
# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/gpu/cuda-vector-add:latest --project confidential-space-images-dev
FROM gcr.io/google_containers/cuda-vector-add:v0.1

LABEL "tee.launch_policy.allow_env_override"="ALLOWED_OVERRIDE"
LABEL "tee.launch_policy.allow_cmd_override"="true"
LABEL "tee.launch_policy.log_redirect"="always"
5 changes: 3 additions & 2 deletions launcher/internal/experiments/experiments.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ import (
// Failure to unmarshal the experiment JSON data will result in an empty object being returned
// to treat experiment flags as their default value. The error should still be checked.
type Experiments struct {
	// EnableTestFeatureForImage is an experiment flag; its consumer is not
	// visible here (presumably a test-only toggle — TODO confirm).
	EnableTestFeatureForImage bool

	// EnableTempFSMount is an experiment flag; presumably gates tmpfs mount
	// support — TODO confirm against its callers.
	EnableTempFSMount bool

	// EnableGpuDriverInstallation gates the GPU driver installation feature.
	// The container runner only adds the GPU mounts and device files when this
	// flag and the launch spec's InstallGpuDriver setting are both true.
	EnableGpuDriverInstallation bool
}

// New takes a filepath, opens the file, and calls ReadJsonInput with the contents
Expand Down
8 changes: 8 additions & 0 deletions launcher/internal/gpu/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package gpu

// Locations of the gpu driver installation on the host and in the workload container.
const (
	// InstallationHostDir is the host-side directory into which the gpu drivers get installed.
	InstallationHostDir = "/var/lib/nvidia"
	// InstallationContainerDir is the path under which the workload container sees the gpu drivers.
	InstallationContainerDir = "/usr/local/nvidia"
)
Loading

0 comments on commit 796705b

Please sign in to comment.