Skip to content

Commit

Permalink
Merge pull request #198 from klueska/explicit-envvar-for-mask
Browse files Browse the repository at this point in the history
Add explicit envvar to control if we mask /proc/driver/nvidia/params
  • Loading branch information
klueska authored Oct 30, 2024
2 parents dfe844b + 6f2c2aa commit 32805fe
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 9 deletions.
1 change: 1 addition & 0 deletions demo/clusters/kind/install-dra-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJEC
--set deviceClasses="{${deviceClasses}}" \
${NVIDIA_CTK_PATH:+--set nvidiaCtkPath=${NVIDIA_CTK_PATH}} \
${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \
${MASK_NVIDIA_DRIVER_PARAMS:+--set maskNvidiaDriverParams=${MASK_NVIDIA_DRIVER_PARAMS}} \
--wait

set +x
Expand Down
1 change: 0 additions & 1 deletion demo/clusters/nvkind/install-dra-driver.sh

This file was deleted.

22 changes: 22 additions & 0 deletions demo/clusters/nvkind/install-dra-driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

# Copyright 2024 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Absolute path of the directory containing this script, resolved from
# BASH_SOURCE so the sibling kind installer can be found regardless of the
# directory the script is invoked from.
CURRENT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"

# Default MASK_NVIDIA_DRIVER_PARAMS to "true" when it is unset or empty; a
# non-empty value supplied by the caller is left untouched. The expansion is
# quoted so the value is never word-split or glob-expanded as an argument
# to ':'.
: "${MASK_NVIDIA_DRIVER_PARAMS:=true}"
export MASK_NVIDIA_DRIVER_PARAMS

# Hand off to the kind installer, which reads the exported variable. The path
# is quoted so a checkout located under a directory containing spaces or glob
# characters still resolves to a single word.
exec "${CURRENT_DIR}/../kind/install-dra-driver.sh"
15 changes: 7 additions & 8 deletions deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,11 @@ spec:
command: ["bash", "-c"]
args:
- |-
# TODO: Masking of the params file is done below to allow nvkind to
# selectively exclude certain GPUs from being visible to the driver.
# At present, this is only feasible with a host-mounted driver where
# /dev in this container already has GPU devices present (as brought
# in via the --privileged flag from docker/podman when using nvkind).
# In the future we should revisit this to find a more robust method
# of supporting this.
if [ "${NVIDIA_DRIVER_ROOT}" = "/" ]; then
# Conditionally mask the params file to prevent this container from
# recreating any missing GPU device nodes. This is necessary, for
# example, when running under nvkind to limit the set of GPUs governed
# by the plugin even though it has cgroup access to all of them.
if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
cp /proc/driver/nvidia/params root/gpu-params
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
mount --bind root/gpu-params /proc/driver/nvidia/params
Expand All @@ -72,6 +69,8 @@ spec:
resources:
{{- toYaml .Values.kubeletPlugin.containers.plugin.resources | nindent 10 }}
env:
- name: MASK_NVIDIA_DRIVER_PARAMS
value: "{{ .Values.maskNvidiaDriverParams }}"
- name: NVIDIA_CTK_PATH
value: "{{ .Values.nvidiaCtkPath }}"
- name: NVIDIA_DRIVER_ROOT
Expand Down
11 changes: 11 additions & 0 deletions deployments/helm/k8s-dra-driver/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@ allowDefaultNamespace: false

deviceClasses: ["gpu", "mig", "imex"]

# Masking of the params file is typically done to allow nvkind to
# selectively exclude certain GPUs from being visible to the
# underlying GPU driver. Unfortunately, kind doesn't let you choose
# which device nodes to inject into each worker node (they all come in
# via the --privileged flag passed to docker/podman). Because of
# this, all workers see all GPUs by default. By masking the params
# file we can prevent a container from recreating any missing GPU
# device nodes and limit its view to only those device nodes that
# nvkind decided to allow in.
maskNvidiaDriverParams: false

imagePullSecrets: []
image:
repository: nvcr.io/nvidia/cloud-native/k8s-dra-driver
Expand Down

0 comments on commit 32805fe

Please sign in to comment.