Skip to content

Commit

Permalink
Merge pull request #90 from cdesiniotis/mig-manager-custom-nvidia-ctk-path
Browse files Browse the repository at this point in the history

[mig-manager] Add --nvidia-cdi-hook-path flag
  • Loading branch information
cdesiniotis authored Jul 10, 2024
2 parents 3176177 + 52509d0 commit eb3bbe9
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 5 deletions.
11 changes: 10 additions & 1 deletion cmd/nvidia-mig-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ const (
DefaultGPUClientsNamespace = "default"
DefaultNvidiaDriverRoot = "/run/nvidia/driver"
DefaultDriverRootCtrPath = "/run/nvidia/driver"
DefaultNvidiaCDIHookPath = "/usr/local/nvidia/toolkit/nvidia-cdi-hook"
)

var (
Expand All @@ -71,6 +72,7 @@ var (
driverRootCtrPath string
devRoot string
devRootCtrPath string
nvidiaCDIHookPath string
)

type GPUClients struct {
Expand Down Expand Up @@ -250,6 +252,13 @@ func main() {
Destination: &devRootCtrPath,
EnvVars: []string{"DEV_ROOT_CTR_PATH"},
},
&cli.StringFlag{
Name: "nvidia-cdi-hook-path",
Value: DefaultNvidiaCDIHookPath,
Usage: "Path to nvidia-cdi-hook binary on the host.",
Destination: &nvidiaCDIHookPath,
EnvVars: []string{"NVIDIA_CDI_HOOK_PATH"},
},
}

err := c.Run(os.Args)
Expand Down Expand Up @@ -380,7 +389,7 @@ func runScript(migConfigValue string, driverLibraryPath string, nvidiaSMIPath st
"-p", defaultGPUClientsNamespaceFlag,
}
if cdiEnabledFlag {
args = append(args, "-e", "-t", driverRoot, "-a", driverRootCtrPath, "-b", devRoot, "-j", devRootCtrPath, "-l", driverLibraryPath, "-q", nvidiaSMIPath)
args = append(args, "-e", "-t", driverRoot, "-a", driverRootCtrPath, "-b", devRoot, "-j", devRootCtrPath, "-l", driverLibraryPath, "-q", nvidiaSMIPath, "-s", nvidiaCDIHookPath)
}
if withRebootFlag {
args = append(args, "-r")
Expand Down
17 changes: 13 additions & 4 deletions deployments/container/reconfigure-mig.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,14 @@ DEV_ROOT=""
DEV_ROOT_CTR_PATH=""
DRIVER_LIBRARY_PATH=""
NVIDIA_SMI_PATH=""
NVIDIA_CDI_HOOK_PATH=""

export SYSTEMD_LOG_LEVEL="info"

function usage() {
echo "USAGE:"
echo " ${0} -h "
echo " ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [-e -t <driver-root> -a <driver-root-ctr-path> -b <dev-root> -j <dev-root-ctr-path> -l <driver-library-path> -q <nvidia-smi-path> ] [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
echo " ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [-e -t <driver-root> -a <driver-root-ctr-path> -b <dev-root> -j <dev-root-ctr-path> -l <driver-library-path> -q <nvidia-smi-path> -s <nvidia-cdi-hook-path> ] [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
echo ""
echo "OPTIONS:"
echo " -h Display this help message"
Expand All @@ -60,9 +61,10 @@ function usage() {
echo " -j <dev-root-ctr-path> Root path to the NVIDIA device nodes mounted in the container"
echo " -l <driver-library-path> Path to libnvidia-ml.so.1 in the container"
echo " -q <nvidia-smi-path> Path to nvidia-smi in the container"
echo " -s <nvidia-cdi-hook-path> Path to nvidia-cdi-hook on the host"
}

while getopts "hrden:f:c:m:i:o:g:k:p:t:a:b:j:l:q:" opt; do
while getopts "hrden:f:c:m:i:o:g:k:p:t:a:b:j:l:q:s:" opt; do
case ${opt} in
h ) # process option h
usage; exit 0
Expand Down Expand Up @@ -121,7 +123,10 @@ while getopts "hrden:f:c:m:i:o:g:k:p:t:a:b:j:l:q:" opt; do
q ) # process option q
NVIDIA_SMI_PATH=${OPTARG}
;;
\? ) echo "Usage: ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [-e -t <driver-root> -a <driver-root-ctr-path> -b <dev-root> -j <dev-root-ctr-path> -l <driver-library-path> -q <nvidia-smi-path> ] [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
s ) # process option s
NVIDIA_CDI_HOOK_PATH=${OPTARG}
;;
\? ) echo "Usage: ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [-e -t <driver-root> -a <driver-root-ctr-path> -b <dev-root> -j <dev-root-ctr-path> -l <driver-library-path> -q <nvidia-smi-path> -s <nvidia-cdi-hook-path> ] [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
;;
esac
done
Expand Down Expand Up @@ -167,6 +172,10 @@ if [ "${CDI_ENABLED}" = "true" ]; then
usage; exit 1
fi
fi
if [ "${NVIDIA_CDI_HOOK_PATH}" = "" ]; then
echo "Error: missing -s <nvidia-cdi-hook-path> flag"
usage; exit 1
fi
fi

HOST_GPU_CLIENT_SERVICES=(${HOST_GPU_CLIENT_SERVICES//,/ })
Expand Down Expand Up @@ -599,7 +608,7 @@ if [ "${CDI_ENABLED}" = "true" ]; then
--dev-root=${DEV_ROOT_CTR_PATH} \
--vendor="management.nvidia.com" \
--class="gpu" \
--nvidia-ctk-path="/usr/local/nvidia/toolkit/nvidia-ctk" | \
--nvidia-cdi-hook-path=${NVIDIA_CDI_HOOK_PATH} | \
nvidia-ctk cdi transform root \
--from=$DRIVER_ROOT_CTR_PATH \
--to=$DRIVER_ROOT \
Expand Down
9 changes: 9 additions & 0 deletions deployments/systemd/packages/Dockerfile.rpm
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ ARG BASE_IMAGE=undefined
ARG GOLANG_VERSION=undefined
FROM ${BASE_IMAGE} as go-build

# Rewrite the CentOS yum repo definitions in place before any package install:
#   - comment out every "mirrorlist=" entry, and
#   - enable the commented-out "baseurl" entries, retargeting them from
#     mirror.centos.org to vault.centos.org.
# This must run before the `yum install` below so that yum resolves packages
# from the vault host.
# NOTE(review): presumably needed because the base image's CentOS release is
# EOL and no longer served by mirror.centos.org — confirm against BASE_IMAGE.
RUN sed -i -e "s|mirrorlist=|#mirrorlist=|g" \
-e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" \
/etc/yum.repos.d/CentOS-*

RUN yum install -y \
ca-certificates \
gcc \
Expand Down Expand Up @@ -52,6 +56,11 @@ RUN make PREFIX=/artifacts cmds

# build package
FROM ${BASE_IMAGE}

# This stage starts from a fresh ${BASE_IMAGE}, so the yum repo rewrite has to
# be applied again here: comment out "mirrorlist=" entries and switch the
# commented-out mirror.centos.org "baseurl" entries to vault.centos.org, so the
# `yum install -y rpm-build` below can resolve packages.
# NOTE(review): presumably required because the CentOS release in BASE_IMAGE
# is EOL — confirm.
RUN sed -i -e "s|mirrorlist=|#mirrorlist=|g" \
-e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" \
/etc/yum.repos.d/CentOS-*

RUN yum install -y rpm-build

# envs for packaging
Expand Down

0 comments on commit eb3bbe9

Please sign in to comment.