From fec66795a00e19dcd4586db77469f4fafe2f664b Mon Sep 17 00:00:00 2001 From: KeithMnemonic Date: Fri, 2 Aug 2024 11:56:00 -0400 Subject: [PATCH] Add a new nvidia-gpu-reset.target to ensure proper startup order --- deployments/systemd/README.md | 7 +++++++ deployments/systemd/install.sh | 5 +++++ deployments/systemd/nvidia-gpu-reset.target | 16 ++++++++++++++++ deployments/systemd/nvidia-mig-manager.service | 6 +++--- deployments/systemd/uninstall.sh | 4 ++++ 5 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 deployments/systemd/nvidia-gpu-reset.target diff --git a/deployments/systemd/README.md b/deployments/systemd/README.md index 5ffc6aca..463de5c8 100644 --- a/deployments/systemd/README.md +++ b/deployments/systemd/README.md @@ -39,10 +39,17 @@ because it runs a container with `go` in it to download and build the latest `nvidia-mig-parted` before installing it. We plan to relax this requirement in the near future. +**Note:** When the `nvidia-mig-manager.service` is installed, an additional target, +`nvidia-gpu-reset.target`, is also installed. This new target is used to ensure the +`nvidia-mig-manager.service` is started after the `nvidia-fabricmanager.service` and +`nvidia-persistenced.service`. In addition, this target allows applications like DCGM and NVSM +to be started only after the `nvidia-mig-manager.service` has started. 
+ The following files will be added as part of this installation: * `/usr/bin/nvidia-mig-parted` * `/usr/lib/systemd/system/nvidia-mig-manager.service` +* `/usr/lib/systemd/system/nvidia-gpu-reset.target` * `/etc/systemd/system/nvidia-mig-manager.service.d/override.conf` * `/etc/profile.d/nvidia-mig-parted.sh` * `/etc/nvidia-mig-manager/utils.sh` diff --git a/deployments/systemd/install.sh b/deployments/systemd/install.sh index c24d736f..52f82c61 100755 --- a/deployments/systemd/install.sh +++ b/deployments/systemd/install.sh @@ -19,6 +19,9 @@ SERVICE_ROOT="nvidia-mig-manager" SERVICE_NAME="${SERVICE_ROOT}.service" +GPU_TARGET_ROOT="nvidia-gpu-reset" +GPU_TARGET_NAME="${GPU_TARGET_ROOT}.target" + MIG_PARTED_NAME="nvidia-mig-parted" MIG_PARTED_GO_GET_PATH="github.com/NVIDIA/mig-parted/cmd/${MIG_PARTED_NAME}" @@ -52,6 +55,7 @@ ${DOCKER} run --rm \ " cp ${SERVICE_NAME} ${SYSTEMD_DIR} +cp ${GPU_TARGET_NAME} ${SYSTEMD_DIR} cp ${MIG_PARTED_NAME}.sh ${PROFILED_DIR} cp override.conf ${OVERRIDE_DIR} cp service.sh ${CONFIG_DIR} @@ -62,6 +66,7 @@ cp hooks-minimal.yaml ${CONFIG_DIR} cp config-default.yaml ${CONFIG_DIR} chmod a+r ${SYSTEMD_DIR}/${SERVICE_NAME} +chmod a+r ${SYSTEMD_DIR}/${GPU_TARGET_NAME} chmod a+r ${PROFILED_DIR}/${MIG_PARTED_NAME}.sh chmod a+r ${OVERRIDE_DIR}/override.conf chmod a+r ${CONFIG_DIR}/service.sh diff --git a/deployments/systemd/nvidia-gpu-reset.target b/deployments/systemd/nvidia-gpu-reset.target new file mode 100644 index 00000000..b9265d74 --- /dev/null +++ b/deployments/systemd/nvidia-gpu-reset.target @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[Unit] +Description=Nvidia GPU System Target diff --git a/deployments/systemd/nvidia-mig-manager.service b/deployments/systemd/nvidia-mig-manager.service index 26d6318c..13f69bcd 100644 --- a/deployments/systemd/nvidia-mig-manager.service +++ b/deployments/systemd/nvidia-mig-manager.service @@ -15,12 +15,12 @@ [Unit] Description=Configure MIG on NVIDIA GPUs DefaultDependencies=no -After=sysinit.target local-fs.target -Before=basic.target nvidia-persistenced.service systemd-resolved.service +After=nvidia-persistenced.service nvidia-fabricmanager.service +Before=nvidia-gpu-reset.target [Service] Type=oneshot ExecStart=-/bin/bash /etc/nvidia-mig-manager/service.sh [Install] -WantedBy=multi-user.target +WantedBy=multi-user.target nvidia-gpu-reset.target diff --git a/deployments/systemd/uninstall.sh b/deployments/systemd/uninstall.sh index 0ace1866..7d28d92e 100755 --- a/deployments/systemd/uninstall.sh +++ b/deployments/systemd/uninstall.sh @@ -17,6 +17,9 @@ SERVICE_ROOT="nvidia-mig-manager" SERVICE_NAME="${SERVICE_ROOT}.service" +GPU_TARGET_ROOT="nvidia-gpu-reset" +GPU_TARGET_NAME="${GPU_TARGET_ROOT}.target" + MIG_PARTED_NAME="nvidia-mig-parted" MIG_PARTED_GO_GET_PATH="github.com/NVIDIA/mig-parted/cmd/${MIG_PARTED_NAME}" @@ -36,4 +39,5 @@ rm -rf ${OVERRIDE_DIR} rm ${BINARY_DIR}/${MIG_PARTED_NAME} rm ${SYSTEMD_DIR}/${SERVICE_NAME} +rm ${SYSTEMD_DIR}/${GPU_TARGET_NAME} rm ${PROFILED_DIR}/${MIG_PARTED_NAME}.sh