Skip to content

Commit

Permalink
Add a new nvidia-gpu-services.target to ensure proper startup order
Browse files Browse the repository at this point in the history
  • Loading branch information
KeithMnemonic committed Aug 12, 2024
1 parent f4acac9 commit ac269f9
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 3 deletions.
7 changes: 7 additions & 0 deletions deployments/systemd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,17 @@ because it runs a container with `go` in it to download and build the latest
`nvidia-mig-parted` before installing it. We plan to relax this requirement in
the near future.

**Note:** When the `nvidia-mig-manager.service` is installed, an additional target,
`nvidia-gpu-reset.target`, is also installed. This target is used to ensure that
`nvidia-mig-manager.service` is started after `nvidia-fabricmanager.service` and
`nvidia-persistenced.service`. In addition, this target allows applications like dcgm
and nvsm to be started only after `nvidia-mig-manager.service` has started.

The following files will be added as part of this installation:

* `/usr/bin/nvidia-mig-parted`
* `/usr/lib/systemd/system/nvidia-mig-manager.service`
* `/usr/lib/systemd/system/nvidia-gpu-reset.target`
* `/etc/systemd/system/nvidia-mig-manager.service.d/override.conf`
* `/etc/profile.d/nvidia-mig-parted.sh`
* `/etc/nvidia-mig-manager/utils.sh`
Expand Down
5 changes: 5 additions & 0 deletions deployments/systemd/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
SERVICE_ROOT="nvidia-mig-manager"
SERVICE_NAME="${SERVICE_ROOT}.service"

GPU_TARGET_ROOT="nvidia-gpu-reset"
GPU_TARGET_NAME="${GPU_TARGET_ROOT}.target"

MIG_PARTED_NAME="nvidia-mig-parted"
MIG_PARTED_GO_GET_PATH="github.com/NVIDIA/mig-parted/cmd/${MIG_PARTED_NAME}"

Expand Down Expand Up @@ -52,6 +55,7 @@ ${DOCKER} run --rm \
"

cp ${SERVICE_NAME} ${SYSTEMD_DIR}
cp ${GPU_TARGET_NAME} ${SYSTEMD_DIR}
cp ${MIG_PARTED_NAME}.sh ${PROFILED_DIR}
cp override.conf ${OVERRIDE_DIR}
cp service.sh ${CONFIG_DIR}
Expand All @@ -62,6 +66,7 @@ cp hooks-minimal.yaml ${CONFIG_DIR}
cp config-default.yaml ${CONFIG_DIR}

chmod a+r ${SYSTEMD_DIR}/${SERVICE_NAME}
chmod a+r ${SYSTEMD_DIR}/${GPU_TARGET_NAME}
chmod a+r ${PROFILED_DIR}/${MIG_PARTED_NAME}.sh
chmod a+r ${OVERRIDE_DIR}/override.conf
chmod a+r ${CONFIG_DIR}/service.sh
Expand Down
20 changes: 20 additions & 0 deletions deployments/systemd/nvidia-gpu-reset.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This target is used to ensure that nvidia-mig-manager.service is started
# after nvidia-fabricmanager.service and nvidia-persistenced.service.
# In addition, this target allows applications like dcgm and nvsm
# to be started only after nvidia-mig-manager.service has started.
[Unit]
Description=Nvidia GPU System Target
6 changes: 3 additions & 3 deletions deployments/systemd/nvidia-mig-manager.service
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
[Unit]
Description=Configure MIG on NVIDIA GPUs
DefaultDependencies=no
After=sysinit.target local-fs.target
Before=basic.target nvidia-persistenced.service systemd-resolved.service
After=nvidia-persistenced.service nvidia-fabricmanager.service
# Must name the target unit that install.sh actually copies into the systemd
# directory (nvidia-gpu-reset.target); ordering against any other name refers
# to a unit that is never installed.
Before=nvidia-gpu-reset.target

[Service]
Type=oneshot
# The leading "-" tells systemd to treat a failing exit status as success.
ExecStart=-/bin/bash /etc/nvidia-mig-manager/service.sh

[Install]
WantedBy=multi-user.target
# Same target name as in Before= above, so enabling this unit hooks it into
# the target that is shipped alongside it.
WantedBy=nvidia-gpu-reset.target
4 changes: 4 additions & 0 deletions deployments/systemd/uninstall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
SERVICE_ROOT="nvidia-mig-manager"
SERVICE_NAME="${SERVICE_ROOT}.service"

GPU_TARGET_ROOT="nvidia-gpu-reset"
GPU_TARGET_NAME="${GPU_TARGET_ROOT}.target"

MIG_PARTED_NAME="nvidia-mig-parted"
MIG_PARTED_GO_GET_PATH="github.com/NVIDIA/mig-parted/cmd/${MIG_PARTED_NAME}"

Expand All @@ -36,4 +39,5 @@ rm -rf ${OVERRIDE_DIR}

rm ${BINARY_DIR}/${MIG_PARTED_NAME}
rm ${SYSTEMD_DIR}/${SERVICE_NAME}
rm ${SYSTEMD_DIR}/${GPU_TARGET_NAME}
rm ${PROFILED_DIR}/${MIG_PARTED_NAME}.sh

0 comments on commit ac269f9

Please sign in to comment.