From debd99b05329d9cdf16489175506e8d474fc9af2 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Fri, 27 Sep 2024 12:02:54 -0400 Subject: [PATCH] chore: fixup missing nvml library Signed-off-by: Maryam Tahhan --- build/Dockerfile | 4 ++-- pkg/sensors/accelerator/device/sources/dcgm.go | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index fc1e074505..148a035cdd 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -7,7 +7,7 @@ COPY . . RUN if [[ "$INSTALL_DCGM" == "true" ]]; then \ dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \ - yum install -y datacenter-gpu-manager; \ + yum install -y datacenter-gpu-manager libnvidia-ml; \ fi; \ if [[ "$INSTALL_HABANA" == "true" ]]; then \ rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \ @@ -45,7 +45,7 @@ RUN set -e -x ;\ yum install -y cpuid; \ if [[ "$INSTALL_DCGM" == "true" ]]; then \ dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \ - yum install -y datacenter-gpu-manager; \ + yum install -y datacenter-gpu-manager libnvidia-ml; \ fi; \ if [[ "$INSTALL_HABANA" == "true" ]]; then \ rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \ diff --git a/pkg/sensors/accelerator/device/sources/dcgm.go b/pkg/sensors/accelerator/device/sources/dcgm.go index 61b3cdff34..d3e9c5fb4e 100644 --- a/pkg/sensors/accelerator/device/sources/dcgm.go +++ b/pkg/sensors/accelerator/device/sources/dcgm.go @@ -54,6 +54,7 @@ type GPUDcgm struct { devices map[int]device.GPUDevice migDevices map[int]map[int]device.GPUDevice // list of mig devices for each GPU instance libInited bool + nvmlInited bool deviceGroupName string deviceGroupHandle dcgm.GroupHandle fieldGroupName string @@ -145,16 +146,19 @@ func (d *GPUDcgm) InitLib() (err error) { } klog.Info("Started DCGM in the Embedded mode ") } - + d.nvmlInited = false d.devices = make(map[int]device.GPUDevice) d.cleanup = cleanup dcgm.FieldsInit() if err := d.initNVML(); err != nil { + klog.Errorf("Could not init NVML. Error: %s", err) d.Shutdown() return err } + d.nvmlInited = true if err := d.loadDevices(); err != nil { + klog.Errorf("Could not load Devices. Error: %s", err) d.Shutdown() return err } @@ -281,7 +285,9 @@ func (d *GPUDcgm) SetDeviceCollectionSupported(supported bool) { } func (d *GPUDcgm) Shutdown() bool { - nvml.Shutdown() + if d.nvmlInited { + nvml.Shutdown() + } dcgm.FieldsTerm() if d.deviceGroupName != "" { dcgm.DestroyGroup(d.deviceGroupHandle) @@ -294,6 +300,7 @@ func (d *GPUDcgm) Shutdown() bool { } d.collectionSupported = false d.libInited = false + d.nvmlInited = false return true }