From c20c23ce1b45eaf449dcdc2b5df39140de495b65 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Wed, 18 Sep 2024 10:40:07 -0400 Subject: [PATCH 1/2] fix: error initializing dcgm Signed-off-by: Maryam Tahhan --- pkg/config/config.go | 2 +- pkg/config/types.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 6e0cf39f27..556797352b 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -118,7 +118,7 @@ func newConfig() *Config { Metrics: getMetricsConfig(), Redfish: getRedfishConfig(), Libvirt: getLibvirtConfig(), - DCGMHostEngineEndpoint: getConfig("DCGM_HOST_ENGINE_ENDPOINT", ""), + DCGMHostEngineEndpoint: getConfig("NVIDIA_HOSTENGINE_ENDPOINT", defaultDCGMHostEngineEndpoint), KernelVersion: float32(0), } } diff --git a/pkg/config/types.go b/pkg/config/types.go index 961ef1ddc0..e19ca5c7d5 100644 --- a/pkg/config/types.go +++ b/pkg/config/types.go @@ -103,4 +103,5 @@ const ( defaultProcessPlatformPowerKey = "PROCESS_TOTAL" defaultProcessComponentsPowerKey = "PROCESS_COMPONENTS" DefaultMachineSpecFilePath = "/etc/kepler/models/machine/spec.json" + defaultDCGMHostEngineEndpoint = "localhost:5555" ) From debd99b05329d9cdf16489175506e8d474fc9af2 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Fri, 27 Sep 2024 12:02:54 -0400 Subject: [PATCH 2/2] chore: fixup missing nvml library Signed-off-by: Maryam Tahhan --- build/Dockerfile | 4 ++-- pkg/sensors/accelerator/device/sources/dcgm.go | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index fc1e074505..148a035cdd 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -7,7 +7,7 @@ COPY . . RUN if [[ "$INSTALL_DCGM" == "true" ]]; then \ dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \ - yum install -y datacenter-gpu-manager; \ + yum install -y datacenter-gpu-manager libnvidia-ml; \ fi; \ if [[ "$INSTALL_HABANA" == "true" ]]; then \ rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \ @@ -45,7 +45,7 @@ RUN set -e -x ;\ yum install -y cpuid; \ if [[ "$INSTALL_DCGM" == "true" ]]; then \ dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \ - yum install -y datacenter-gpu-manager; \ + yum install -y datacenter-gpu-manager libnvidia-ml; \ fi; \ if [[ "$INSTALL_HABANA" == "true" ]]; then \ rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \ diff --git a/pkg/sensors/accelerator/device/sources/dcgm.go b/pkg/sensors/accelerator/device/sources/dcgm.go index 61b3cdff34..d3e9c5fb4e 100644 --- a/pkg/sensors/accelerator/device/sources/dcgm.go +++ b/pkg/sensors/accelerator/device/sources/dcgm.go @@ -54,6 +54,7 @@ type GPUDcgm struct { devices map[int]device.GPUDevice migDevices map[int]map[int]device.GPUDevice // list of mig devices for each GPU instance libInited bool + nvmlInited bool deviceGroupName string deviceGroupHandle dcgm.GroupHandle fieldGroupName string @@ -145,16 +146,19 @@ func (d *GPUDcgm) InitLib() (err error) { } klog.Info("Started DCGM in the Embedded mode ") } - + d.nvmlInited = false d.devices = make(map[int]device.GPUDevice) d.cleanup = cleanup dcgm.FieldsInit() if err := d.initNVML(); err != nil { + klog.Errorf("Could not init NVML. Error: %s", err) d.Shutdown() return err } + d.nvmlInited = true if err := d.loadDevices(); err != nil { + klog.Errorf("Could not load Devices. Error: %s", err) d.Shutdown() return err } @@ -281,7 +285,9 @@ func (d *GPUDcgm) SetDeviceCollectionSupported(supported bool) { } func (d *GPUDcgm) Shutdown() bool { - nvml.Shutdown() + if d.nvmlInited { + nvml.Shutdown() + } dcgm.FieldsTerm() if d.deviceGroupName != "" { dcgm.DestroyGroup(d.deviceGroupHandle) @@ -294,6 +300,7 @@ func (d *GPUDcgm) Shutdown() bool { } d.collectionSupported = false d.libInited = false + d.nvmlInited = false return true }