Merge pull request #1788 from maryamtahhan/hotfix-dcgm-init

fix: error initializing dcgm
sustainable-computing-io · Sep 30, 2024 · commit 8d4b3ae
2 parents: e9e4c38 + debd99b

Showing 4 changed files with 13 additions and 5 deletions.
diff --git a/build/Dockerfile b/build/Dockerfile
@@ -7,7 +7,7 @@ COPY . .
 
 RUN if [[ "$INSTALL_DCGM" == "true" ]]; then \
 		dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
-		yum install -y datacenter-gpu-manager; \
+		yum install -y datacenter-gpu-manager libnvidia-ml; \
 	fi; \
 	if [[ "$INSTALL_HABANA" == "true" ]]; then \
 		rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \
@@ -45,7 +45,7 @@ RUN set -e -x ;\
 			yum install -y cpuid; \
 			if [[ "$INSTALL_DCGM" == "true" ]]; then \
 				dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
-				yum install -y datacenter-gpu-manager; \
+				yum install -y datacenter-gpu-manager libnvidia-ml; \
 			fi; \
 			if [[ "$INSTALL_HABANA" == "true" ]]; then \
 				rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \

diff --git a/pkg/config/config.go b/pkg/config/config.go
@@ -118,7 +118,7 @@ func newConfig() *Config {
 		Metrics:                getMetricsConfig(),
 		Redfish:                getRedfishConfig(),
 		Libvirt:                getLibvirtConfig(),
-		DCGMHostEngineEndpoint: getConfig("DCGM_HOST_ENGINE_ENDPOINT", ""),
+		DCGMHostEngineEndpoint: getConfig("NVIDIA_HOSTENGINE_ENDPOINT", defaultDCGMHostEngineEndpoint),
 		KernelVersion:          float32(0),
 	}
 }

diff --git a/pkg/config/types.go b/pkg/config/types.go
@@ -103,4 +103,5 @@ const (
 	defaultProcessPlatformPowerKey     = "PROCESS_TOTAL"
 	defaultProcessComponentsPowerKey   = "PROCESS_COMPONENTS"
 	DefaultMachineSpecFilePath         = "/etc/kepler/models/machine/spec.json"
+	defaultDCGMHostEngineEndpoint      = "localhost:5555"
 )
diff --git a/pkg/sensors/accelerator/device/sources/dcgm.go b/pkg/sensors/accelerator/device/sources/dcgm.go
@@ -54,6 +54,7 @@ type GPUDcgm struct {
 	devices             map[int]device.GPUDevice
 	migDevices          map[int]map[int]device.GPUDevice // list of mig devices for each GPU instance
 	libInited           bool
+	nvmlInited          bool
 	deviceGroupName     string
 	deviceGroupHandle   dcgm.GroupHandle
 	fieldGroupName      string
@@ -145,16 +146,19 @@ func (d *GPUDcgm) InitLib() (err error) {
 		}
 		klog.Info("Started DCGM in the Embedded mode ")
 	}
-
+	d.nvmlInited = false
 	d.devices = make(map[int]device.GPUDevice)
 	d.cleanup = cleanup
 	dcgm.FieldsInit()
 
 	if err := d.initNVML(); err != nil {
+		klog.Errorf("Could not init NVML. Error: %s", err)
 		d.Shutdown()
 		return err
 	}
+	d.nvmlInited = true
 	if err := d.loadDevices(); err != nil {
+		klog.Errorf("Could not load Devices. Error: %s", err)
 		d.Shutdown()
 		return err
 	}
@@ -281,7 +285,9 @@ func (d *GPUDcgm) SetDeviceCollectionSupported(supported bool) {
 }
 
 func (d *GPUDcgm) Shutdown() bool {
-	nvml.Shutdown()
+	if d.nvmlInited {
+		nvml.Shutdown()
+	}
 	dcgm.FieldsTerm()
 	if d.deviceGroupName != "" {
 		dcgm.DestroyGroup(d.deviceGroupHandle)
@@ -294,6 +300,7 @@ func (d *GPUDcgm) Shutdown() bool {
 	}
 	d.collectionSupported = false
 	d.libInited = false
+	d.nvmlInited = false
 	return true
 }