Skip to content

Commit

Permalink
Merge pull request #1788 from maryamtahhan/hotfix-dcgm-init
Browse files: browse the repository at this point in the history
fix: error initializing dcgm
  • Loading branch information
maryamtahhan authored Sep 30, 2024
2 parents e9e4c38 + debd99b commit 8d4b3ae
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 5 deletions.
4 changes: 2 additions & 2 deletions build/Dockerfile
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,7 +7,7 @@ COPY . .

RUN if [[ "$INSTALL_DCGM" == "true" ]]; then \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
yum install -y datacenter-gpu-manager; \
yum install -y datacenter-gpu-manager libnvidia-ml; \
fi; \
if [[ "$INSTALL_HABANA" == "true" ]]; then \
rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \
Expand Down Expand Up @@ -45,7 +45,7 @@ RUN set -e -x ;\
yum install -y cpuid; \
if [[ "$INSTALL_DCGM" == "true" ]]; then \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
yum install -y datacenter-gpu-manager; \
yum install -y datacenter-gpu-manager libnvidia-ml; \
fi; \
if [[ "$INSTALL_HABANA" == "true" ]]; then \
rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \
Expand Down
2 changes: 1 addition & 1 deletion pkg/config/config.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -118,7 +118,7 @@ func newConfig() *Config {
Metrics: getMetricsConfig(),
Redfish: getRedfishConfig(),
Libvirt: getLibvirtConfig(),
DCGMHostEngineEndpoint: getConfig("DCGM_HOST_ENGINE_ENDPOINT", ""),
DCGMHostEngineEndpoint: getConfig("NVIDIA_HOSTENGINE_ENDPOINT", defaultDCGMHostEngineEndpoint),
KernelVersion: float32(0),
}
}
Expand Down
1 change: 1 addition & 0 deletions pkg/config/types.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -103,4 +103,5 @@ const (
defaultProcessPlatformPowerKey = "PROCESS_TOTAL"
defaultProcessComponentsPowerKey = "PROCESS_COMPONENTS"
DefaultMachineSpecFilePath = "/etc/kepler/models/machine/spec.json"
defaultDCGMHostEngineEndpoint = "localhost:5555"
)
11 changes: 9 additions & 2 deletions pkg/sensors/accelerator/device/sources/dcgm.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -54,6 +54,7 @@ type GPUDcgm struct {
devices map[int]device.GPUDevice
migDevices map[int]map[int]device.GPUDevice // list of mig devices for each GPU instance
libInited bool
nvmlInited bool
deviceGroupName string
deviceGroupHandle dcgm.GroupHandle
fieldGroupName string
Expand Down Expand Up @@ -145,16 +146,19 @@ func (d *GPUDcgm) InitLib() (err error) {
}
klog.Info("Started DCGM in the Embedded mode ")
}

d.nvmlInited = false
d.devices = make(map[int]device.GPUDevice)
d.cleanup = cleanup
dcgm.FieldsInit()

if err := d.initNVML(); err != nil {
klog.Errorf("Could not init NVML. Error: %s", err)
d.Shutdown()
return err
}
d.nvmlInited = true
if err := d.loadDevices(); err != nil {
klog.Errorf("Could not load Devices. Error: %s", err)
d.Shutdown()
return err
}
Expand Down Expand Up @@ -281,7 +285,9 @@ func (d *GPUDcgm) SetDeviceCollectionSupported(supported bool) {
}

func (d *GPUDcgm) Shutdown() bool {
nvml.Shutdown()
if d.nvmlInited {
nvml.Shutdown()
}
dcgm.FieldsTerm()
if d.deviceGroupName != "" {
dcgm.DestroyGroup(d.deviceGroupHandle)
Expand All @@ -294,6 +300,7 @@ func (d *GPUDcgm) Shutdown() bool {
}
d.collectionSupported = false
d.libInited = false
d.nvmlInited = false
return true
}

Expand Down

0 comments on commit 8d4b3ae

Please sign in to comment.