Skip to content

Commit

Permalink
chore: fixup missing nvml library
Browse files Browse the repository at this point in the history
Signed-off-by: Maryam Tahhan <[email protected]>
  • Loading branch information
maryamtahhan committed Sep 27, 2024
1 parent c20c23c commit debd99b
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
4 changes: 2 additions & 2 deletions build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ COPY . .

RUN if [[ "$INSTALL_DCGM" == "true" ]]; then \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
yum install -y datacenter-gpu-manager; \
yum install -y datacenter-gpu-manager libnvidia-ml; \
fi; \
if [[ "$INSTALL_HABANA" == "true" ]]; then \
rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \
Expand Down Expand Up @@ -45,7 +45,7 @@ RUN set -e -x ;\
yum install -y cpuid; \
if [[ "$INSTALL_DCGM" == "true" ]]; then \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
yum install -y datacenter-gpu-manager; \
yum install -y datacenter-gpu-manager libnvidia-ml; \
fi; \
if [[ "$INSTALL_HABANA" == "true" ]]; then \
rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \
Expand Down
11 changes: 9 additions & 2 deletions pkg/sensors/accelerator/device/sources/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ type GPUDcgm struct {
devices map[int]device.GPUDevice
migDevices map[int]map[int]device.GPUDevice // map of MIG devices for each GPU instance
libInited bool
nvmlInited bool
deviceGroupName string
deviceGroupHandle dcgm.GroupHandle
fieldGroupName string
Expand Down Expand Up @@ -145,16 +146,19 @@ func (d *GPUDcgm) InitLib() (err error) {
}
klog.Info("Started DCGM in the Embedded mode ")
}

d.nvmlInited = false
d.devices = make(map[int]device.GPUDevice)
d.cleanup = cleanup
dcgm.FieldsInit()

if err := d.initNVML(); err != nil {
klog.Errorf("Could not init NVML. Error: %s", err)
d.Shutdown()
return err
}
d.nvmlInited = true
if err := d.loadDevices(); err != nil {
klog.Errorf("Could not load Devices. Error: %s", err)
d.Shutdown()
return err
}
Expand Down Expand Up @@ -281,7 +285,9 @@ func (d *GPUDcgm) SetDeviceCollectionSupported(supported bool) {
}

func (d *GPUDcgm) Shutdown() bool {
nvml.Shutdown()
if d.nvmlInited {
nvml.Shutdown()
}
dcgm.FieldsTerm()
if d.deviceGroupName != "" {
dcgm.DestroyGroup(d.deviceGroupHandle)
Expand All @@ -294,6 +300,7 @@ func (d *GPUDcgm) Shutdown() bool {
}
d.collectionSupported = false
d.libInited = false
d.nvmlInited = false
return true
}

Expand Down

0 comments on commit debd99b

Please sign in to comment.