From 41213764640ad3852dee982ecade34bac6661e53 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Sep 2024 11:21:39 -0400 Subject: [PATCH] fix(aa66ada): re-adding libraries to the builder The builder stage for the Kepler image needs to have the dcgm/habana libraries installed for the build tags to work. Signed-off-by: Maryam Tahhan --- build/Dockerfile | 12 ++++++++++++ pkg/config/config.go | 5 +++++ pkg/sensors/accelerator/device/sources/dcgm.go | 4 ++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index cc6e1e97d7..fc1e074505 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -1,8 +1,20 @@ FROM quay.io/sustainable_computing_io/kepler_builder:ubi-9-libbpf-1.3.0 AS builder +ARG INSTALL_DCGM=false +ARG INSTALL_HABANA=false WORKDIR /workspace COPY . . +RUN if [[ "$INSTALL_DCGM" == "true" ]]; then \ + dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \ + yum install -y datacenter-gpu-manager; \ + fi; \ + if [[ "$INSTALL_HABANA" == "true" ]]; then \ + rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \ + echo /usr/lib/habanalabs > /etc/ld.so.conf.d/habanalabs.conf; \ + ldconfig; \ + fi; + RUN make tidy-vendor format RUN make build diff --git a/pkg/config/config.go b/pkg/config/config.go index d02a7e4296..6e0cf39f27 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -668,3 +668,8 @@ func APIServerEnabled() bool { ensureConfigInitialized() return instance.Kepler.EnableAPIServer } + +func DCGMHostEngineEndpoint() string { + ensureConfigInitialized() + return instance.DCGMHostEngineEndpoint +} diff --git a/pkg/sensors/accelerator/device/sources/dcgm.go b/pkg/sensors/accelerator/device/sources/dcgm.go index 9fbba5fdcb..61b3cdff34 100644 --- a/pkg/sensors/accelerator/device/sources/dcgm.go +++ b/pkg/sensors/accelerator/device/sources/dcgm.go @@ -62,7 +62,7 @@ type 
GPUDcgm struct { } func init() { - if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, isSocket); err != nil { + if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint(), isSocket); err != nil { klog.Errorf("Error initializing dcgm: %v", err) return } @@ -131,7 +131,7 @@ func (d *GPUDcgm) InitLib() (err error) { err = fmt.Errorf("could not init dcgm: %v", r) } }() - cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, isSocket) + cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint(), isSocket) if err != nil { klog.Infof("There is no DCGM daemon running in the host: %s", err) // embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995