From 41213764640ad3852dee982ecade34bac6661e53 Mon Sep 17 00:00:00 2001
From: Maryam Tahhan <mtahhan@redhat.com>
Date: Mon, 16 Sep 2024 11:21:39 -0400
Subject: [PATCH] fix(aa66ada): re-add libraries to the builder

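The builder stage of the Kepler image needs the DCGM and Habana
libraries installed so that builds using the corresponding build
tags succeed. They are installed only when the INSTALL_DCGM or
INSTALL_HABANA build arguments are set to "true".

Also expose the DCGM host engine endpoint through a
config.DCGMHostEngineEndpoint() accessor and update dcgm.go to call it.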

Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
---
 build/Dockerfile                               | 12 ++++++++++++
 pkg/config/config.go                           |  5 +++++
 pkg/sensors/accelerator/device/sources/dcgm.go |  4 ++--
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/build/Dockerfile b/build/Dockerfile
index cc6e1e97d7..fc1e074505 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -1,8 +1,20 @@
 FROM quay.io/sustainable_computing_io/kepler_builder:ubi-9-libbpf-1.3.0 AS builder
+ARG INSTALL_DCGM=false
+ARG INSTALL_HABANA=false
 WORKDIR /workspace
 
 COPY . .
 
+# Optional accelerator libraries for the build tags; set -e aborts the build if any install step fails.
+RUN set -e; if [[ "$INSTALL_DCGM" == "true" ]]; then \
+		dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
+		dnf install -y datacenter-gpu-manager; \
+	fi; \
+	if [[ "$INSTALL_HABANA" == "true" ]]; then \
+		rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \
+		echo /usr/lib/habanalabs > /etc/ld.so.conf.d/habanalabs.conf; ldconfig; \
+	fi
+
 RUN make tidy-vendor format
 
 RUN make build
diff --git a/pkg/config/config.go b/pkg/config/config.go
index d02a7e4296..6e0cf39f27 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -668,3 +668,8 @@ func APIServerEnabled() bool {
 	ensureConfigInitialized()
 	return instance.Kepler.EnableAPIServer
 }
+
+func DCGMHostEngineEndpoint() string {
+	ensureConfigInitialized()
+	return instance.DCGMHostEngineEndpoint
+}
diff --git a/pkg/sensors/accelerator/device/sources/dcgm.go b/pkg/sensors/accelerator/device/sources/dcgm.go
index 9fbba5fdcb..61b3cdff34 100644
--- a/pkg/sensors/accelerator/device/sources/dcgm.go
+++ b/pkg/sensors/accelerator/device/sources/dcgm.go
@@ -62,7 +62,7 @@ type GPUDcgm struct {
 }
 
 func init() {
-	if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, isSocket); err != nil {
+	if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint(), isSocket); err != nil {
 		klog.Errorf("Error initializing dcgm: %v", err)
 		return
 	}
@@ -131,7 +131,7 @@ func (d *GPUDcgm) InitLib() (err error) {
 			err = fmt.Errorf("could not init dcgm: %v", r)
 		}
 	}()
-	cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, isSocket)
+	cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint(), isSocket)
 	if err != nil {
 		klog.Infof("There is no DCGM daemon running in the host: %s", err)
 		// embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995