Skip to content

Commit

Permalink
Merge pull request #1785 from maryamtahhan/dcgm-build
Browse files Browse the repository at this point in the history
fixes: aa66ada adding the needed libraries to the builder stage
  • Loading branch information
maryamtahhan authored Sep 18, 2024
2 parents b73c6b1 + 4121376 commit 600235f
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 2 deletions.
12 changes: 12 additions & 0 deletions build/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,20 @@
# Builder stage: compiles Kepler inside the project's prebuilt builder image.
FROM quay.io/sustainable_computing_io/kepler_builder:ubi-9-libbpf-1.3.0 AS builder
# Build-time switches for optional accelerator libraries; both default to off.
# Enable with e.g. --build-arg INSTALL_DCGM=true.
ARG INSTALL_DCGM=false
ARG INSTALL_HABANA=false
WORKDIR /workspace

COPY . .

# Optionally install the accelerator libraries selected by the build args above.
# `set -e` makes the build stop at the first failing command. Without it the
# exit status of this RUN is that of the final `if`, so a failed DCGM install
# would be masked whenever INSTALL_HABANA is not "true".
RUN set -e; \
    if [[ "$INSTALL_DCGM" == "true" ]]; then \
        dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo; \
        yum install -y datacenter-gpu-manager; \
    fi; \
    if [[ "$INSTALL_HABANA" == "true" ]]; then \
        rpm -Uvh https://vault.habana.ai/artifactory/rhel/9/9.2/habanalabs-firmware-tools-1.15.1-15.el9.x86_64.rpm --nodeps; \
        echo /usr/lib/habanalabs > /etc/ld.so.conf.d/habanalabs.conf; \
        ldconfig; \
    fi;

# Run the repository's tidy-vendor and format make targets before building.
RUN make tidy-vendor format

RUN make build
Expand Down
5 changes: 5 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -668,3 +668,8 @@ func APIServerEnabled() bool {
ensureConfigInitialized()
return instance.Kepler.EnableAPIServer
}

// DCGMHostEngineEndpoint reports the DCGM host engine endpoint stored on the
// package-level config instance, after making sure the config is initialized.
func DCGMHostEngineEndpoint() string {
	ensureConfigInitialized()
	endpoint := instance.DCGMHostEngineEndpoint
	return endpoint
}
4 changes: 2 additions & 2 deletions pkg/sensors/accelerator/device/sources/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ type GPUDcgm struct {
}

func init() {
if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, isSocket); err != nil {
if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint(), isSocket); err != nil {
klog.Errorf("Error initializing dcgm: %v", err)
return
}
Expand Down Expand Up @@ -131,7 +131,7 @@ func (d *GPUDcgm) InitLib() (err error) {
err = fmt.Errorf("could not init dcgm: %v", r)
}
}()
cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, isSocket)
cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint(), isSocket)
if err != nil {
klog.Infof("There is no DCGM daemon running in the host: %s", err)
// embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
Expand Down

0 comments on commit 600235f

Please sign in to comment.