gpu: switch to dcgm standalone mode due to containerization limitations
Signed-off-by: Huamin Chen <[email protected]>
rootfs committed Feb 21, 2024
1 parent f89960d commit d42bbdd
Showing 6 changed files with 39 additions and 8 deletions.
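
At its core, the commit swaps DCGM's embedded mode, where the host engine runs inside the exporter process, for standalone mode, where the exporter connects to an external nv-hostengine over the network. The two call forms, both visible in gpu_dcgm.go below, differ only in the Init arguments (the trailing "0" appears to tell the go-dcgm bindings that the endpoint is a host:port address rather than a unix socket):

// before: host engine embedded in the exporter process
cleanup, err := dcgm.Init(dcgm.Embedded)

// after: connect to an external nv-hostengine (default endpoint localhost:5555)
cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, "0")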
3 changes: 2 additions & 1 deletion build/Dockerfile
@@ -15,8 +15,9 @@ RUN yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.n
RUN if [ $(uname -i) == "x86_64" ]; then yum install -y cpuid; fi

ENV NVIDIA_VISIBLE_DEVICES=all
# add utility to support nvidia-smi
ENV NVIDIA_DRIVER_CAPABILITIES=utility
ENV NVIDIA_MIG_CONFIG_DEVICES=all
ENV NVIDIA_MIG_MONITOR_DEVICES=all

RUN INSTALL_PKGS=" \
libbpf \
3 changes: 2 additions & 1 deletion cmd/exporter/exporter.go
@@ -203,10 +203,11 @@ func main() {
// the GPU operator typically takes longer to initialize than Kepler, which causes the GPU driver start-up to fail
// therefore, we wait up to 1 min to allow the GPU operator to initialize
for i := 0; i <= maxGPUInitRetry; i++ {
time.Sleep(6 * time.Second)
err = gpu.Init()
if err == nil {
break
} else {
time.Sleep(6 * time.Second)
}
}
if err == nil {
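
For reference, the one-minute budget mentioned in the comment implies roughly ten iterations of the 6-second sleep; maxGPUInitRetry itself is declared outside this hunk, so the value below is only an assumption consistent with that comment:

// assumed values, not shown in this diff: ~10 retries x 6 s ≈ 1 min total wait
const maxGPUInitRetry = 10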
3 changes: 3 additions & 0 deletions pkg/config/config.go
@@ -101,6 +101,9 @@ var (

configPath = "/etc/kepler/kepler.config"

// nvidia dcgm hostengine endpoint
DCGMHostEngineEndpoint = getConfig("NVIDIA_HOSTENGINE_ENDPOINT", "localhost:5555")

// dir of kernel sources for bcc
kernelSourceDirs = []string{}

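
The new DCGMHostEngineEndpoint setting is resolved through the existing getConfig helper, whose definition is outside this diff. A minimal sketch of what such an env-backed lookup typically looks like (hypothetical; the real helper in pkg/config may also consult files under /etc/kepler/kepler.config):

import "os"

// hypothetical sketch; the actual getConfig in pkg/config may differ
func getConfig(key, defaultValue string) string {
	if v, ok := os.LookupEnv(key); ok && v != "" {
		return v
	}
	return defaultValue
}

With that in place, setting NVIDIA_HOSTENGINE_ENDPOINT in the exporter's environment overrides the localhost:5555 default.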
3 changes: 3 additions & 0 deletions pkg/sensors/accelerator/gpu/gpu.go
@@ -44,11 +44,14 @@ Then, we use gpu.go file to initialize the acceleratorImpl from power.go when gp
func init() {
var errLib error
for i := 0; i < len(acceleratorOrder); i++ {
klog.Infof("Trying to initialize GPU collector using %s", acceleratorOrder[i].GetName())
acceleratorImpl = acceleratorOrder[i]
errLib = acceleratorImpl.InitLib()
if errLib == nil {
klog.Infof("Using %s to obtain gpu power", acceleratorImpl.GetName())
return
} else {
klog.Infof("Error initializing %s: %v", acceleratorImpl.GetName(), errLib)
}
}
klog.Infof("no gpu collector available: %v", errLib)
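
The loop above drives each backend through the same small contract declared in power.go. That declaration is not part of this diff; inferred from the calls used here and in the two sources below, it looks roughly like this (name and method set are an inference, not the exact interface):

type acceleratorInterface interface {
	GetName() string // backend name used in the log lines above
	InitLib() error  // load or connect the vendor library (DCGM, NVML)
	Init() error     // create device/field groups, start watchers
	Shutdown() bool  // release library resources, reset state
}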
26 changes: 20 additions & 6 deletions pkg/sensors/accelerator/gpu/source/gpu_dcgm.go
@@ -51,6 +51,7 @@ var (

type GPUDcgm struct {
collectionSupported bool
libInited bool
devices map[string]interface{}
deviceGroupName string
deviceGroupHandle dcgm.GroupHandle
@@ -70,12 +71,13 @@ func (d *GPUDcgm) InitLib() error {
d.devices = make(map[string]interface{})
d.entities = make(map[string]dcgm.GroupEntityPair)

cleanup, err := dcgm.Init(dcgm.Embedded)
// cleanup, err := dcgm.Init(dcgm.Embedded) // embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, "0")
if err != nil {
if cleanup != nil {
cleanup()
}
return fmt.Errorf("not able to connect to DCGM: %s", err)
return fmt.Errorf("not able to connect to DCGM %v: %s", config.DCGMHostEngineEndpoint, err)
}
d.cleanup = cleanup
dcgm.FieldsInit()
@@ -84,26 +86,37 @@
d.Shutdown()
return err
}
d.libInited = true
return nil
}

func (d *GPUDcgm) Init() error {
if !d.libInited {
if err := d.InitLib(); err != nil {
klog.Infof("failed to init lib: %v", err)
return err
}
}
if err := d.createDeviceGroup(); err != nil {
klog.Infof("failed to create device group: %v", err)
d.Shutdown()
return err
}

if err := d.addDevicesToGroup(); err != nil {
klog.Infof("failed to add devices to group: %v", err)
d.Shutdown()
return err
}

if err := d.createFieldGroup(); err != nil {
klog.Infof("failed to create field group: %v", err)
d.Shutdown()
return err
}

if err := d.setupWatcher(); err != nil {
klog.Infof("failed to set up watcher: %v", err)
d.Shutdown()
return err
}
@@ -133,6 +146,7 @@ func (d *GPUDcgm) Shutdown() bool {
d.cleanup()
}
d.collectionSupported = false
d.libInited = false
return true
}

@@ -197,8 +211,8 @@ func (d *GPUDcgm) GetProcessResourceUtilizationPerDevice(device interface{}, dev
return processAcceleratorMetrics, fmt.Errorf("failed to get running processes: %v", nvml.ErrorString(ret))
}
for _, p := range processInfo {
// klog.V(debugLevel).Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId)
if p.GpuInstanceId > 0 { // this is a MIG instance; get its entity id and read the related fields
klog.V(debugLevel).Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId)
if p.GpuInstanceId > 0 && p.GpuInstanceId < uint32(len(gpuMigArray[deviceIndex])) { // this is a MIG instance; get its entity id and read the related fields
entityName := gpuMigArray[deviceIndex][p.GpuInstanceId].EntityName
multiprocessorCountRatio := gpuMigArray[deviceIndex][p.GpuInstanceId].MultiprocessorCountRatio
mi := d.entities[entityName]
@@ -245,7 +259,7 @@ func (d *GPUDcgm) initNVML() error {
}

func (d *GPUDcgm) createDeviceGroup() error {
deviceGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05")
deviceGroupName := "dev-grp-" + time.Now().Format("2006-01-02-15-04-05")
deviceGroup, err := dcgm.CreateGroup(deviceGroupName)
if err != nil {
return fmt.Errorf("failed to create group %q: %v", deviceGroupName, err)
@@ -310,7 +324,7 @@ func (d *GPUDcgm) addDevicesToGroup() error {
}

func (d *GPUDcgm) createFieldGroup() error {
fieldGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05")
fieldGroupName := "fld-grp-" + time.Now().Format("2006-01-02-15-04-05")
fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields)
if err != nil {
return fmt.Errorf("failed to create field group %q: %v", fieldGroupName, err)
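
Because standalone mode depends on an nv-hostengine being reachable at config.DCGMHostEngineEndpoint, a quick connectivity check can save debugging time. Below is a minimal, hypothetical probe that uses the same go-dcgm call as InitLib above; the endpoint comes from the command line and defaults to localhost:5555:

package main

import (
	"fmt"
	"os"

	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
)

func main() {
	endpoint := "localhost:5555"
	if len(os.Args) > 1 {
		endpoint = os.Args[1]
	}
	// "0": the endpoint is a host:port address, not a unix socket
	cleanup, err := dcgm.Init(dcgm.Standalone, endpoint, "0")
	if err != nil {
		fmt.Fprintf(os.Stderr, "cannot reach DCGM host engine at %s: %v\n", endpoint, err)
		os.Exit(1)
	}
	defer cleanup()
	fmt.Printf("connected to DCGM host engine at %s\n", endpoint)
}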
9 changes: 9 additions & 0 deletions pkg/sensors/accelerator/gpu/source/gpu_nvml.go
@@ -36,6 +36,7 @@ var (
)

type GPUNvml struct {
libInited bool
collectionSupported bool
}

@@ -57,10 +58,17 @@ func (n *GPUNvml) InitLib() (err error) {
err = fmt.Errorf("failed to init nvml. %s", nvmlErrorString(ret))
return err
}
n.libInited = true
return nil
}

func (n *GPUNvml) Init() (err error) {
if !n.libInited {
if err := n.InitLib(); err != nil {
return err
}
}

count, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS {
nvml.Shutdown()
@@ -89,6 +97,7 @@ func (n *GPUNvml) Init() (err error) {

// Shutdown stops the GPU metric collector
func (n *GPUNvml) Shutdown() bool {
n.libInited = false
return nvml.Shutdown() == nvml.SUCCESS
}

