diff --git a/build/Dockerfile b/build/Dockerfile
index 5102a1e80a..7dc1f10c25 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -15,8 +15,9 @@ RUN yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.n
 RUN if [ $(uname -i) == "x86_64" ]; then yum install -y cpuid; fi
 
 ENV NVIDIA_VISIBLE_DEVICES=all
-# add utility to support nvidia-smi
 ENV NVIDIA_DRIVER_CAPABILITIES=utility
+ENV NVIDIA_MIG_CONFIG_DEVICES=all
+ENV NVIDIA_MIG_MONITOR_DEVICES=all
 
 RUN INSTALL_PKGS=" \
     libbpf \
diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go
index fe8529113a..16f0ad83a8 100644
--- a/cmd/exporter/exporter.go
+++ b/cmd/exporter/exporter.go
@@ -203,10 +203,11 @@ func main() {
 	// the GPU operators typically takes longer time to initialize than kepler resulting in error to start the gpu driver
 	// therefore, we wait up to 1 min to allow the gpu operator initialize
 	for i := 0; i <= maxGPUInitRetry; i++ {
-		time.Sleep(6 * time.Second)
 		err = gpu.Init()
 		if err == nil {
 			break
+		} else {
+			time.Sleep(6 * time.Second)
 		}
 	}
 	if err == nil {
diff --git a/pkg/config/config.go b/pkg/config/config.go
index bcf1b69124..33a0cc9518 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -101,6 +101,9 @@ var (
 	configPath = "/etc/kepler/kepler.config"
 
+	// nvidia dcgm hostengine endpoint
+	DCGMHostEngineEndpoint = getConfig("NVIDIA_HOSTENGINE_ENDPOINT", "localhost:5555")
+
 	// dir of kernel sources for bcc
 	kernelSourceDirs = []string{}
diff --git a/pkg/sensors/accelerator/gpu/gpu.go b/pkg/sensors/accelerator/gpu/gpu.go
index 020b356209..08bb799f74 100644
--- a/pkg/sensors/accelerator/gpu/gpu.go
+++ b/pkg/sensors/accelerator/gpu/gpu.go
@@ -44,11 +44,14 @@ Then, we use gpu.go file to initialize the acceleratorImpl from power.go when gp
 func init() {
 	var errLib error
 	for i := 0; i < len(acceleratorOrder); i++ {
+		klog.Infof("Trying to initialize GPU collector using %s", acceleratorOrder[i].GetName())
 		acceleratorImpl = acceleratorOrder[i]
 		errLib = acceleratorImpl.InitLib()
 		if errLib == nil {
 			klog.Infof("Using %s to obtain gpu power", acceleratorImpl.GetName())
 			return
+		} else {
+			klog.Infof("Error initializing %s: %v", acceleratorImpl.GetName(), errLib)
 		}
 	}
 	klog.Infof("no gpu collector available: %v", errLib)
diff --git a/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go b/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go
index 49fcd54f8d..ddbfba96eb 100644
--- a/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go
+++ b/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go
@@ -51,6 +51,7 @@ var (
 
 type GPUDcgm struct {
 	collectionSupported bool
+	libInited           bool
 	devices             map[string]interface{}
 	deviceGroupName     string
 	deviceGroupHandle   dcgm.GroupHandle
@@ -70,12 +71,13 @@ func (d *GPUDcgm) InitLib() error {
 	d.devices = make(map[string]interface{})
 	d.entities = make(map[string]dcgm.GroupEntityPair)
 
-	cleanup, err := dcgm.Init(dcgm.Embedded)
+	// cleanup, err := dcgm.Init(dcgm.Embedded) // embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
+	cleanup, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint, "0")
 	if err != nil {
 		if cleanup != nil {
 			cleanup()
 		}
-		return fmt.Errorf("not able to connect to DCGM: %s", err)
+		return fmt.Errorf("not able to connect to DCGM %v: %s", config.DCGMHostEngineEndpoint, err)
 	}
 	d.cleanup = cleanup
 	dcgm.FieldsInit()
@@ -84,26 +86,37 @@ func (d *GPUDcgm) InitLib() error {
 		d.Shutdown()
 		return err
 	}
+	d.libInited = true
 	return nil
 }
 
 func (d *GPUDcgm) Init() error {
+	if !d.libInited {
+		if err := d.InitLib(); err != nil {
+			klog.Infof("failed to init lib: %v", err)
+			return err
+		}
+	}
 	if err := d.createDeviceGroup(); err != nil {
+		klog.Infof("failed to create device group: %v", err)
 		d.Shutdown()
 		return err
 	}
 	if err := d.addDevicesToGroup(); err != nil {
+		klog.Infof("failed to add devices to group: %v", err)
 		d.Shutdown()
 		return err
 	}
 	if err := d.createFieldGroup(); err != nil {
+		klog.Infof("failed to create field group: %v", err)
 		d.Shutdown()
 		return err
 	}
	if err := d.setupWatcher(); err != nil {
+		klog.Infof("failed to set up watcher: %v", err)
 		d.Shutdown()
 		return err
 	}
@@ -133,6 +146,7 @@ func (d *GPUDcgm) Shutdown() bool {
 		d.cleanup()
 	}
 	d.collectionSupported = false
+	d.libInited = false
 	return true
 }
@@ -197,8 +211,8 @@ func (d *GPUDcgm) GetProcessResourceUtilizationPerDevice(device interface{}, dev
 		return processAcceleratorMetrics, fmt.Errorf("failed to get running processes: %v", nvml.ErrorString(ret))
 	}
 	for _, p := range processInfo {
-		// klog.V(debugLevel).Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId)
-		if p.GpuInstanceId > 0 { // this is a MIG, get it entity id and reads the related fields
+		klog.V(debugLevel).Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId)
+		if p.GpuInstanceId > 0 && p.GpuInstanceId < uint32(len(gpuMigArray[deviceIndex])) { // this is a MIG; get its entity id and read the related fields
 			entityName := gpuMigArray[deviceIndex][p.GpuInstanceId].EntityName
 			multiprocessorCountRatio := gpuMigArray[deviceIndex][p.GpuInstanceId].MultiprocessorCountRatio
 			mi := d.entities[entityName]
@@ -245,7 +259,7 @@ func (d *GPUDcgm) initNVML() error {
 }
 
 func (d *GPUDcgm) createDeviceGroup() error {
-	deviceGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05")
+	deviceGroupName := "dev-grp-" + time.Now().Format("2006-01-02-15-04-05")
 	deviceGroup, err := dcgm.CreateGroup(deviceGroupName)
 	if err != nil {
 		return fmt.Errorf("failed to create group %q: %v", deviceGroupName, err)
@@ -310,7 +324,7 @@ func (d *GPUDcgm) addDevicesToGroup() error {
 }
 
 func (d *GPUDcgm) createFieldGroup() error {
-	fieldGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05")
+	fieldGroupName := "fld-grp-" + time.Now().Format("2006-01-02-15-04-05")
 	fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields)
 	if err != nil {
 		return fmt.Errorf("failed to create field group %q: %v", fieldGroupName, err)
diff --git a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go
index c8c2c7da00..59c9fc0a02 100644
--- a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go
+++ b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go
@@ -36,6 +36,7 @@ var (
 )
 
 type GPUNvml struct {
+	libInited           bool
 	collectionSupported bool
 }
@@ -57,10 +58,17 @@ func (n *GPUNvml) InitLib() (err error) {
 		err = fmt.Errorf("failed to init nvml. %s", nvmlErrorString(ret))
 		return err
 	}
+	n.libInited = true
 	return nil
 }
 
 func (n *GPUNvml) Init() (err error) {
+	if !n.libInited {
+		if err := n.InitLib(); err != nil {
+			return err
+		}
+	}
+
 	count, ret := nvml.DeviceGetCount()
 	if ret != nvml.SUCCESS {
 		nvml.Shutdown()
@@ -89,6 +97,7 @@
 
 // Shutdown stops the GPU metric collector
 func (n *GPUNvml) Shutdown() bool {
+	n.libInited = false
 	return nvml.Shutdown() == nvml.SUCCESS
 }
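Note on the exporter.go hunk: moving the sleep after a failed gpu.Init() attempt means a GPU stack that is already up initializes with no added delay, while a slow GPU operator still gets the full retry window. The same retry shape in isolation, as a sketch (retryInit, maxRetry, and interval are illustrative names, not kepler identifiers):

package main

import (
	"errors"
	"fmt"
	"time"
)

// retryInit calls init up to maxRetry+1 times, sleeping only after a
// failed attempt, so a ready dependency costs zero startup latency.
func retryInit(init func() error, maxRetry int, interval time.Duration) error {
	var err error
	for i := 0; i <= maxRetry; i++ {
		if err = init(); err == nil {
			return nil
		}
		time.Sleep(interval)
	}
	return err
}

func main() {
	attempts := 0
	err := retryInit(func() error {
		attempts++
		if attempts < 3 {
			return errors.New("driver not ready")
		}
		return nil
	}, 10, 10*time.Millisecond)
	fmt.Println(err, attempts) // <nil> 3
}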
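Note on the gpu_dcgm.go hunk: switching dcgm.Init from Embedded to Standalone means kepler now expects an external nv-hostengine daemon reachable at NVIDIA_HOSTENGINE_ENDPOINT (default localhost:5555). A minimal sketch of that connection path, assuming the github.com/NVIDIA/go-dcgm bindings used above and a host engine already listening on localhost:5555; the device listing at the end is only an illustrative sanity check, not kepler code:

package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
)

func main() {
	// Standalone mode connects to an external nv-hostengine. The third
	// argument is the address type: "0" for a TCP host:port, "1" for a
	// unix domain socket path.
	cleanup, err := dcgm.Init(dcgm.Standalone, "localhost:5555", "0")
	if err != nil {
		log.Fatalf("not able to connect to DCGM: %v", err)
	}
	defer cleanup()

	// Sanity check that the host engine actually sees the GPUs.
	devices, err := dcgm.GetSupportedDevices()
	if err != nil {
		log.Fatalf("failed to list devices: %v", err)
	}
	fmt.Printf("DCGM reports %d supported device(s)\n", len(devices))
}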