diff --git a/pkg/config/config.go b/pkg/config/config.go
index c42904ac22..0ba69a24fa 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -287,6 +287,9 @@ func logBoolConfigs() {
 		klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", instance.Kepler.ExposeIdlePowerMetrics)
 		klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", instance.Kepler.BPFSampleRate)
 		klog.V(5).Infof("EXCLUDE_SWAPPER_PROCESS: %t", instance.Kepler.ExcludeSwapperProcess)
+		if instance.Kepler.EnabledGPU {
+			klog.V(5).Infof("DCGMHostEngineEndpoint %s", instance.DCGMHostEngineEndpoint)
+		}
 	}
 }
 
diff --git a/pkg/sensors/accelerator/accelerator.go b/pkg/sensors/accelerator/accelerator.go
index db2f4a618c..467d32ea56 100644
--- a/pkg/sensors/accelerator/accelerator.go
+++ b/pkg/sensors/accelerator/accelerator.go
@@ -15,6 +15,7 @@ package accelerator
 
 //nolint:gci // The supported device imports are kept separate.
 import (
+	"encoding/json"
 	"slices"
 	"sync"
 	"time"
@@ -130,7 +131,16 @@ func New(atype string, sleep bool) (Accelerator, error) {
 
 	// Init the available devices.
 
-	devs := devices.GetRegistry().GetAllDeviceTypes()
+	r := devices.GetRegistry()
+	// Encode the registry only when the V(5) line will actually be emitted.
+	if klog.V(5).Enabled() {
+		if j, err := json.Marshal(r.GetAllDevices()); err != nil {
+			klog.V(5).Infof("Accelerator Registry AllDevices: could not be encoded: %v", err)
+		} else {
+			klog.V(5).Infof("Accelerator Registry AllDevices: %s", string(j))
+		}
+	}
+	devs := r.GetAllDeviceTypes()
 	numDevs := len(devs)
 	if numDevs == 0 || !slices.Contains(devs, atype) {
 		return nil, errors.New("no devices found")
diff --git a/pkg/sensors/accelerator/devices/dcgm.go b/pkg/sensors/accelerator/devices/dcgm.go
index 02abf238e7..61af0a415a 100644
--- a/pkg/sensors/accelerator/devices/dcgm.go
+++ b/pkg/sensors/accelerator/devices/dcgm.go
@@ -76,21 +76,22 @@ func dcgmCheck(r *Registry) {
 }
 
 func dcgmDeviceStartup() Device {
-	a := dcgmAccImpl
+	klog.V(3).Info("Attempting to startup DCGM")
+	d := dcgmAccImpl
 
-	if err := a.InitLib(); err != nil {
+	if err := d.InitLib(); err != nil {
 		klog.Errorf("Error initializing %s: %v", dcgmType.String(), err)
 		return nil
 	}
 
-	if err := a.Init(); err != nil {
+	if err := d.Init(); err != nil {
 		klog.Errorf("failed to StartupDevice: %v", err)
 		return nil
 	}
 
 	klog.Infof("Using %s to obtain gpu power", dcgmType.String())
 
-	return &a
+	return &d
 }
 
 func (d *gpuDcgm) Init() error {
@@ -138,6 +139,7 @@ func (d *gpuDcgm) InitLib() (err error) {
 	if err != nil {
 		klog.Infof("There is no DCGM daemon running in the host: %s", err)
 		// embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
+		klog.Info("Attempting to initialize dcgm in Embedded mode.")
 		cleanup, err = dcgm.Init(dcgm.Embedded)
 		if err != nil {
 			klog.Errorf("Could not start DCGM. Error: %s", err)
@@ -147,6 +149,8 @@ func (d *gpuDcgm) InitLib() (err error) {
 			return fmt.Errorf("not able to connect to DCGM: %s", err)
 		}
 		klog.Info("Started DCGM in the Embedded mode ")
+	} else {
+		klog.Info("Started DCGM in the Standalone mode ")
 	}
 	d.nvmlInited = false
 	d.devs = make(map[int]GPUDevice)
@@ -172,6 +176,7 @@ func (d *gpuDcgm) InitLib() (err error) {
 }
 
 func (d *gpuDcgm) loadDevices() error {
+	klog.V(5).Info("Attempting to load dcgm devices.")
 	d.devs = map[int]GPUDevice{}
 	count, err := nvml.DeviceGetCount()
 	if err != nvml.SUCCESS {
diff --git a/pkg/sensors/accelerator/devices/device.go b/pkg/sensors/accelerator/devices/device.go
index e1e0057747..c4c269decc 100644
--- a/pkg/sensors/accelerator/devices/device.go
+++ b/pkg/sensors/accelerator/devices/device.go
@@ -121,8 +121,13 @@ func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStar
 		return
 	}
 	klog.V(5).Infof("Adding the device to the registry [%s][%s]", a, d.String())
-	r.Registry[a] = map[DeviceType]deviceStartupFunc{
-		d: deviceStartup,
+	m, ok := r.Registry[a]
+	if !ok {
+		r.Registry[a] = map[DeviceType]deviceStartupFunc{
+			d: deviceStartup,
+		}
+	} else {
+		m[d] = deviceStartup
 	}
 }
 
@@ -143,6 +148,20 @@ func (r *Registry) GetAllDeviceTypes() []string {
 	return devices
 }
 
+// GetAllDevices returns, for every key of the registry, the set of device type
+// names (DeviceType.String()) registered under it; the inner values are empty structs.
+func (r *Registry) GetAllDevices() map[string]map[string]interface{} {
+	all := map[string]map[string]interface{}{}
+	for t, m := range r.Registry {
+		devices := map[string]interface{}{}
+		for d := range m {
+			devices[d.String()] = struct{}{}
+		}
+		all[t] = devices
+	}
+	return all
+}
+
 func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, deviceStartup deviceStartupFunc) error {
 	switch accType {
 	case config.GPU: