From 871a4f9810b9ca4c7022943e7ae1d9a6ce1f3032 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 19 Aug 2024 06:31:58 -0400 Subject: [PATCH 1/3] chore: gpu cleanup Cleanup the init() calls in the gpu code. Remove gpu tags from the Makefile with the exception of habana (as the libhlml.go cgo implementation in vendor directory is problematic if libhlml.so isn't available). Signed-off-by: Maryam Tahhan --- Makefile | 8 +- cmd/exporter/exporter.go | 2 +- pkg/collector/energy/node_energy_collector.go | 2 +- pkg/collector/metric_collector.go | 2 +- pkg/collector/metric_collector_test.go | 2 +- .../accelerator/process_gpu_collector.go | 4 +- pkg/collector/stats/node_stats.go | 2 +- pkg/collector/stats/stats.go | 4 +- pkg/collector/stats/test_utils.go | 2 +- pkg/collector/stats/utils.go | 2 +- pkg/metrics/metricfactory/metric_factory.go | 4 +- pkg/metrics/prometheus_collector_test.go | 3 +- pkg/metrics/utils/utils.go | 2 +- pkg/model/process_energy.go | 2 +- pkg/sensors/accelerator/accelerator.go | 126 ++++++++---------- pkg/sensors/accelerator/accelerator_test.go | 106 ++++++--------- .../accelerator/device/sources/dummy.go | 120 ----------------- .../{device/sources => devices}/dcgm.go | 0 .../accelerator/{device => devices}/device.go | 65 +++++---- pkg/sensors/accelerator/devices/fakehabana.go | 25 ++++ .../{device/sources => devices}/habana.go | 31 +++-- .../accelerator/devices/mock_device.go | 116 ++++++++++++++++ .../{device/sources => devices}/nvml.go | 47 +++---- .../accelerator/{device => devices}/type.go | 2 +- 24 files changed, 341 insertions(+), 338 deletions(-) delete mode 100644 pkg/sensors/accelerator/device/sources/dummy.go rename pkg/sensors/accelerator/{device/sources => devices}/dcgm.go (100%) rename pkg/sensors/accelerator/{device => devices}/device.go (74%) create mode 100644 pkg/sensors/accelerator/devices/fakehabana.go rename pkg/sensors/accelerator/{device/sources => devices}/habana.go (84%) create mode 100644 pkg/sensors/accelerator/devices/mock_device.go rename pkg/sensors/accelerator/{device/sources => devices}/nvml.go (86%) rename pkg/sensors/accelerator/{device => devices}/type.go (98%) diff --git a/Makefile b/Makefile index acae4461b4..be53e1eef2 100644 --- a/Makefile +++ b/Makefile @@ -43,14 +43,8 @@ CTR_CMD_PUSH_OPTIONS ?= GENERAL_TAGS := include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo GPU_TAGS := -ifeq ($(shell ldconfig -p | grep -q libnvml_injection.so && echo exists),exists) - GPU_TAGS := nvml -endif -ifeq ($(shell ldconfig -p | grep -q libdcgm.so && echo exists),exists) - GPU_TAGS := dcgm -endif ifeq ($(shell ldconfig -p | grep -q libhlml.so && echo exists),exists) - GPU_TAGS := habana + GPU_TAGS := habana endif # set GOENV diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go index ac5b50f02e..2687a2f13f 100644 --- a/cmd/exporter/exporter.go +++ b/cmd/exporter/exporter.go @@ -148,7 +148,7 @@ func main() { if config.EnabledGPU() { r := accelerator.GetRegistry() - if a, err := accelerator.New(accelerator.GPU, true); err == nil { + if a, err := accelerator.New(config.GPU, true); err == nil { r.MustRegister(a) // Register the accelerator with the registry } else { klog.Errorf("failed to init GPU accelerators: %v", err) diff --git a/pkg/collector/energy/node_energy_collector.go b/pkg/collector/energy/node_energy_collector.go index d30107ae41..763dc6a6bb 100644 --- a/pkg/collector/energy/node_energy_collector.go +++ b/pkg/collector/energy/node_energy_collector.go @@ -67,7 +67,7 @@ func UpdateNodeComponentsEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) func UpdateNodeGPUEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) { defer wg.Done() if config.EnabledGPU() { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { gpuEnergy := gpu.Device().AbsEnergyFromDevice() for gpu, energy := range gpuEnergy { nodeStats.EnergyUsage[config.AbsEnergyInGPU].SetDeltaStat(fmt.Sprintf("%d", gpu), uint64(energy)) diff --git a/pkg/collector/metric_collector.go b/pkg/collector/metric_collector.go index aa030c9ec9..cc704a2699 100644 --- a/pkg/collector/metric_collector.go +++ b/pkg/collector/metric_collector.go @@ -159,7 +159,7 @@ func (c *Collector) updateProcessResourceUtilizationMetrics(wg *sync.WaitGroup) // we first updates the bpf which is responsible to include new processes in the ProcessStats collection resourceBpf.UpdateProcessBPFMetrics(c.bpfExporter, c.ProcessStats) if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { accelerator.UpdateProcessGPUUtilizationMetrics(c.ProcessStats) } } diff --git a/pkg/collector/metric_collector_test.go b/pkg/collector/metric_collector_test.go index 7280af9a4e..e2c6e973a7 100644 --- a/pkg/collector/metric_collector_test.go +++ b/pkg/collector/metric_collector_test.go @@ -16,7 +16,7 @@ import ( ) func newMockCollector(mockAttacher bpf.Exporter) *Collector { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { d := gpu.Device() err := d.Init() // create structure instances that will be accessed to create a containerMetric Expect(err).NotTo(HaveOccurred()) diff --git a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go index 79d0de133a..5bdf4c2b7c 100644 --- a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go +++ b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go @@ -26,7 +26,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/libvirt" acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator" - dev "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" + dev "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/devices" "k8s.io/klog/v2" "github.com/sustainable-computing-io/kepler/pkg/utils" @@ -43,7 +43,7 @@ var ( // UpdateProcessGPUUtilizationMetrics reads the GPU metrics of each process using the GPU func UpdateProcessGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats) { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { d := gpu.Device() migDevices := d.DeviceInstances() for _, _device := range d.DevicesByID() { diff --git a/pkg/collector/stats/node_stats.go b/pkg/collector/stats/node_stats.go index d50ed1fa21..7ab80fe5ba 100644 --- a/pkg/collector/stats/node_stats.go +++ b/pkg/collector/stats/node_stats.go @@ -51,7 +51,7 @@ func (ne *NodeStats) ResetDeltaValues() { func (ne *NodeStats) UpdateIdleEnergyWithMinValue(isComponentsSystemCollectionSupported bool) { // gpu metric if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { ne.CalcIdleEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.GPUComputeUtilization) } } diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go index c9d6b85862..2217345803 100644 --- a/pkg/collector/stats/stats.go +++ b/pkg/collector/stats/stats.go @@ -82,7 +82,7 @@ func NewStats() *Stats { } if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { stats.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection() stats.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() stats.ResourceUsage[config.IdleEnergyInGPU] = types.NewUInt64StatCollection() @@ -141,7 +141,7 @@ func (s *Stats) UpdateDynEnergy() { } // GPU metric if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { for gpuID := range s.EnergyUsage[config.AbsEnergyInGPU] { s.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID) } diff --git a/pkg/collector/stats/test_utils.go b/pkg/collector/stats/test_utils.go index ddb623d442..dd14bbf0ab 100644 --- a/pkg/collector/stats/test_utils.go +++ b/pkg/collector/stats/test_utils.go @@ -32,7 +32,7 @@ const ( // TODO: do not use a fixed usageMetric array in the power models, a structured data is more disarable. func SetMockedCollectorMetrics() { config.GetConfig() - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { err := gpu.Device().Init() // create structure instances that will be accessed to create a processMetric klog.Fatalln(err) } diff --git a/pkg/collector/stats/utils.go b/pkg/collector/stats/utils.go index ad07ad22d3..5c9158e8da 100644 --- a/pkg/collector/stats/utils.go +++ b/pkg/collector/stats/utils.go @@ -31,7 +31,7 @@ func GetProcessFeatureNames() []string { // gpu metric if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { gpuMetrics := []string{config.GPUComputeUtilization, config.GPUMemUtilization} metrics = append(metrics, gpuMetrics...) klog.V(3).Infof("Available GPU metrics: %v", gpuMetrics) diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go index aeb68a899f..9f76994e96 100644 --- a/pkg/metrics/metricfactory/metric_factory.go +++ b/pkg/metrics/metricfactory/metric_factory.go @@ -36,7 +36,7 @@ func EnergyMetricsPromDesc(context string) (descriptions map[string]*prometheus. // set the default source to trained power model source := modeltypes.TrainedPowerModelSource if strings.Contains(name, config.GPU) { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { source = gpu.Device().Name() } } else if strings.Contains(name, config.PLATFORM) && platform.IsSystemCollectionSupported() { @@ -87,7 +87,7 @@ func SCMetricsPromDesc(context string, bpfSupportedMetrics bpf.SupportedMetrics) func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) { descriptions = make(map[string]*prometheus.Desc) if config.EnabledGPU() { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { for _, name := range consts.GPUMetricNames { descriptions[name] = resMetricsPromDesc(context, name, gpu.Device().Name()) } diff --git a/pkg/metrics/prometheus_collector_test.go b/pkg/metrics/prometheus_collector_test.go index abc277d19f..822fe3efa2 100644 --- a/pkg/metrics/prometheus_collector_test.go +++ b/pkg/metrics/prometheus_collector_test.go @@ -33,6 +33,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/bpf" "github.com/sustainable-computing-io/kepler/pkg/collector" "github.com/sustainable-computing-io/kepler/pkg/collector/stats" + "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/model" acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator" @@ -63,7 +64,7 @@ var _ = Describe("Test Prometheus Collector Unit", func() { // we need to disable the system real time power metrics for testing since we add mock values or use power model estimator components.SetIsSystemCollectionSupported(false) platform.SetIsSystemCollectionSupported(false) - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { err := gpu.Device().Init() // create structure instances that will be accessed to create a containerMetric Expect(err).NotTo(HaveOccurred()) } diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go index 9d66ee1515..ee52a4f03a 100644 --- a/pkg/metrics/utils/utils.go +++ b/pkg/metrics/utils/utils.go @@ -58,7 +58,7 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac CollectResUtil(ch, instance, collectorName, collectors[collectorName]) } if config.EnabledGPU() { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { for _, collectorName := range consts.GPUMetricNames { CollectResUtil(ch, instance, collectorName, collectors[collectorName]) } diff --git a/pkg/model/process_energy.go b/pkg/model/process_energy.go index 824016c367..19857919af 100644 --- a/pkg/model/process_energy.go +++ b/pkg/model/process_energy.go @@ -201,7 +201,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta } // estimate the associated power consumption of GPU for each process if config.EnabledGPU() { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { processGPUPower, errGPU = processComponentPowerModel.GetGPUPower(isIdlePower) if errGPU != nil { klog.V(5).Infoln("Could not estimate the Process GPU Power") diff --git a/pkg/sensors/accelerator/accelerator.go b/pkg/sensors/accelerator/accelerator.go index f0a77b8fc1..db2f4a618c 100644 --- a/pkg/sensors/accelerator/accelerator.go +++ b/pkg/sensors/accelerator/accelerator.go @@ -20,11 +20,8 @@ import ( "time" "github.com/pkg/errors" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" + "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/devices" "k8s.io/klog/v2" - - // Add supported devices. - _ "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device/sources" ) var ( @@ -32,45 +29,30 @@ var ( once sync.Once ) -const ( - DUMMY AcceleratorType = iota - GPU - // Add other accelerator types here [IPU|DPU|...] -) - -type AcceleratorType int - -func (a AcceleratorType) String() string { - return [...]string{"DUMMY", "GPU"}[a] -} - // Accelerator represents an implementation of... equivalent Accelerator device. type Accelerator interface { // Device returns an underlying accelerator device implementation... - Device() device.Device + Device() devices.Device // IsRunning returns whether or not that device is running IsRunning() bool - // AccType returns the accelerator type. - AccType() AcceleratorType // stop stops an accelerator and unregisters it stop() } type accelerator struct { - dev device.Device // Device Accelerator Interface - accType AcceleratorType + dev devices.Device // Device Accelerator Interface running bool } type Registry struct { - Registry map[AcceleratorType]Accelerator + Registry map[string]Accelerator } // Registry gets the default device Registry instance func GetRegistry() *Registry { once.Do(func() { globalRegistry = &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } }) return globalRegistry @@ -84,27 +66,27 @@ func SetRegistry(registry *Registry) { } func (r *Registry) MustRegister(a Accelerator) { - _, ok := r.Registry[a.AccType()] + _, ok := r.Registry[a.Device().HwType()] if ok { - klog.V(5).Infof("Accelerator with type %s already exists", a.AccType()) + klog.V(5).Infof("Accelerator with type %s already exists", a.Device().HwType()) return } - r.Registry[a.AccType()] = a + r.Registry[a.Device().HwType()] = a } func (r *Registry) Unregister(a Accelerator) bool { - _, exists := r.Registry[a.AccType()] + _, exists := r.Registry[a.Device().HwType()] if exists { - delete(r.Registry, a.AccType()) + delete(r.Registry, a.Device().HwType()) return true } - klog.Errorf("Accelerator with type %s doesn't exist", a.AccType()) + klog.Errorf("Accelerator with type %s doesn't exist", a.Device().HwType()) return false } // Devices returns a map of supported accelerators. -func (r *Registry) Accelerators() map[device.DeviceType]Accelerator { - acc := map[device.DeviceType]Accelerator{} +func (r *Registry) accelerators() map[string]Accelerator { + acc := map[string]Accelerator{} if len(r.Registry) == 0 { // No accelerators found @@ -115,7 +97,7 @@ func (r *Registry) Accelerators() map[device.DeviceType]Accelerator { if a.IsRunning() { d := a.Device() if d.IsDeviceCollectionSupported() { - acc[d.DevType()] = a + acc[d.HwType()] = a } } } @@ -124,9 +106,15 @@ func (r *Registry) Accelerators() map[device.DeviceType]Accelerator { } // ActiveAcceleratorByType returns a map of supported accelerators based on the specified type... -func (r *Registry) ActiveAcceleratorByType(t AcceleratorType) Accelerator { +func (r *Registry) activeAcceleratorByType(t string) Accelerator { + if len(r.Registry) == 0 { + // No accelerators found + klog.V(5).Infof("No accelerators found") + return nil + } + for _, a := range r.Registry { - if a.AccType() == t && a.IsRunning() { + if a.Device().HwType() == t && a.IsRunning() { d := a.Device() if d.IsDeviceCollectionSupported() { return a @@ -136,56 +124,48 @@ func (r *Registry) ActiveAcceleratorByType(t AcceleratorType) Accelerator { return nil } -func New(atype AcceleratorType, sleep bool) (Accelerator, error) { - var numDevs int +func New(atype string, sleep bool) (Accelerator, error) { + var d devices.Device maxDeviceInitRetry := 10 - var d device.Device - switch atype { - case GPU: - numDevs = len(device.GetAllDeviceTypes()) - default: - return nil, errors.New("unsupported accelerator") + // Init the available devices. + + devs := devices.GetRegistry().GetAllDeviceTypes() + numDevs := len(devs) + if numDevs == 0 || !slices.Contains(devs, atype) { + return nil, errors.New("no devices found") } - if numDevs != 0 { - devices := device.GetAllDeviceTypes() - if !slices.Contains(devices, atype.String()) { - klog.Errorf("%v doesn't contain %s", devices, atype.String()) - return nil, errors.New("no devices found") - } - klog.V(5).Infof("Initializing the Accelerator of type %v", atype.String()) - - for i := 0; i < maxDeviceInitRetry; i++ { - if d = device.Startup(atype.String()); d == nil { - klog.Errorf("Could not init the %s device going to try again", atype.String()) - if sleep { - // The GPU operators typically takes longer time to initialize than kepler resulting in error to start the gpu driver - // therefore, we wait up to 1 min to allow the gpu operator initialize - time.Sleep(6 * time.Second) - } - continue + klog.V(5).Infof("Initializing the Accelerator of type %v", atype) + + for i := 0; i < maxDeviceInitRetry; i++ { + if d = devices.Startup(atype); d == nil { + klog.Errorf("Could not init the %s device going to try again", atype) + if sleep { + // The GPU operators typically takes longer time to initialize than kepler resulting in error to start the gpu driver + // therefore, we wait up to 1 min to allow the gpu operator initialize + time.Sleep(6 * time.Second) } - klog.V(5).Infof("Startup %s Accelerator successful", atype.String()) - break + continue } - } else { - return nil, errors.New("No Accelerator devices found") + klog.V(5).Infof("Startup %s Accelerator successful", atype) + break } return &accelerator{ dev: d, running: true, - accType: atype, }, nil } func Shutdown() { - if accelerators := GetRegistry().Accelerators(); accelerators != nil { + if accelerators := GetRegistry().accelerators(); accelerators != nil { for _, a := range accelerators { - klog.V(5).Infof("Shutting down %s", a.AccType().String()) + klog.V(5).Infof("Shutting down %s", a.Device().DevType()) a.stop() } + } else { + klog.V(5).Info("No devices to shutdown") } } @@ -205,16 +185,24 @@ func (a *accelerator) stop() { } // Device returns an accelerator interface -func (a *accelerator) Device() device.Device { +func (a *accelerator) Device() devices.Device { return a.dev } // DeviceType returns the accelerator's underlying device type -func (a *accelerator) AccType() AcceleratorType { - return a.accType +func (a *accelerator) DevType() string { + return a.dev.DevType().String() } // IsRunning returns the running status of an accelerator func (a *accelerator) IsRunning() bool { return a.running } + +func GetActiveAcceleratorByType(t string) Accelerator { + return GetRegistry().activeAcceleratorByType(t) +} + +func GetAccelerators() map[string]Accelerator { + return GetRegistry().accelerators() +} diff --git a/pkg/sensors/accelerator/accelerator_test.go b/pkg/sensors/accelerator/accelerator_test.go index 30d8575cf2..52695e7cc2 100644 --- a/pkg/sensors/accelerator/accelerator_test.go +++ b/pkg/sensors/accelerator/accelerator_test.go @@ -3,15 +3,11 @@ package accelerator import ( "testing" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" - - // Add supported devices. - - _ "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device/sources" + "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/devices" ) -func newMockDevice() device.Device { - return device.Startup(AcceleratorType(0).String()) +func newMockDevice() devices.Device { + return devices.Startup(devices.MOCK.String()) } func cleanupMockDevice() { @@ -30,7 +26,7 @@ func TestRegistry(t *testing.T) { name: "Empty registry", setup: func() *Registry { registry := Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } SetRegistry(®istry) @@ -38,19 +34,18 @@ func TestRegistry(t *testing.T) { }, expectedLen: 0, expectError: false, - cleanup: func() {}, + cleanup: func() { cleanupMockDevice() }, }, { name: "Non-empty registry", setup: func() *Registry { registry := &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } SetRegistry(registry) - + devices.RegisterMockDevice() a := &accelerator{ dev: newMockDevice(), - accType: AcceleratorType(0), running: true, } registry.MustRegister(a) @@ -68,10 +63,10 @@ func TestRegistry(t *testing.T) { var a Accelerator var err error registry := tt.setup() - if a, err = New(AcceleratorType(0), true); err == nil { + if a, err = New("MOCK", true); err == nil { registry.MustRegister(a) // Register the accelerator with the registry } - accs := registry.Accelerators() + accs := GetAccelerators() if tt.expectError && err == nil { t.Errorf("expected an error but got nil") } @@ -90,7 +85,6 @@ func TestActiveAcceleratorByType(t *testing.T) { tests := []struct { name string setup func() *Registry - accType AcceleratorType expectError bool cleanup func() }{ @@ -98,53 +92,37 @@ func TestActiveAcceleratorByType(t *testing.T) { name: "No accelerators of given type", setup: func() *Registry { return &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } }, - accType: AcceleratorType(0), expectError: true, - cleanup: func() {}, + cleanup: func() { cleanupMockDevice() }, }, { name: "One active accelerator of given type", setup: func() *Registry { registry := &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } - registry.Registry[AcceleratorType(0)] = &accelerator{ + SetRegistry(registry) + devices.RegisterMockDevice() + a := &accelerator{ dev: newMockDevice(), - accType: AcceleratorType(0), running: true, } - return registry + registry.MustRegister(a) + + return GetRegistry() }, - accType: AcceleratorType(0), expectError: false, cleanup: func() { cleanupMockDevice() }, }, - { - name: "One inactive accelerator of given type", - setup: func() *Registry { - registry := &Registry{ - Registry: map[AcceleratorType]Accelerator{}, - } - registry.Registry[AcceleratorType(0)] = &accelerator{ - dev: newMockDevice(), - accType: AcceleratorType(0), - running: false, - } - return registry - }, - accType: AcceleratorType(0), - expectError: true, - cleanup: func() {}, - }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - registry := tt.setup() - accs := registry.ActiveAcceleratorByType(tt.accType) + tt.setup() + accs := GetActiveAcceleratorByType("MOCK") if tt.expectError && accs != nil { t.Errorf("expected an error") } @@ -159,7 +137,7 @@ func TestActiveAcceleratorByType(t *testing.T) { func TestCreateAndRegister(t *testing.T) { tests := []struct { name string - accType AcceleratorType + accType string setup func() *Registry sleep bool expectError bool @@ -167,15 +145,15 @@ func TestCreateAndRegister(t *testing.T) { }{ { name: "Unsupported accelerator", - accType: AcceleratorType(999), // invalid accelerator type + accType: "UNSUPPORTED", // invalid accelerator type setup: func() *Registry { return &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } }, sleep: false, expectError: true, - cleanup: func() {}, + cleanup: func() { cleanupMockDevice() }, }, } @@ -197,37 +175,34 @@ func TestCreateAndRegister(t *testing.T) { func TestShutdown(t *testing.T) { tests := []struct { - name string - setup func() *Registry - accType AcceleratorType + name string + setup func() *Registry }{ { name: "Shutdown active accelerators", setup: func() *Registry { registry := &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } SetRegistry(registry) + devices.RegisterMockDevice() a := &accelerator{ dev: newMockDevice(), - accType: AcceleratorType(0), running: true, } registry.MustRegister(a) return GetRegistry() }, - accType: AcceleratorType(0), }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - registry := tt.setup() - SetRegistry(registry) + tt.setup() Shutdown() - accs := GetRegistry().Accelerators() + accs := GetAccelerators() for _, a := range accs { if a.IsRunning() { t.Errorf("expected accelerator to be stopped but it is still running") @@ -238,23 +213,30 @@ func TestShutdown(t *testing.T) { } func TestAcceleratorMethods(t *testing.T) { - devType := device.DeviceType(0) + registry := &Registry{ + Registry: map[string]Accelerator{}, + } + + SetRegistry(registry) + + devices.RegisterMockDevice() + acc := &accelerator{ dev: newMockDevice(), running: true, - accType: AcceleratorType(0), } + registry.MustRegister(acc) - if got := acc.Device(); got.DevType() != devType { + devType := acc.dev.HwType() + + if got := acc.Device(); got.HwType() != devType { t.Errorf("expected device type %v, got %v", devType, got.DevType()) } - if got := acc.Device().DevType(); got != devType { + if got := acc.Device().HwType(); got != devType { t.Errorf("expected device type %v, got %v", devType, got) } if got := acc.IsRunning(); !got { t.Errorf("expected accelerator to be running, got %v", got) } - if got := acc.AccType(); got != AcceleratorType(0) { - t.Errorf("expected accelerator type AcceleratorType(0), got %v", got) - } + Shutdown() } diff --git a/pkg/sensors/accelerator/device/sources/dummy.go b/pkg/sensors/accelerator/device/sources/dummy.go deleted file mode 100644 index 40199e25ee..0000000000 --- a/pkg/sensors/accelerator/device/sources/dummy.go +++ /dev/null @@ -1,120 +0,0 @@ -//go:build !habana || !nvml || !dcgm -// +build !habana !nvml !dcgm - -/* -Copyright 2024. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package sources - -import ( - "time" - - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" -) - -var ( - dummyDevice device.DeviceType -) - -type Dummy struct { - dummyDevice device.DeviceType - name string - collectionSupported bool -} - -func init() { - dummyDevice = device.DUMMY - device.AddDeviceInterface(dummyDevice, dummyDevice.String(), DummyDeviceStartup) -} - -func DummyDeviceStartup() device.Device { - d := Dummy{ - dummyDevice: dummyDevice, - name: dummyDevice.String(), - collectionSupported: true, - } - - return &d -} - -func (d *Dummy) Name() string { - return d.name -} - -func (d *Dummy) DevType() device.DeviceType { - return d.dummyDevice -} - -func (d *Dummy) HwType() string { - return d.dummyDevice.String() -} - -func (d *Dummy) InitLib() error { - return nil -} - -func (d *Dummy) Init() error { - return nil -} - -func (d *Dummy) Shutdown() bool { - return true -} - -func (d *Dummy) AbsEnergyFromDevice() []uint32 { - return nil -} - -func (d *Dummy) DevicesByID() map[int]any { - return nil -} - -func (d *Dummy) DevicesByName() map[string]any { - return nil -} - -func (d *Dummy) DeviceInstances() map[int]map[int]any { - return nil -} - -func (d *Dummy) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { - ds := make(map[any]interface{}) // Process Accelerator Metrics - return ds, nil -} - -func (d *Dummy) ProcessResourceUtilizationPerDevice(dev any, _ time.Duration) (map[uint32]any, error) { - processAcceleratorMetrics := map[uint32]device.GPUProcessUtilizationSample{} - pam := make(map[uint32]interface{}) - processAcceleratorMetrics[0] = device.GPUProcessUtilizationSample{ - Pid: 0, - TimeStamp: uint64(time.Now().UnixNano()), - ComputeUtil: 10, - MemUtil: 10, - EncUtil: 10, - DecUtil: 10, - } - for k, v := range processAcceleratorMetrics { - pam[k] = v - } - return pam, nil -} - -func (d *Dummy) IsDeviceCollectionSupported() bool { - return d.collectionSupported -} - -func (d *Dummy) SetDeviceCollectionSupported(supported bool) { - d.collectionSupported = supported -} diff --git a/pkg/sensors/accelerator/device/sources/dcgm.go b/pkg/sensors/accelerator/devices/dcgm.go similarity index 100% rename from pkg/sensors/accelerator/device/sources/dcgm.go rename to pkg/sensors/accelerator/devices/dcgm.go diff --git a/pkg/sensors/accelerator/device/device.go b/pkg/sensors/accelerator/devices/device.go similarity index 74% rename from pkg/sensors/accelerator/device/device.go rename to pkg/sensors/accelerator/devices/device.go index b46ef37f04..58d1f842f4 100644 --- a/pkg/sensors/accelerator/device/device.go +++ b/pkg/sensors/accelerator/devices/device.go @@ -1,22 +1,23 @@ -package device +package devices import ( "sync" "time" + "github.com/sustainable-computing-io/kepler/pkg/config" "golang.org/x/exp/maps" "k8s.io/klog/v2" ) const ( - DUMMY DeviceType = iota + MOCK DeviceType = iota HABANA DCGM NVML ) var ( - globalRegistry *Registry + deviceRegistry *Registry once sync.Once ) @@ -29,7 +30,7 @@ type ( ) func (d DeviceType) String() string { - return [...]string{"DUMMY", "HABANA", "DCGM", "NVML"}[d] + return [...]string{"MOCK", "HABANA", "DCGM", "NVML"}[d] } type Device interface { @@ -66,18 +67,33 @@ type Device interface { // Registry gets the default device Registry instance func GetRegistry() *Registry { once.Do(func() { - globalRegistry = &Registry{ - Registry: map[string]map[DeviceType]deviceStartupFunc{}, - } + deviceRegistry = newRegistry() + registerDevices(deviceRegistry) }) - return globalRegistry + return deviceRegistry +} + +// NewRegistry creates a new instance of Registry without registering devices +func newRegistry() *Registry { + return &Registry{ + Registry: map[string]map[DeviceType]deviceStartupFunc{}, + } } // SetRegistry replaces the global registry instance // NOTE: All plugins will need to be manually registered // after this function is called. func SetRegistry(registry *Registry) { - globalRegistry = registry + deviceRegistry = registry + registerDevices(deviceRegistry) +} + +// Register all available devices in the global registry +func registerDevices(r *Registry) { + // Call individual device check functions + dcgmCheck(r) + habanaCheck(r) + nvmlCheck(r) } func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStartupFunc) { @@ -103,40 +119,41 @@ func (r *Registry) Unregister(d DeviceType) { klog.Errorf("Device with type %s doesn't exist", d) } -// AddDeviceInterface adds a supported device interface, prints a fatal error in case of double registration. -func AddDeviceInterface(dtype DeviceType, accType string, deviceStartup deviceStartupFunc) { +// GetAllDeviceTypes returns a slice with all the registered devices. +func (r *Registry) GetAllDeviceTypes() []string { + devices := append([]string{}, maps.Keys(r.Registry)...) + return devices +} + +func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, deviceStartup deviceStartupFunc) { switch accType { - case "GPU", "DUMMY": - // Handle GPU|Dummy device startup function registration - if existingDevice := GetRegistry().Registry[accType][dtype]; existingDevice != nil { + case config.GPU: + // Check if device is already registered + if existingDevice := registry.Registry[accType][dtype]; existingDevice != nil { klog.Errorf("Multiple Devices attempting to register with name %q", dtype.String()) return } if dtype == DCGM { // Remove "nvml" if "dcgm" is being registered - GetRegistry().Unregister(NVML) + registry.Unregister(NVML) } else if dtype == NVML { // Do not register "nvml" if "dcgm" is already registered - if _, ok := GetRegistry().Registry["GPU"][DCGM]; ok { + if _, ok := registry.Registry[config.GPU][DCGM]; ok { return } } + klog.V(5).Infof("Try to Register %s", dtype) - GetRegistry().MustRegister(accType, dtype, deviceStartup) + registry.MustRegister(accType, dtype, deviceStartup) default: - klog.Errorf("Unsupported device type %q", dtype) + klog.V(5).Infof("Try to Register %s", dtype) + registry.MustRegister(accType, dtype, deviceStartup) } klog.V(5).Infof("Registered %s", dtype) } -// GetAllDeviceTypes returns a slice with all the registered devices. -func GetAllDeviceTypes() []string { - devices := append([]string{}, maps.Keys(GetRegistry().Registry)...) - return devices -} - // Startup initializes and returns a new Device according to the given DeviceType [NVML|DCGM|DUMMY|HABANA]. func Startup(a string) Device { // Retrieve the global registry diff --git a/pkg/sensors/accelerator/devices/fakehabana.go b/pkg/sensors/accelerator/devices/fakehabana.go new file mode 100644 index 0000000000..95e16d79d5 --- /dev/null +++ b/pkg/sensors/accelerator/devices/fakehabana.go @@ -0,0 +1,25 @@ +//go:build !habana +// +build !habana + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package devices + +import "k8s.io/klog/v2" + +func habanaCheck(r *Registry) { + klog.V(5).Info("Error initializing habana: ERROR_LIBRARY_NOT_FOUND") +} diff --git a/pkg/sensors/accelerator/device/sources/habana.go b/pkg/sensors/accelerator/devices/habana.go similarity index 84% rename from pkg/sensors/accelerator/device/sources/habana.go rename to pkg/sensors/accelerator/devices/habana.go index 7cfefb3ce6..d93c884360 100644 --- a/pkg/sensors/accelerator/device/sources/habana.go +++ b/pkg/sensors/accelerator/devices/habana.go @@ -16,25 +16,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -package sources +package devices import ( + "errors" + "os" "time" hlml "github.com/HabanaAI/gohlml" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" "k8s.io/klog/v2" "github.com/sustainable-computing-io/kepler/pkg/config" ) const ( - habanaAccType = "GPU" + habanaAccType = config.GPU + libhlmlpath = "/usr/lib/habanalabs/libhlml.so" ) var ( habanaAccImpl = GPUHabana{} - habanaType device.DeviceType + habanaType DeviceType ) type GPUHabana struct { @@ -42,18 +44,18 @@ type GPUHabana struct { devices map[int]interface{} } -func init() { +func habanaCheck(r *Registry) { if err := habanaAccImpl.InitLib(); err != nil { - klog.Infof("Error initializing %s: %v", habanaAccImpl.Name(), err) + klog.V(5).Infof("Error initializing %s: %v", habanaAccImpl.Name(), err) return } - habanaType = device.HABANA + habanaType = HABANA klog.V(5).Infof("Register %s with device startup register", habanaType) - device.AddDeviceInterface(habanaType, habanaAccType, habanaDeviceStartup) + addDeviceInterface(r, habanaType, habanaAccType, habanaDeviceStartup) klog.Infof("Using %s to obtain processor power", habanaAccImpl.Name()) } -func habanaDeviceStartup() device.Device { +func habanaDeviceStartup() Device { a := habanaAccImpl if err := a.Init(); err != nil { @@ -68,7 +70,7 @@ func (g *GPUHabana) Name() string { return habanaType.String() } -func (g *GPUHabana) DevType() device.DeviceType { +func (g *GPUHabana) DevType() DeviceType { return habanaType } @@ -77,6 +79,9 @@ func (g *GPUHabana) HwType() string { } func (g *GPUHabana) InitLib() error { + if _, err := os.Stat(libhlmlpath); errors.Is(err, os.ErrNotExist) { + return err + } return nil } @@ -103,7 +108,7 @@ func (g *GPUHabana) Shutdown() bool { func (g *GPUHabana) AbsEnergyFromDevice() []uint32 { gpuEnergy := []uint32{} for _, dev := range g.devices { - power, ret := dev.(device.GPUDevice).DeviceHandler.(hlml.Device).PowerUsage() + power, ret := dev.(GPUDevice).DeviceHandler.(hlml.Device).PowerUsage() if ret != nil { klog.Errorf("failed to get power usage on device %v: %v\n", dev, ret) continue @@ -111,7 +116,7 @@ func (g *GPUHabana) AbsEnergyFromDevice() []uint32 { energy := uint32(uint64(power) * config.SamplePeriodSec()) gpuEnergy = append(gpuEnergy, energy) - dname, _ := dev.(device.GPUDevice).DeviceHandler.(hlml.Device).Name() + dname, _ := dev.(GPUDevice).DeviceHandler.(hlml.Device).Name() klog.V(5).Infof("AbsEnergyFromDevice power usage on device %v: %v\n", dname, gpuEnergy) } @@ -133,7 +138,7 @@ func (g *GPUHabana) DevicesByID() map[int]interface{} { for i := 0; i < int(count); i++ { // Get the device handle for the current index if h, ret := hlml.DeviceHandleByIndex(uint(i)); ret == nil { - devices[i] = device.GPUDevice{ + devices[i] = GPUDevice{ DeviceHandler: h, } } diff --git a/pkg/sensors/accelerator/devices/mock_device.go b/pkg/sensors/accelerator/devices/mock_device.go new file mode 100644 index 0000000000..8cb8329597 --- /dev/null +++ b/pkg/sensors/accelerator/devices/mock_device.go @@ -0,0 +1,116 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package devices + +import ( + "time" +) + +var ( + mockDevice = MOCK +) + +type MockDevice struct { + mockDevice DeviceType + name string + collectionSupported bool +} + +func RegisterMockDevice() { + r := GetRegistry() + addDeviceInterface(r, mockDevice, mockDevice.String(), MockDeviceDeviceStartup) +} + +func MockDeviceDeviceStartup() Device { + d := MockDevice{ + mockDevice: mockDevice, + name: mockDevice.String(), + collectionSupported: true, + } + + return &d +} + +func (d *MockDevice) Name() string { + return d.name +} + +func (d *MockDevice) DevType() DeviceType { + return d.mockDevice +} + +func (d *MockDevice) HwType() string { + return d.mockDevice.String() +} + +func (d *MockDevice) InitLib() error { + return nil +} + +func (d *MockDevice) Init() error { + return nil +} + +func (d *MockDevice) Shutdown() bool { + GetRegistry().Unregister(d.DevType()) + return true +} + +func (d *MockDevice) AbsEnergyFromDevice() []uint32 { + return nil +} + +func (d *MockDevice) DevicesByID() map[int]any { + return nil +} + +func (d *MockDevice) DevicesByName() map[string]any { + return nil +} + +func (d *MockDevice) DeviceInstances() map[int]map[int]any { + return nil +} + +func (d *MockDevice) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { + ds := make(map[any]interface{}) // Process Accelerator Metrics + return ds, nil +} + +func (d *MockDevice) ProcessResourceUtilizationPerDevice(dev any, _ time.Duration) (map[uint32]any, error) { + processAcceleratorMetrics := map[uint32]GPUProcessUtilizationSample{} + pam := make(map[uint32]interface{}) + processAcceleratorMetrics[0] = GPUProcessUtilizationSample{ + Pid: 0, + TimeStamp: uint64(time.Now().UnixNano()), + ComputeUtil: 10, + MemUtil: 10, + EncUtil: 10, + DecUtil: 10, + } + for k, v := range processAcceleratorMetrics { + pam[k] = v + } + return pam, nil +} + +func (d *MockDevice) IsDeviceCollectionSupported() bool { + return d.collectionSupported +} + +func (d *MockDevice) SetDeviceCollectionSupported(supported bool) { + d.collectionSupported = supported +} diff --git a/pkg/sensors/accelerator/device/sources/nvml.go b/pkg/sensors/accelerator/devices/nvml.go similarity index 86% rename from pkg/sensors/accelerator/device/sources/nvml.go rename to pkg/sensors/accelerator/devices/nvml.go index 04b9931c75..f3f6e1bbfb 100644 --- a/pkg/sensors/accelerator/device/sources/nvml.go +++ b/pkg/sensors/accelerator/devices/nvml.go @@ -1,8 +1,5 @@ -//go:build nvml -// +build nvml - /* -Copyright 2021. +Copyright 2021-2024 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -package sources +package devices import ( "errors" @@ -27,38 +24,36 @@ import ( "k8s.io/klog/v2" "github.com/sustainable-computing-io/kepler/pkg/config" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" ) const ( - nvmlHwType = "GPU" + nvmlHwType = config.GPU ) var ( nvmlAccImpl = GPUNvml{} - nvmlType device.DeviceType + nvmlType DeviceType ) type GPUNvml struct { libInited bool collectionSupported bool - devices map[int]device.GPUDevice // List of GPU identifiers for the device - processUtilizationSupported bool // bool to check if the process utilization collection is supported + devices map[int]GPUDevice // List of GPU identifiers for the device + processUtilizationSupported bool // bool to check if the process utilization collection is supported } -func init() { +func nvmlCheck(r *Registry) { if err := nvml.Init(); err != nvml.SUCCESS { - klog.Errorf("Error initializing nvml: %v", err) + klog.V(5).Infof("Error initializing nvml: %v", nvmlErrorString(err)) return } klog.Info("Initializing nvml Successful") - nvmlType = device.NVML - device.AddDeviceInterface(nvmlType, nvmlHwType, nvmlDeviceStartup) + nvmlType = NVML + addDeviceInterface(r, nvmlType, nvmlHwType, nvmlDeviceStartup) klog.Infof("Using %s to obtain processor power", nvmlAccImpl.Name()) - } -func nvmlDeviceStartup() device.Device { +func nvmlDeviceStartup() Device { a := nvmlAccImpl if err := a.InitLib(); err != nil { klog.Errorf("Error initializing %s: %v", nvmlType.String(), err) @@ -76,7 +71,7 @@ func (n *GPUNvml) Name() string { return nvmlType.String() } -func (n *GPUNvml) DevType() device.DeviceType { +func (n *GPUNvml) DevType() DeviceType { return nvmlType } @@ -125,7 +120,7 @@ func (n *GPUNvml) Init() (err error) { return err } klog.Infof("found %d gpu devices\n", count) - n.devices = make(map[int]device.GPUDevice, count) + n.devices = make(map[int]GPUDevice, count) for gpuID := 0; gpuID < count; gpuID++ { nvmlDeviceHandler, ret := nvml.DeviceGetHandleByIndex(gpuID) if ret != nvml.SUCCESS { @@ -137,7 +132,7 @@ func (n *GPUNvml) Init() (err error) { name, _ := nvmlDeviceHandler.GetName() uuid, _ := nvmlDeviceHandler.GetUUID() klog.Infof("GPU %v %q %q", gpuID, name, uuid) - dev := device.GPUDevice{ + dev := GPUDevice{ DeviceHandler: nvmlDeviceHandler, ID: gpuID, IsSubdevice: false, @@ -199,17 +194,17 @@ func (n *GPUNvml) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { // GPUProcessUtilizationSample.SmUtil represents the process Streaming Multiprocessors - SM (3D/Compute) utilization in percentage. // GPUProcessUtilizationSample.MemUtil represents the process Frame Buffer Memory utilization Value. func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { - processAcceleratorMetrics := map[uint32]device.GPUProcessUtilizationSample{} + processAcceleratorMetrics := map[uint32]GPUProcessUtilizationSample{} pam := make(map[uint32]interface{}) lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000) switch d := dev.(type) { - case device.GPUDevice: + case GPUDevice: if d.DeviceHandler == nil { return pam, nil } if n.processUtilizationSupported { - GPUProcessUtilizationSample, ret := d.DeviceHandler.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp) + gpuProcessUtilizationSample, ret := d.DeviceHandler.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp) if ret != nvml.SUCCESS { if ret == nvml.ERROR_NOT_FOUND { // Ignore the error if there is no process running in the GPU @@ -217,10 +212,10 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati } n.processUtilizationSupported = false } else { - for _, pinfo := range GPUProcessUtilizationSample { + for _, pinfo := range gpuProcessUtilizationSample { // pid 0 means no data if pinfo.Pid != 0 { - processAcceleratorMetrics[pinfo.Pid] = device.GPUProcessUtilizationSample{ + processAcceleratorMetrics[pinfo.Pid] = GPUProcessUtilizationSample{ Pid: pinfo.Pid, TimeStamp: pinfo.TimeStamp, ComputeUtil: pinfo.SmUtil, @@ -251,7 +246,7 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati for _, pinfo := range processInfo { // pid 0 means no data if pinfo.Pid != 0 { - processAcceleratorMetrics[pinfo.Pid] = device.GPUProcessUtilizationSample{ + processAcceleratorMetrics[pinfo.Pid] = GPUProcessUtilizationSample{ Pid: pinfo.Pid, MemUtil: uint32(pinfo.UsedGpuMemory * 100 / memoryInfo.Total), } @@ -266,7 +261,7 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati return pam, nil default: - klog.Error("expected device.GPUDevice but got come other type") + klog.Error("expected GPUDevice but got come other type") return pam, errors.New("invalid device type") } } diff --git a/pkg/sensors/accelerator/device/type.go b/pkg/sensors/accelerator/devices/type.go similarity index 98% rename from pkg/sensors/accelerator/device/type.go rename to pkg/sensors/accelerator/devices/type.go index f14dcb455c..c0bfc92104 100644 --- a/pkg/sensors/accelerator/device/type.go +++ b/pkg/sensors/accelerator/devices/type.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package device +package devices type GPUProcessUtilizationSample struct { Pid uint32 From ee20e91d8ceea6cf0bd0ccffd9148280c0e41549 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 19 Aug 2024 09:43:54 -0400 Subject: [PATCH 2/3] chore: add missing copyright tags to gpu code Signed-off-by: Maryam Tahhan --- pkg/sensors/accelerator/accelerator_test.go | 15 +++++++++++++++ pkg/sensors/accelerator/devices/device.go | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/pkg/sensors/accelerator/accelerator_test.go b/pkg/sensors/accelerator/accelerator_test.go index 52695e7cc2..59f1ed96f2 100644 --- a/pkg/sensors/accelerator/accelerator_test.go +++ b/pkg/sensors/accelerator/accelerator_test.go @@ -1,3 +1,18 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package accelerator import ( diff --git a/pkg/sensors/accelerator/devices/device.go b/pkg/sensors/accelerator/devices/device.go index 58d1f842f4..7c564f0d60 100644 --- a/pkg/sensors/accelerator/devices/device.go +++ b/pkg/sensors/accelerator/devices/device.go @@ -1,3 +1,18 @@ +/* +Copyright 2024. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package devices import ( From bb1b4efe26a0742fe0ee549c1ce5374b04d2c214 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 19 Aug 2024 10:20:48 -0400 Subject: [PATCH 3/3] chore: remove gpu global vars Signed-off-by: Maryam Tahhan --- pkg/sensors/accelerator/devices/dcgm.go | 147 +++++++++--------- pkg/sensors/accelerator/devices/device.go | 11 +- pkg/sensors/accelerator/devices/habana.go | 39 ++--- .../accelerator/devices/mock_device.go | 6 +- pkg/sensors/accelerator/devices/nvml.go | 41 ++--- 5 files changed, 125 insertions(+), 119 deletions(-) diff --git a/pkg/sensors/accelerator/devices/dcgm.go b/pkg/sensors/accelerator/devices/dcgm.go index d3e9c5fb4e..95ca57e06d 100644 --- a/pkg/sensors/accelerator/devices/dcgm.go +++ b/pkg/sensors/accelerator/devices/dcgm.go @@ -1,6 +1,3 @@ -//go:build dcgm -// +build dcgm - /* Copyright 2024. @@ -17,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package sources +package devices import ( "errors" @@ -27,7 +24,6 @@ import ( "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/sustainable-computing-io/kepler/pkg/config" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" "k8s.io/klog/v2" ) @@ -35,24 +31,24 @@ const ( debugLevel = 5 isSocket = "0" maxMIGProfiles = 15 // use a large number since the profile ids are not linear - dcgmHwType = "GPU" + dcgmHwType = config.GPU ) var ( - dcgmAccImpl = GPUDcgm{} + dcgmAccImpl = gpuDcgm{} deviceFields []dcgm.Short = []dcgm.Short{ // https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.htm dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, } ratioFields uint = dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE // this is the field that we will use to calculate the utilization per @yuezhu1 profileInfos map[int]nvml.GpuInstanceProfileInfo - dcgmType device.DeviceType + dcgmType DeviceType ) -type GPUDcgm struct { +type gpuDcgm struct { collectionSupported bool - devices map[int]device.GPUDevice - migDevices map[int]map[int]device.GPUDevice // list of mig devices for each GPU instance + devs map[int]GPUDevice + migDevices map[int]map[int]GPUDevice // list of mig devices for each GPU instance libInited bool nvmlInited bool deviceGroupName string @@ -62,18 +58,21 @@ type GPUDcgm struct { cleanup func() } -func init() { +func dcgmCheck(r *Registry) { if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint(), isSocket); err != nil { - klog.Errorf("Error initializing dcgm: %v", err) + klog.V(5).Infof("Error initializing dcgm: %v", err) return } klog.Info("Initializing dcgm Successful") - dcgmType = device.DCGM - device.AddDeviceInterface(dcgmType, dcgmHwType, dcgmDeviceStartup) - klog.Infof("Using %s to obtain processor power", dcgmAccImpl.Name()) + dcgmType = DCGM + if err := addDeviceInterface(r, dcgmType, dcgmHwType, dcgmDeviceStartup); err == nil { + klog.Infof("Using %s to obtain processor power", dcgmAccImpl.Name()) + } else { + klog.V(5).Infof("Error registering DCGM: %v", err) + } } -func dcgmDeviceStartup() device.Device { +func dcgmDeviceStartup() Device { a := dcgmAccImpl if err := a.InitLib(); err != nil { @@ -91,7 +90,7 @@ func dcgmDeviceStartup() device.Device { return &a } -func (d *GPUDcgm) Init() error { +func (d *gpuDcgm) Init() error { if !d.libInited { if err := d.InitLib(); err != nil { klog.Errorf("failed to init lib: %v", err) @@ -126,7 +125,7 @@ func (d *GPUDcgm) Init() error { return nil } -func (d *GPUDcgm) InitLib() (err error) { +func (d *gpuDcgm) InitLib() (err error) { defer func() { if r := recover(); r != nil { err = fmt.Errorf("could not init dcgm: %v", r) @@ -147,7 +146,7 @@ func (d *GPUDcgm) InitLib() (err error) { klog.Info("Started DCGM in the Embedded mode ") } d.nvmlInited = false - d.devices = make(map[int]device.GPUDevice) + d.devs = make(map[int]GPUDevice) d.cleanup = cleanup dcgm.FieldsInit() @@ -169,8 +168,8 @@ func (d *GPUDcgm) InitLib() (err error) { return nil } -func (d *GPUDcgm) loadDevices() error { - d.devices = map[int]device.GPUDevice{} +func (d *gpuDcgm) loadDevices() error { + d.devs = map[int]GPUDevice{} count, err := nvml.DeviceGetCount() if err != nvml.SUCCESS { return fmt.Errorf("error getting GPUs: %s", nvml.ErrorString(err)) @@ -181,19 +180,19 @@ func (d *GPUDcgm) loadDevices() error { klog.Errorf("failed to get device handle for index %d: %v", gpuID, nvml.ErrorString(err)) continue } - dev := device.GPUDevice{ + dev := GPUDevice{ DeviceHandler: nvmlDeviceHandler, ID: gpuID, IsSubdevice: false, } - d.devices[gpuID] = dev + d.devs[gpuID] = dev } return nil } // LoadMIGDevices dynamically discover the MIG instances of all GPUs -func (d *GPUDcgm) LoadMIGDevices() { - d.migDevices = map[int]map[int]device.GPUDevice{} +func (d *gpuDcgm) LoadMIGDevices() { + d.migDevices = map[int]map[int]GPUDevice{} // find all GPUs and the MIG slices if they exist hierarchy, err := dcgm.GetGpuInstanceHierarchy() @@ -206,17 +205,18 @@ func (d *GPUDcgm) LoadMIGDevices() { // the bigger MIG profiles that a GPU can have to be used to calculate the SM ratio fullGPUProfile := profileInfos[0] - for _, entity := range hierarchy.EntityList { + for i := range hierarchy.EntityList { + entity := &hierarchy.EntityList[i] if entity.Entity.EntityGroupId != dcgm.FE_GPU && entity.Entity.EntityGroupId != dcgm.FE_GPU_I { continue } parentGPUID := int(entity.Parent.EntityId) - parentDevice := d.devices[parentGPUID] + parentDevice := d.devs[parentGPUID] // init migDevices if _, exit := d.migDevices[parentGPUID]; !exit { - d.migDevices[parentGPUID] = map[int]device.GPUDevice{} + d.migDevices[parentGPUID] = map[int]GPUDevice{} } // find MIG device handler @@ -234,7 +234,7 @@ func (d *GPUDcgm) LoadMIGDevices() { // add MIG device migNvmlEntityID := int(entity.Entity.EntityId) - d.migDevices[parentGPUID][migNvmlEntityID] = device.GPUDevice{ + d.migDevices[parentGPUID][migNvmlEntityID] = GPUDevice{ DeviceHandler: migDeviceHandler, ID: migNvmlEntityID, ParentID: parentGPUID, @@ -244,13 +244,13 @@ func (d *GPUDcgm) LoadMIGDevices() { } } -func (d *GPUDcgm) loadMIGProfiles() { - if len(d.devices) == 0 { +func (d *gpuDcgm) loadMIGProfiles() { + if len(d.devs) == 0 { klog.Errorln("DCGM has no GPU to monitor") return } profileInfos = map[int]nvml.GpuInstanceProfileInfo{} - for _, dev := range d.devices { + for _, dev := range d.devs { for i := 0; i < maxMIGProfiles; i++ { profileInfo, ret := dev.DeviceHandler.(nvml.Device).GetGpuInstanceProfileInfo(i) if ret != nvml.SUCCESS { @@ -264,36 +264,40 @@ func (d *GPUDcgm) loadMIGProfiles() { } } -func (d *GPUDcgm) Name() string { +func (d *gpuDcgm) Name() string { return dcgmType.String() } -func (d *GPUDcgm) DevType() device.DeviceType { +func (d *gpuDcgm) DevType() DeviceType { return dcgmType } -func (d *GPUDcgm) HwType() string { +func (d *gpuDcgm) HwType() string { return dcgmHwType } -func (d *GPUDcgm) IsDeviceCollectionSupported() bool { +func (d *gpuDcgm) IsDeviceCollectionSupported() bool { return d.collectionSupported } -func (d *GPUDcgm) SetDeviceCollectionSupported(supported bool) { +func (d *gpuDcgm) SetDeviceCollectionSupported(supported bool) { d.collectionSupported = supported } -func (d *GPUDcgm) Shutdown() bool { +func (d *gpuDcgm) Shutdown() bool { if d.nvmlInited { nvml.Shutdown() } dcgm.FieldsTerm() if d.deviceGroupName != "" { - dcgm.DestroyGroup(d.deviceGroupHandle) + if err := dcgm.DestroyGroup(d.deviceGroupHandle); err != nil { + klog.Errorf("failed to destroy group %v", err) + } } if d.fieldGroupName != "" { - dcgm.FieldGroupDestroy(d.fieldGroupHandle) + if err := dcgm.FieldGroupDestroy(d.fieldGroupHandle); err != nil { + klog.Errorf("failed to destroy field group %v", err) + } } if d.cleanup != nil { d.cleanup() @@ -304,9 +308,9 @@ func (d *GPUDcgm) Shutdown() bool { return true } -func (d *GPUDcgm) AbsEnergyFromDevice() []uint32 { +func (d *gpuDcgm) AbsEnergyFromDevice() []uint32 { gpuEnergy := []uint32{} - for _, dev := range d.devices { + for _, dev := range d.devs { power, ret := dev.DeviceHandler.(nvml.Device).GetPowerUsage() if ret != nvml.SUCCESS { klog.Errorf("failed to get power usage on device %v: %v\n", dev, nvml.ErrorString(ret)) @@ -320,20 +324,20 @@ func (d *GPUDcgm) AbsEnergyFromDevice() []uint32 { return gpuEnergy } -func (d *GPUDcgm) DevicesByID() map[int]any { - devices := make(map[int]any) - for id, dev := range d.devices { - devices[id] = dev +func (d *gpuDcgm) DevicesByID() map[int]any { + devs := make(map[int]any) + for id, dev := range d.devs { + devs[id] = dev } - return devices + return devs } -func (d *GPUDcgm) DevicesByName() map[string]any { - devices := make(map[string]any) - return devices +func (d *gpuDcgm) DevicesByName() map[string]any { + devs := make(map[string]any) + return devs } -func (d *GPUDcgm) DeviceInstances() map[int]map[int]any { +func (d *gpuDcgm) DeviceInstances() map[int]map[int]any { // LoadMIGDevices d.LoadMIGDevices() @@ -348,19 +352,19 @@ func (d *GPUDcgm) DeviceInstances() map[int]map[int]any { return devInstances } -func (d *GPUDcgm) DeviceUtilizationStats(dev any) (map[any]any, error) { +func (d *gpuDcgm) DeviceUtilizationStats(dev any) (map[any]any, error) { ds := make(map[any]any) // Process Accelerator Metrics return ds, nil } // ProcessResourceUtilizationPerDevice returns the GPU utilization per process. The gpuID can be a MIG instance or the main GPU -func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { - processAcceleratorMetrics := map[uint32]device.GPUProcessUtilizationSample{} +func (d *gpuDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { + processAcceleratorMetrics := map[uint32]GPUProcessUtilizationSample{} pam := make(map[uint32]any) // Check if the device is of type dev.GPUDevice and extract the DeviceHandler switch d := dev.(type) { - case device.GPUDevice: + case GPUDevice: if d.DeviceHandler == nil { return pam, nil } @@ -387,7 +391,7 @@ func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Durati } } } - processAcceleratorMetrics[p.Pid] = device.GPUProcessUtilizationSample{ + processAcceleratorMetrics[p.Pid] = GPUProcessUtilizationSample{ Pid: p.Pid, TimeStamp: uint64(time.Now().UnixNano()), // TODO: It does not make sense to use the whole GPU utilization since a GPU might have more than one PID @@ -406,10 +410,10 @@ func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Durati if val.FieldId == ratioFields { migUtilization := ToFloat64(val, 100) // ratio of active multiprocessors to total multiprocessors - // the MIG metrics represent the utilization of the MIG device. We need to normalize the metric to represent the overall GPU utilization + // the MIG metrics represent the utilization of the MIG We need to normalize the metric to represent the overall GPU utilization // FIXME: the MIG device could have multiple processes, such as using MPS, how to split the MIG utilization between the processes? gpuUtilization := migUtilization * d.MIGSMRatio - processAcceleratorMetrics[p.Pid] = device.GPUProcessUtilizationSample{ + processAcceleratorMetrics[p.Pid] = GPUProcessUtilizationSample{ Pid: p.Pid, TimeStamp: uint64(time.Now().UnixNano()), ComputeUtil: uint32(gpuUtilization), @@ -425,13 +429,13 @@ func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Durati return pam, nil default: - klog.Error("expected device.GPUDevice but got come other type") + klog.Error("expected GPUDevice but got come other type") return pam, errors.New("invalid device type") } } // helper functions -func (d *GPUDcgm) initNVML() error { +func (d *gpuDcgm) initNVML() error { if ret := nvml.Init(); ret != nvml.SUCCESS { d.collectionSupported = false d.Shutdown() @@ -440,7 +444,7 @@ func (d *GPUDcgm) initNVML() error { return nil } -func (d *GPUDcgm) createDeviceGroup() error { +func (d *gpuDcgm) createDeviceGroup() error { deviceGroupName := "dev-grp-" + time.Now().Format("2006-01-02-15-04-05") deviceGroup, err := dcgm.CreateGroup(deviceGroupName) if err != nil { @@ -452,8 +456,8 @@ func (d *GPUDcgm) createDeviceGroup() error { return nil } -func (d *GPUDcgm) addDevicesToGroup() error { - for gpuID := range d.devices { +func (d *gpuDcgm) addDevicesToGroup() error { + for gpuID := range d.devs { err := dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU, uint(gpuID)) if err != nil { klog.Errorf("failed to add device %d to group %q: %v", gpuID, d.deviceGroupName, err) @@ -468,7 +472,7 @@ func (d *GPUDcgm) addDevicesToGroup() error { return nil } -func (d *GPUDcgm) createFieldGroup() error { +func (d *gpuDcgm) createFieldGroup() error { fieldGroupName := "fld-grp-" + time.Now().Format("2006-01-02-15-04-05") fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields) if err != nil { @@ -479,7 +483,7 @@ func (d *GPUDcgm) createFieldGroup() error { return nil } -func (d *GPUDcgm) setupWatcher() error { +func (d *gpuDcgm) setupWatcher() error { // watch interval has an impact on cpu usage, set it carefully err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(1000)*1000, 0.0, 1) if err != nil { @@ -493,7 +497,6 @@ func (d *GPUDcgm) setupWatcher() error { func ToFloat64(value *dcgm.FieldValue_v1, multiplyFactor float64) float64 { defaultValue := float64(0) switch v := value.FieldType; v { - // Floating-point case dcgm.DCGM_FT_DOUBLE: switch v := value.Float64(); v { @@ -537,13 +540,3 @@ func ToFloat64(value *dcgm.FieldValue_v1, multiplyFactor float64) float64 { return defaultValue } } - -func nvmlErrorString(errno nvml.Return) string { - switch errno { - case nvml.SUCCESS: - return "SUCCESS" - case nvml.ERROR_LIBRARY_NOT_FOUND: - return "ERROR_LIBRARY_NOT_FOUND" - } - return fmt.Sprintf("Error %d", errno) -} diff --git a/pkg/sensors/accelerator/devices/device.go b/pkg/sensors/accelerator/devices/device.go index 7c564f0d60..e3b6f84a35 100644 --- a/pkg/sensors/accelerator/devices/device.go +++ b/pkg/sensors/accelerator/devices/device.go @@ -16,6 +16,7 @@ limitations under the License. package devices import ( + "errors" "sync" "time" @@ -140,13 +141,13 @@ func (r *Registry) GetAllDeviceTypes() []string { return devices } -func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, deviceStartup deviceStartupFunc) { +func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, deviceStartup deviceStartupFunc) error { switch accType { case config.GPU: // Check if device is already registered if existingDevice := registry.Registry[accType][dtype]; existingDevice != nil { klog.Errorf("Multiple Devices attempting to register with name %q", dtype.String()) - return + return errors.New("multiple Devices attempting to register with name") } if dtype == DCGM { @@ -155,7 +156,7 @@ func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, de } else if dtype == NVML { // Do not register "nvml" if "dcgm" is already registered if _, ok := registry.Registry[config.GPU][DCGM]; ok { - return + return errors.New("DCGM already registered. Skipping NVML") } } @@ -167,9 +168,11 @@ func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, de } klog.V(5).Infof("Registered %s", dtype) + + return nil } -// Startup initializes and returns a new Device according to the given DeviceType [NVML|DCGM|DUMMY|HABANA]. +// Startup initializes and returns a new Device according to the given DeviceType [NVML|DCGM|HABANA]. func Startup(a string) Device { // Retrieve the global registry registry := GetRegistry() diff --git a/pkg/sensors/accelerator/devices/habana.go b/pkg/sensors/accelerator/devices/habana.go index d93c884360..e03320a787 100644 --- a/pkg/sensors/accelerator/devices/habana.go +++ b/pkg/sensors/accelerator/devices/habana.go @@ -35,11 +35,11 @@ const ( ) var ( - habanaAccImpl = GPUHabana{} + habanaAccImpl = gpuHabana{} habanaType DeviceType ) -type GPUHabana struct { +type gpuHabana struct { collectionSupported bool devices map[int]interface{} } @@ -51,8 +51,11 @@ func habanaCheck(r *Registry) { } habanaType = HABANA klog.V(5).Infof("Register %s with device startup register", habanaType) - addDeviceInterface(r, habanaType, habanaAccType, habanaDeviceStartup) - klog.Infof("Using %s to obtain processor power", habanaAccImpl.Name()) + if err := addDeviceInterface(r, habanaType, habanaAccType, habanaDeviceStartup); err == nil { + klog.Infof("Using %s to obtain processor power", habanaAccImpl.Name()) + } else { + klog.V(5).Infof("Error registering habana: %v", err) + } } func habanaDeviceStartup() Device { @@ -66,26 +69,26 @@ func habanaDeviceStartup() Device { return &a } -func (g *GPUHabana) Name() string { +func (g *gpuHabana) Name() string { return habanaType.String() } -func (g *GPUHabana) DevType() DeviceType { +func (g *gpuHabana) DevType() DeviceType { return habanaType } -func (g *GPUHabana) HwType() string { +func (g *gpuHabana) HwType() string { return habanaAccType } -func (g *GPUHabana) InitLib() error { +func (g *gpuHabana) InitLib() error { if _, err := os.Stat(libhlmlpath); errors.Is(err, os.ErrNotExist) { return err } return nil } -func (g *GPUHabana) Init() error { +func (g *gpuHabana) Init() error { ret := hlml.Initialize() if ret != nil { klog.Error("ERROR initializing hlml") @@ -98,14 +101,14 @@ func (g *GPUHabana) Init() error { return ret } -func (g *GPUHabana) Shutdown() bool { +func (g *gpuHabana) Shutdown() bool { if ret := hlml.Shutdown(); ret != nil { return false } return true } -func (g *GPUHabana) AbsEnergyFromDevice() []uint32 { +func (g *gpuHabana) AbsEnergyFromDevice() []uint32 { gpuEnergy := []uint32{} for _, dev := range g.devices { power, ret := dev.(GPUDevice).DeviceHandler.(hlml.Device).PowerUsage() @@ -123,7 +126,7 @@ func (g *GPUHabana) AbsEnergyFromDevice() []uint32 { return gpuEnergy } -func (g *GPUHabana) DevicesByID() map[int]interface{} { +func (g *gpuHabana) DevicesByID() map[int]interface{} { // Get the count of available devices count, ret := hlml.DeviceCount() if ret != nil { @@ -146,30 +149,30 @@ func (g *GPUHabana) DevicesByID() map[int]interface{} { return devices } -func (g *GPUHabana) DevicesByName() map[string]any { +func (g *gpuHabana) DevicesByName() map[string]any { devices := make(map[string]interface{}) return devices } -func (g *GPUHabana) DeviceInstances() map[int]map[int]interface{} { +func (g *gpuHabana) DeviceInstances() map[int]map[int]interface{} { var devices map[int]map[int]interface{} return devices } -func (g *GPUHabana) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { +func (g *gpuHabana) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { ds := make(map[any]interface{}) // Process Accelerator Metrics return ds, nil } -func (g *GPUHabana) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]interface{}, error) { +func (g *gpuHabana) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]interface{}, error) { pam := make(map[uint32]interface{}) // Process Accelerator Metrics return pam, nil } -func (g *GPUHabana) IsDeviceCollectionSupported() bool { +func (g *gpuHabana) IsDeviceCollectionSupported() bool { return g.collectionSupported } -func (g *GPUHabana) SetDeviceCollectionSupported(supported bool) { +func (g *gpuHabana) SetDeviceCollectionSupported(supported bool) { g.collectionSupported = supported } diff --git a/pkg/sensors/accelerator/devices/mock_device.go b/pkg/sensors/accelerator/devices/mock_device.go index 8cb8329597..78c6204995 100644 --- a/pkg/sensors/accelerator/devices/mock_device.go +++ b/pkg/sensors/accelerator/devices/mock_device.go @@ -17,6 +17,8 @@ package devices import ( "time" + + "k8s.io/klog/v2" ) var ( @@ -31,7 +33,9 @@ type MockDevice struct { func RegisterMockDevice() { r := GetRegistry() - addDeviceInterface(r, mockDevice, mockDevice.String(), MockDeviceDeviceStartup) + if err := addDeviceInterface(r, mockDevice, mockDevice.String(), MockDeviceDeviceStartup); err != nil { + klog.Errorf("couldn't register mock device %v", err) + } } func MockDeviceDeviceStartup() Device { diff --git a/pkg/sensors/accelerator/devices/nvml.go b/pkg/sensors/accelerator/devices/nvml.go index f3f6e1bbfb..f9bf80d3ae 100644 --- a/pkg/sensors/accelerator/devices/nvml.go +++ b/pkg/sensors/accelerator/devices/nvml.go @@ -31,11 +31,11 @@ const ( ) var ( - nvmlAccImpl = GPUNvml{} + nvmlAccImpl = gpuNvml{} nvmlType DeviceType ) -type GPUNvml struct { +type gpuNvml struct { libInited bool collectionSupported bool devices map[int]GPUDevice // List of GPU identifiers for the device @@ -49,8 +49,11 @@ func nvmlCheck(r *Registry) { } klog.Info("Initializing nvml Successful") nvmlType = NVML - addDeviceInterface(r, nvmlType, nvmlHwType, nvmlDeviceStartup) - klog.Infof("Using %s to obtain processor power", nvmlAccImpl.Name()) + if err := addDeviceInterface(r, nvmlType, nvmlHwType, nvmlDeviceStartup); err == nil { + klog.Infof("Using %s to obtain processor power", nvmlAccImpl.Name()) + } else { + klog.V(5).Infof("Error registering nvml: %v", err) + } } func nvmlDeviceStartup() Device { @@ -67,30 +70,30 @@ func nvmlDeviceStartup() Device { return &a } -func (n *GPUNvml) Name() string { +func (n *gpuNvml) Name() string { return nvmlType.String() } -func (n *GPUNvml) DevType() DeviceType { +func (n *gpuNvml) DevType() DeviceType { return nvmlType } -func (n *GPUNvml) HwType() string { +func (n *gpuNvml) HwType() string { return nvmlHwType } -func (n *GPUNvml) IsDeviceCollectionSupported() bool { +func (n *gpuNvml) IsDeviceCollectionSupported() bool { return n.collectionSupported } -func (n *GPUNvml) SetDeviceCollectionSupported(supported bool) { +func (n *gpuNvml) SetDeviceCollectionSupported(supported bool) { n.collectionSupported = supported } // Init initizalize and start the GPU metric collector // the nvml only works if the container has support to GPU, e.g., it is using nvidia-docker2 // otherwise it will fail to load the libnvidia-ml.so.1 -func (n *GPUNvml) InitLib() (err error) { +func (n *gpuNvml) InitLib() (err error) { defer func() { if r := recover(); r != nil { err = fmt.Errorf("could not init nvml: %v", r) @@ -105,7 +108,7 @@ func (n *GPUNvml) InitLib() (err error) { return nil } -func (n *GPUNvml) Init() (err error) { +func (n *gpuNvml) Init() (err error) { if !n.libInited { if err := n.InitLib(); err != nil { return err @@ -144,13 +147,13 @@ func (n *GPUNvml) Init() (err error) { } // Shutdown stops the GPU metric collector -func (n *GPUNvml) Shutdown() bool { +func (n *gpuNvml) Shutdown() bool { n.libInited = false return nvml.Shutdown() == nvml.SUCCESS } // GetGpus returns a map with gpu device -func (n *GPUNvml) DevicesByID() map[int]interface{} { +func (n *gpuNvml) DevicesByID() map[int]interface{} { devices := make(map[int]interface{}) for id, dev := range n.devices { devices[id] = dev @@ -158,18 +161,18 @@ func (n *GPUNvml) DevicesByID() map[int]interface{} { return devices } -func (n *GPUNvml) DevicesByName() map[string]any { +func (n *gpuNvml) DevicesByName() map[string]any { devices := make(map[string]interface{}) return devices } -func (n *GPUNvml) DeviceInstances() map[int]map[int]interface{} { +func (n *gpuNvml) DeviceInstances() map[int]map[int]interface{} { var devices map[int]map[int]interface{} return devices } // GetAbsEnergyFromGPU returns a map with mJ in each gpu device -func (n *GPUNvml) AbsEnergyFromDevice() []uint32 { +func (n *gpuNvml) AbsEnergyFromDevice() []uint32 { gpuEnergy := []uint32{} for _, dev := range n.devices { power, ret := dev.DeviceHandler.(nvml.Device).GetPowerUsage() @@ -185,7 +188,7 @@ func (n *GPUNvml) AbsEnergyFromDevice() []uint32 { return gpuEnergy } -func (n *GPUNvml) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { +func (n *gpuNvml) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { ds := make(map[any]interface{}) // Process Accelerator Metrics return ds, nil } @@ -193,7 +196,7 @@ func (n *GPUNvml) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { // GetProcessResourceUtilization returns a map of GPUProcessUtilizationSample where the key is the process pid // GPUProcessUtilizationSample.SmUtil represents the process Streaming Multiprocessors - SM (3D/Compute) utilization in percentage. // GPUProcessUtilizationSample.MemUtil represents the process Frame Buffer Memory utilization Value. -func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { +func (n *gpuNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { processAcceleratorMetrics := map[uint32]GPUProcessUtilizationSample{} pam := make(map[uint32]interface{}) lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000) @@ -229,7 +232,7 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati } if !n.processUtilizationSupported { // If processUtilizationSupported is false, try deviceGetMPSComputeRunningProcesses_v3 to use memory usage to ratio power usage - config.GPUUsageMetric = config.GPUMemUtilization + config.SetGPUUsageMetric(config.GPUMemUtilization) processInfo, ret := d.DeviceHandler.(nvml.Device).GetComputeRunningProcesses() if ret != nvml.SUCCESS { if ret == nvml.ERROR_NOT_FOUND {