diff --git a/Makefile b/Makefile index acae4461b4..be53e1eef2 100644 --- a/Makefile +++ b/Makefile @@ -43,14 +43,8 @@ CTR_CMD_PUSH_OPTIONS ?= GENERAL_TAGS := include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo GPU_TAGS := -ifeq ($(shell ldconfig -p | grep -q libnvml_injection.so && echo exists),exists) - GPU_TAGS := nvml -endif -ifeq ($(shell ldconfig -p | grep -q libdcgm.so && echo exists),exists) - GPU_TAGS := dcgm -endif ifeq ($(shell ldconfig -p | grep -q libhlml.so && echo exists),exists) - GPU_TAGS := habana + GPU_TAGS := habana endif # set GOENV diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go index ac5b50f02e..2687a2f13f 100644 --- a/cmd/exporter/exporter.go +++ b/cmd/exporter/exporter.go @@ -148,7 +148,7 @@ func main() { if config.EnabledGPU() { r := accelerator.GetRegistry() - if a, err := accelerator.New(accelerator.GPU, true); err == nil { + if a, err := accelerator.New(config.GPU, true); err == nil { r.MustRegister(a) // Register the accelerator with the registry } else { klog.Errorf("failed to init GPU accelerators: %v", err) diff --git a/pkg/collector/energy/node_energy_collector.go b/pkg/collector/energy/node_energy_collector.go index d30107ae41..763dc6a6bb 100644 --- a/pkg/collector/energy/node_energy_collector.go +++ b/pkg/collector/energy/node_energy_collector.go @@ -67,7 +67,7 @@ func UpdateNodeComponentsEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) func UpdateNodeGPUEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) { defer wg.Done() if config.EnabledGPU() { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { gpuEnergy := gpu.Device().AbsEnergyFromDevice() for gpu, energy := range gpuEnergy { nodeStats.EnergyUsage[config.AbsEnergyInGPU].SetDeltaStat(fmt.Sprintf("%d", gpu), uint64(energy)) diff --git a/pkg/collector/metric_collector.go b/pkg/collector/metric_collector.go index aa030c9ec9..cc704a2699 100644 --- a/pkg/collector/metric_collector.go +++ b/pkg/collector/metric_collector.go @@ -159,7 +159,7 @@ func (c *Collector) updateProcessResourceUtilizationMetrics(wg *sync.WaitGroup) // we first updates the bpf which is responsible to include new processes in the ProcessStats collection resourceBpf.UpdateProcessBPFMetrics(c.bpfExporter, c.ProcessStats) if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { accelerator.UpdateProcessGPUUtilizationMetrics(c.ProcessStats) } } diff --git a/pkg/collector/metric_collector_test.go b/pkg/collector/metric_collector_test.go index 7280af9a4e..e2c6e973a7 100644 --- a/pkg/collector/metric_collector_test.go +++ b/pkg/collector/metric_collector_test.go @@ -16,7 +16,7 @@ import ( ) func newMockCollector(mockAttacher bpf.Exporter) *Collector { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { d := gpu.Device() err := d.Init() // create structure instances that will be accessed to create a containerMetric Expect(err).NotTo(HaveOccurred()) diff --git a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go index 79d0de133a..5bdf4c2b7c 100644 --- a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go +++ b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go @@ -26,7 +26,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/libvirt" acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator" - dev "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" + dev "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/devices" "k8s.io/klog/v2" "github.com/sustainable-computing-io/kepler/pkg/utils" @@ -43,7 +43,7 @@ var ( // UpdateProcessGPUUtilizationMetrics reads the GPU metrics of each process using the GPU func UpdateProcessGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats) { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { d := gpu.Device() migDevices := d.DeviceInstances() for _, _device := range d.DevicesByID() { diff --git a/pkg/collector/stats/node_stats.go b/pkg/collector/stats/node_stats.go index d50ed1fa21..7ab80fe5ba 100644 --- a/pkg/collector/stats/node_stats.go +++ b/pkg/collector/stats/node_stats.go @@ -51,7 +51,7 @@ func (ne *NodeStats) ResetDeltaValues() { func (ne *NodeStats) UpdateIdleEnergyWithMinValue(isComponentsSystemCollectionSupported bool) { // gpu metric if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { ne.CalcIdleEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.GPUComputeUtilization) } } diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go index c9d6b85862..2217345803 100644 --- a/pkg/collector/stats/stats.go +++ b/pkg/collector/stats/stats.go @@ -82,7 +82,7 @@ func NewStats() *Stats { } if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { stats.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection() stats.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() stats.ResourceUsage[config.IdleEnergyInGPU] = types.NewUInt64StatCollection() @@ -141,7 +141,7 @@ func (s *Stats) UpdateDynEnergy() { } // GPU metric if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { for gpuID := range s.EnergyUsage[config.AbsEnergyInGPU] { s.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID) } diff --git a/pkg/collector/stats/test_utils.go b/pkg/collector/stats/test_utils.go index ddb623d442..dd14bbf0ab 100644 --- a/pkg/collector/stats/test_utils.go +++ b/pkg/collector/stats/test_utils.go @@ -32,7 +32,7 @@ const ( // TODO: do not use a fixed usageMetric array in the power models, a structured data is more disarable. func SetMockedCollectorMetrics() { config.GetConfig() - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { err := gpu.Device().Init() // create structure instances that will be accessed to create a processMetric klog.Fatalln(err) } diff --git a/pkg/collector/stats/utils.go b/pkg/collector/stats/utils.go index ad07ad22d3..5c9158e8da 100644 --- a/pkg/collector/stats/utils.go +++ b/pkg/collector/stats/utils.go @@ -31,7 +31,7 @@ func GetProcessFeatureNames() []string { // gpu metric if config.EnabledGPU() { - if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { + if acc.GetActiveAcceleratorByType(config.GPU) != nil { gpuMetrics := []string{config.GPUComputeUtilization, config.GPUMemUtilization} metrics = append(metrics, gpuMetrics...) klog.V(3).Infof("Available GPU metrics: %v", gpuMetrics) diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go index aeb68a899f..9f76994e96 100644 --- a/pkg/metrics/metricfactory/metric_factory.go +++ b/pkg/metrics/metricfactory/metric_factory.go @@ -36,7 +36,7 @@ func EnergyMetricsPromDesc(context string) (descriptions map[string]*prometheus. // set the default source to trained power model source := modeltypes.TrainedPowerModelSource if strings.Contains(name, config.GPU) { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { source = gpu.Device().Name() } } else if strings.Contains(name, config.PLATFORM) && platform.IsSystemCollectionSupported() { @@ -87,7 +87,7 @@ func SCMetricsPromDesc(context string, bpfSupportedMetrics bpf.SupportedMetrics) func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) { descriptions = make(map[string]*prometheus.Desc) if config.EnabledGPU() { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { for _, name := range consts.GPUMetricNames { descriptions[name] = resMetricsPromDesc(context, name, gpu.Device().Name()) } diff --git a/pkg/metrics/prometheus_collector_test.go b/pkg/metrics/prometheus_collector_test.go index abc277d19f..822fe3efa2 100644 --- a/pkg/metrics/prometheus_collector_test.go +++ b/pkg/metrics/prometheus_collector_test.go @@ -33,6 +33,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/bpf" "github.com/sustainable-computing-io/kepler/pkg/collector" "github.com/sustainable-computing-io/kepler/pkg/collector/stats" + "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/model" acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator" @@ -63,7 +64,7 @@ var _ = Describe("Test Prometheus Collector Unit", func() { // we need to disable the system real time power metrics for testing since we add mock values or use power model estimator components.SetIsSystemCollectionSupported(false) platform.SetIsSystemCollectionSupported(false) - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { err := gpu.Device().Init() // create structure instances that will be accessed to create a containerMetric Expect(err).NotTo(HaveOccurred()) } diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go index 9d66ee1515..ee52a4f03a 100644 --- a/pkg/metrics/utils/utils.go +++ b/pkg/metrics/utils/utils.go @@ -58,7 +58,7 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac CollectResUtil(ch, instance, collectorName, collectors[collectorName]) } if config.EnabledGPU() { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { for _, collectorName := range consts.GPUMetricNames { CollectResUtil(ch, instance, collectorName, collectors[collectorName]) } diff --git a/pkg/model/process_energy.go b/pkg/model/process_energy.go index 824016c367..19857919af 100644 --- a/pkg/model/process_energy.go +++ b/pkg/model/process_energy.go @@ -201,7 +201,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta } // estimate the associated power consumption of GPU for each process if config.EnabledGPU() { - if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { + if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { processGPUPower, errGPU = processComponentPowerModel.GetGPUPower(isIdlePower) if errGPU != nil { klog.V(5).Infoln("Could not estimate the Process GPU Power") diff --git a/pkg/sensors/accelerator/accelerator.go b/pkg/sensors/accelerator/accelerator.go index f0a77b8fc1..db2f4a618c 100644 --- a/pkg/sensors/accelerator/accelerator.go +++ b/pkg/sensors/accelerator/accelerator.go @@ -20,11 +20,8 @@ import ( "time" "github.com/pkg/errors" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" + "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/devices" "k8s.io/klog/v2" - - // Add supported devices. - _ "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device/sources" ) var ( @@ -32,45 +29,30 @@ var ( once sync.Once ) -const ( - DUMMY AcceleratorType = iota - GPU - // Add other accelerator types here [IPU|DPU|...] -) - -type AcceleratorType int - -func (a AcceleratorType) String() string { - return [...]string{"DUMMY", "GPU"}[a] -} - // Accelerator represents an implementation of... equivalent Accelerator device. type Accelerator interface { // Device returns an underlying accelerator device implementation... - Device() device.Device + Device() devices.Device // IsRunning returns whether or not that device is running IsRunning() bool - // AccType returns the accelerator type. - AccType() AcceleratorType // stop stops an accelerator and unregisters it stop() } type accelerator struct { - dev device.Device // Device Accelerator Interface - accType AcceleratorType + dev devices.Device // Device Accelerator Interface running bool } type Registry struct { - Registry map[AcceleratorType]Accelerator + Registry map[string]Accelerator } // Registry gets the default device Registry instance func GetRegistry() *Registry { once.Do(func() { globalRegistry = &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } }) return globalRegistry @@ -84,27 +66,27 @@ func SetRegistry(registry *Registry) { } func (r *Registry) MustRegister(a Accelerator) { - _, ok := r.Registry[a.AccType()] + _, ok := r.Registry[a.Device().HwType()] if ok { - klog.V(5).Infof("Accelerator with type %s already exists", a.AccType()) + klog.V(5).Infof("Accelerator with type %s already exists", a.Device().HwType()) return } - r.Registry[a.AccType()] = a + r.Registry[a.Device().HwType()] = a } func (r *Registry) Unregister(a Accelerator) bool { - _, exists := r.Registry[a.AccType()] + _, exists := r.Registry[a.Device().HwType()] if exists { - delete(r.Registry, a.AccType()) + delete(r.Registry, a.Device().HwType()) return true } - klog.Errorf("Accelerator with type %s doesn't exist", a.AccType()) + klog.Errorf("Accelerator with type %s doesn't exist", a.Device().HwType()) return false } // Devices returns a map of supported accelerators. -func (r *Registry) Accelerators() map[device.DeviceType]Accelerator { - acc := map[device.DeviceType]Accelerator{} +func (r *Registry) accelerators() map[string]Accelerator { + acc := map[string]Accelerator{} if len(r.Registry) == 0 { // No accelerators found @@ -115,7 +97,7 @@ func (r *Registry) Accelerators() map[device.DeviceType]Accelerator { if a.IsRunning() { d := a.Device() if d.IsDeviceCollectionSupported() { - acc[d.DevType()] = a + acc[d.HwType()] = a } } } @@ -124,9 +106,15 @@ func (r *Registry) Accelerators() map[device.DeviceType]Accelerator { } // ActiveAcceleratorByType returns a map of supported accelerators based on the specified type... -func (r *Registry) ActiveAcceleratorByType(t AcceleratorType) Accelerator { +func (r *Registry) activeAcceleratorByType(t string) Accelerator { + if len(r.Registry) == 0 { + // No accelerators found + klog.V(5).Infof("No accelerators found") + return nil + } + for _, a := range r.Registry { - if a.AccType() == t && a.IsRunning() { + if a.Device().HwType() == t && a.IsRunning() { d := a.Device() if d.IsDeviceCollectionSupported() { return a @@ -136,56 +124,48 @@ func (r *Registry) ActiveAcceleratorByType(t AcceleratorType) Accelerator { return nil } -func New(atype AcceleratorType, sleep bool) (Accelerator, error) { - var numDevs int +func New(atype string, sleep bool) (Accelerator, error) { + var d devices.Device maxDeviceInitRetry := 10 - var d device.Device - switch atype { - case GPU: - numDevs = len(device.GetAllDeviceTypes()) - default: - return nil, errors.New("unsupported accelerator") + // Init the available devices. + + devs := devices.GetRegistry().GetAllDeviceTypes() + numDevs := len(devs) + if numDevs == 0 || !slices.Contains(devs, atype) { + return nil, errors.New("no devices found") } - if numDevs != 0 { - devices := device.GetAllDeviceTypes() - if !slices.Contains(devices, atype.String()) { - klog.Errorf("%v doesn't contain %s", devices, atype.String()) - return nil, errors.New("no devices found") - } - klog.V(5).Infof("Initializing the Accelerator of type %v", atype.String()) - - for i := 0; i < maxDeviceInitRetry; i++ { - if d = device.Startup(atype.String()); d == nil { - klog.Errorf("Could not init the %s device going to try again", atype.String()) - if sleep { - // The GPU operators typically takes longer time to initialize than kepler resulting in error to start the gpu driver - // therefore, we wait up to 1 min to allow the gpu operator initialize - time.Sleep(6 * time.Second) - } - continue + klog.V(5).Infof("Initializing the Accelerator of type %v", atype) + + for i := 0; i < maxDeviceInitRetry; i++ { + if d = devices.Startup(atype); d == nil { + klog.Errorf("Could not init the %s device going to try again", atype) + if sleep { + // The GPU operators typically takes longer time to initialize than kepler resulting in error to start the gpu driver + // therefore, we wait up to 1 min to allow the gpu operator initialize + time.Sleep(6 * time.Second) } - klog.V(5).Infof("Startup %s Accelerator successful", atype.String()) - break + continue } - } else { - return nil, errors.New("No Accelerator devices found") + klog.V(5).Infof("Startup %s Accelerator successful", atype) + break } return &accelerator{ dev: d, running: true, - accType: atype, }, nil } func Shutdown() { - if accelerators := GetRegistry().Accelerators(); accelerators != nil { + if accelerators := GetRegistry().accelerators(); accelerators != nil { for _, a := range accelerators { - klog.V(5).Infof("Shutting down %s", a.AccType().String()) + klog.V(5).Infof("Shutting down %s", a.Device().DevType()) a.stop() } + } else { + klog.V(5).Info("No devices to shutdown") } } @@ -205,16 +185,24 @@ func (a *accelerator) stop() { } // Device returns an accelerator interface -func (a *accelerator) Device() device.Device { +func (a *accelerator) Device() devices.Device { return a.dev } // DeviceType returns the accelerator's underlying device type -func (a *accelerator) AccType() AcceleratorType { - return a.accType +func (a *accelerator) DevType() string { + return a.dev.DevType().String() } // IsRunning returns the running status of an accelerator func (a *accelerator) IsRunning() bool { return a.running } + +func GetActiveAcceleratorByType(t string) Accelerator { + return GetRegistry().activeAcceleratorByType(t) +} + +func GetAccelerators() map[string]Accelerator { + return GetRegistry().accelerators() +} diff --git a/pkg/sensors/accelerator/accelerator_test.go b/pkg/sensors/accelerator/accelerator_test.go index 30d8575cf2..59f1ed96f2 100644 --- a/pkg/sensors/accelerator/accelerator_test.go +++ b/pkg/sensors/accelerator/accelerator_test.go @@ -1,17 +1,28 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package accelerator import ( "testing" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" - - // Add supported devices. - - _ "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device/sources" + "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/devices" ) -func newMockDevice() device.Device { - return device.Startup(AcceleratorType(0).String()) +func newMockDevice() devices.Device { + return devices.Startup(devices.MOCK.String()) } func cleanupMockDevice() { @@ -30,7 +41,7 @@ func TestRegistry(t *testing.T) { name: "Empty registry", setup: func() *Registry { registry := Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } SetRegistry(®istry) @@ -38,19 +49,18 @@ func TestRegistry(t *testing.T) { }, expectedLen: 0, expectError: false, - cleanup: func() {}, + cleanup: func() { cleanupMockDevice() }, }, { name: "Non-empty registry", setup: func() *Registry { registry := &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } SetRegistry(registry) - + devices.RegisterMockDevice() a := &accelerator{ dev: newMockDevice(), - accType: AcceleratorType(0), running: true, } registry.MustRegister(a) @@ -68,10 +78,10 @@ func TestRegistry(t *testing.T) { var a Accelerator var err error registry := tt.setup() - if a, err = New(AcceleratorType(0), true); err == nil { + if a, err = New("MOCK", true); err == nil { registry.MustRegister(a) // Register the accelerator with the registry } - accs := registry.Accelerators() + accs := GetAccelerators() if tt.expectError && err == nil { t.Errorf("expected an error but got nil") } @@ -90,7 +100,6 @@ func TestActiveAcceleratorByType(t *testing.T) { tests := []struct { name string setup func() *Registry - accType AcceleratorType expectError bool cleanup func() }{ @@ -98,53 +107,37 @@ func TestActiveAcceleratorByType(t *testing.T) { name: "No accelerators of given type", setup: func() *Registry { return &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } }, - accType: AcceleratorType(0), expectError: true, - cleanup: func() {}, + cleanup: func() { cleanupMockDevice() }, }, { name: "One active accelerator of given type", setup: func() *Registry { registry := &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } - registry.Registry[AcceleratorType(0)] = &accelerator{ + SetRegistry(registry) + devices.RegisterMockDevice() + a := &accelerator{ dev: newMockDevice(), - accType: AcceleratorType(0), running: true, } - return registry + registry.MustRegister(a) + + return GetRegistry() }, - accType: AcceleratorType(0), expectError: false, cleanup: func() { cleanupMockDevice() }, }, - { - name: "One inactive accelerator of given type", - setup: func() *Registry { - registry := &Registry{ - Registry: map[AcceleratorType]Accelerator{}, - } - registry.Registry[AcceleratorType(0)] = &accelerator{ - dev: newMockDevice(), - accType: AcceleratorType(0), - running: false, - } - return registry - }, - accType: AcceleratorType(0), - expectError: true, - cleanup: func() {}, - }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - registry := tt.setup() - accs := registry.ActiveAcceleratorByType(tt.accType) + tt.setup() + accs := GetActiveAcceleratorByType("MOCK") if tt.expectError && accs != nil { t.Errorf("expected an error") } @@ -159,7 +152,7 @@ func TestActiveAcceleratorByType(t *testing.T) { func TestCreateAndRegister(t *testing.T) { tests := []struct { name string - accType AcceleratorType + accType string setup func() *Registry sleep bool expectError bool @@ -167,15 +160,15 @@ func TestCreateAndRegister(t *testing.T) { }{ { name: "Unsupported accelerator", - accType: AcceleratorType(999), // invalid accelerator type + accType: "UNSUPPORTED", // invalid accelerator type setup: func() *Registry { return &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } }, sleep: false, expectError: true, - cleanup: func() {}, + cleanup: func() { cleanupMockDevice() }, }, } @@ -197,37 +190,34 @@ func TestCreateAndRegister(t *testing.T) { func TestShutdown(t *testing.T) { tests := []struct { - name string - setup func() *Registry - accType AcceleratorType + name string + setup func() *Registry }{ { name: "Shutdown active accelerators", setup: func() *Registry { registry := &Registry{ - Registry: map[AcceleratorType]Accelerator{}, + Registry: map[string]Accelerator{}, } SetRegistry(registry) + devices.RegisterMockDevice() a := &accelerator{ dev: newMockDevice(), - accType: AcceleratorType(0), running: true, } registry.MustRegister(a) return GetRegistry() }, - accType: AcceleratorType(0), }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - registry := tt.setup() - SetRegistry(registry) + tt.setup() Shutdown() - accs := GetRegistry().Accelerators() + accs := GetAccelerators() for _, a := range accs { if a.IsRunning() { t.Errorf("expected accelerator to be stopped but it is still running") @@ -238,23 +228,30 @@ func TestShutdown(t *testing.T) { } func TestAcceleratorMethods(t *testing.T) { - devType := device.DeviceType(0) + registry := &Registry{ + Registry: map[string]Accelerator{}, + } + + SetRegistry(registry) + + devices.RegisterMockDevice() + acc := &accelerator{ dev: newMockDevice(), running: true, - accType: AcceleratorType(0), } + registry.MustRegister(acc) + + devType := acc.dev.HwType() - if got := acc.Device(); got.DevType() != devType { + if got := acc.Device(); got.HwType() != devType { t.Errorf("expected device type %v, got %v", devType, got.DevType()) } - if got := acc.Device().DevType(); got != devType { + if got := acc.Device().HwType(); got != devType { t.Errorf("expected device type %v, got %v", devType, got) } if got := acc.IsRunning(); !got { t.Errorf("expected accelerator to be running, got %v", got) } - if got := acc.AccType(); got != AcceleratorType(0) { - t.Errorf("expected accelerator type AcceleratorType(0), got %v", got) - } + Shutdown() } diff --git a/pkg/sensors/accelerator/device/sources/dummy.go b/pkg/sensors/accelerator/device/sources/dummy.go deleted file mode 100644 index 40199e25ee..0000000000 --- a/pkg/sensors/accelerator/device/sources/dummy.go +++ /dev/null @@ -1,120 +0,0 @@ -//go:build !habana || !nvml || !dcgm -// +build !habana !nvml !dcgm - -/* -Copyright 2024. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package sources - -import ( - "time" - - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" -) - -var ( - dummyDevice device.DeviceType -) - -type Dummy struct { - dummyDevice device.DeviceType - name string - collectionSupported bool -} - -func init() { - dummyDevice = device.DUMMY - device.AddDeviceInterface(dummyDevice, dummyDevice.String(), DummyDeviceStartup) -} - -func DummyDeviceStartup() device.Device { - d := Dummy{ - dummyDevice: dummyDevice, - name: dummyDevice.String(), - collectionSupported: true, - } - - return &d -} - -func (d *Dummy) Name() string { - return d.name -} - -func (d *Dummy) DevType() device.DeviceType { - return d.dummyDevice -} - -func (d *Dummy) HwType() string { - return d.dummyDevice.String() -} - -func (d *Dummy) InitLib() error { - return nil -} - -func (d *Dummy) Init() error { - return nil -} - -func (d *Dummy) Shutdown() bool { - return true -} - -func (d *Dummy) AbsEnergyFromDevice() []uint32 { - return nil -} - -func (d *Dummy) DevicesByID() map[int]any { - return nil -} - -func (d *Dummy) DevicesByName() map[string]any { - return nil -} - -func (d *Dummy) DeviceInstances() map[int]map[int]any { - return nil -} - -func (d *Dummy) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { - ds := make(map[any]interface{}) // Process Accelerator Metrics - return ds, nil -} - -func (d *Dummy) ProcessResourceUtilizationPerDevice(dev any, _ time.Duration) (map[uint32]any, error) { - processAcceleratorMetrics := map[uint32]device.GPUProcessUtilizationSample{} - pam := make(map[uint32]interface{}) - processAcceleratorMetrics[0] = device.GPUProcessUtilizationSample{ - Pid: 0, - TimeStamp: uint64(time.Now().UnixNano()), - ComputeUtil: 10, - MemUtil: 10, - EncUtil: 10, - DecUtil: 10, - } - for k, v := range processAcceleratorMetrics { - pam[k] = v - } - return pam, nil -} - -func (d *Dummy) IsDeviceCollectionSupported() bool { - return d.collectionSupported -} - -func (d *Dummy) SetDeviceCollectionSupported(supported bool) { - d.collectionSupported = supported -} diff --git a/pkg/sensors/accelerator/device/sources/dcgm.go b/pkg/sensors/accelerator/devices/dcgm.go similarity index 81% rename from pkg/sensors/accelerator/device/sources/dcgm.go rename to pkg/sensors/accelerator/devices/dcgm.go index d3e9c5fb4e..95ca57e06d 100644 --- a/pkg/sensors/accelerator/device/sources/dcgm.go +++ b/pkg/sensors/accelerator/devices/dcgm.go @@ -1,6 +1,3 @@ -//go:build dcgm -// +build dcgm - /* Copyright 2024. @@ -17,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package sources +package devices import ( "errors" @@ -27,7 +24,6 @@ import ( "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/sustainable-computing-io/kepler/pkg/config" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" "k8s.io/klog/v2" ) @@ -35,24 +31,24 @@ const ( debugLevel = 5 isSocket = "0" maxMIGProfiles = 15 // use a large number since the profile ids are not linear - dcgmHwType = "GPU" + dcgmHwType = config.GPU ) var ( - dcgmAccImpl = GPUDcgm{} + dcgmAccImpl = gpuDcgm{} deviceFields []dcgm.Short = []dcgm.Short{ // https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.htm dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, } ratioFields uint = dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE // this is the field that we will use to calculate the utilization per @yuezhu1 profileInfos map[int]nvml.GpuInstanceProfileInfo - dcgmType device.DeviceType + dcgmType DeviceType ) -type GPUDcgm struct { +type gpuDcgm struct { collectionSupported bool - devices map[int]device.GPUDevice - migDevices map[int]map[int]device.GPUDevice // list of mig devices for each GPU instance + devs map[int]GPUDevice + migDevices map[int]map[int]GPUDevice // list of mig devices for each GPU instance libInited bool nvmlInited bool deviceGroupName string @@ -62,18 +58,21 @@ type GPUDcgm struct { cleanup func() } -func init() { +func dcgmCheck(r *Registry) { if _, err := dcgm.Init(dcgm.Standalone, config.DCGMHostEngineEndpoint(), isSocket); err != nil { - klog.Errorf("Error initializing dcgm: %v", err) + klog.V(5).Infof("Error initializing dcgm: %v", err) return } klog.Info("Initializing dcgm Successful") - dcgmType = device.DCGM - device.AddDeviceInterface(dcgmType, dcgmHwType, dcgmDeviceStartup) - klog.Infof("Using %s to obtain processor power", dcgmAccImpl.Name()) + dcgmType = DCGM + if err := addDeviceInterface(r, dcgmType, dcgmHwType, dcgmDeviceStartup); err == nil { + klog.Infof("Using %s to obtain processor power", dcgmAccImpl.Name()) + } else { + klog.V(5).Infof("Error registering DCGM: %v", err) + } } -func dcgmDeviceStartup() device.Device { +func dcgmDeviceStartup() Device { a := dcgmAccImpl if err := a.InitLib(); err != nil { @@ -91,7 +90,7 @@ func dcgmDeviceStartup() device.Device { return &a } -func (d *GPUDcgm) Init() error { +func (d *gpuDcgm) Init() error { if !d.libInited { if err := d.InitLib(); err != nil { klog.Errorf("failed to init lib: %v", err) @@ -126,7 +125,7 @@ func (d *GPUDcgm) Init() error { return nil } -func (d *GPUDcgm) InitLib() (err error) { +func (d *gpuDcgm) InitLib() (err error) { defer func() { if r := recover(); r != nil { err = fmt.Errorf("could not init dcgm: %v", r) @@ -147,7 +146,7 @@ func (d *GPUDcgm) InitLib() (err error) { klog.Info("Started DCGM in the Embedded mode ") } d.nvmlInited = false - d.devices = make(map[int]device.GPUDevice) + d.devs = make(map[int]GPUDevice) d.cleanup = cleanup dcgm.FieldsInit() @@ -169,8 +168,8 @@ func (d *GPUDcgm) InitLib() (err error) { return nil } -func (d *GPUDcgm) loadDevices() error { - d.devices = map[int]device.GPUDevice{} +func (d *gpuDcgm) loadDevices() error { + d.devs = map[int]GPUDevice{} count, err := nvml.DeviceGetCount() if err != nvml.SUCCESS { return fmt.Errorf("error getting GPUs: %s", nvml.ErrorString(err)) @@ -181,19 +180,19 @@ func (d *GPUDcgm) loadDevices() error { klog.Errorf("failed to get device handle for index %d: %v", gpuID, nvml.ErrorString(err)) continue } - dev := device.GPUDevice{ + dev := GPUDevice{ DeviceHandler: nvmlDeviceHandler, ID: gpuID, IsSubdevice: false, } - d.devices[gpuID] = dev + d.devs[gpuID] = dev } return nil } // LoadMIGDevices dynamically discover the MIG instances of all GPUs -func (d *GPUDcgm) LoadMIGDevices() { - d.migDevices = map[int]map[int]device.GPUDevice{} +func (d *gpuDcgm) LoadMIGDevices() { + d.migDevices = map[int]map[int]GPUDevice{} // find all GPUs and the MIG slices if they exist hierarchy, err := dcgm.GetGpuInstanceHierarchy() @@ -206,17 +205,18 @@ func (d *GPUDcgm) LoadMIGDevices() { // the bigger MIG profiles that a GPU can have to be used to calculate the SM ratio fullGPUProfile := profileInfos[0] - for _, entity := range hierarchy.EntityList { + for i := range hierarchy.EntityList { + entity := &hierarchy.EntityList[i] if entity.Entity.EntityGroupId != dcgm.FE_GPU && entity.Entity.EntityGroupId != dcgm.FE_GPU_I { continue } parentGPUID := int(entity.Parent.EntityId) - parentDevice := d.devices[parentGPUID] + parentDevice := d.devs[parentGPUID] // init migDevices if _, exit := d.migDevices[parentGPUID]; !exit { - d.migDevices[parentGPUID] = map[int]device.GPUDevice{} + d.migDevices[parentGPUID] = map[int]GPUDevice{} } // find MIG device handler @@ -234,7 +234,7 @@ func (d *GPUDcgm) LoadMIGDevices() { // add MIG device migNvmlEntityID := int(entity.Entity.EntityId) - d.migDevices[parentGPUID][migNvmlEntityID] = device.GPUDevice{ + d.migDevices[parentGPUID][migNvmlEntityID] = GPUDevice{ DeviceHandler: migDeviceHandler, ID: migNvmlEntityID, ParentID: parentGPUID, @@ -244,13 +244,13 @@ func (d *GPUDcgm) LoadMIGDevices() { } } -func (d *GPUDcgm) loadMIGProfiles() { - if len(d.devices) == 0 { +func (d *gpuDcgm) loadMIGProfiles() { + if len(d.devs) == 0 { klog.Errorln("DCGM has no GPU to monitor") return } profileInfos = map[int]nvml.GpuInstanceProfileInfo{} - for _, dev := range d.devices { + for _, dev := range d.devs { for i := 0; i < maxMIGProfiles; i++ { profileInfo, ret := dev.DeviceHandler.(nvml.Device).GetGpuInstanceProfileInfo(i) if ret != nvml.SUCCESS { @@ -264,36 +264,40 @@ func (d *GPUDcgm) loadMIGProfiles() { } } -func (d *GPUDcgm) Name() string { +func (d *gpuDcgm) Name() string { return dcgmType.String() } -func (d *GPUDcgm) DevType() device.DeviceType { +func (d *gpuDcgm) DevType() DeviceType { return dcgmType } -func (d *GPUDcgm) HwType() string { +func (d *gpuDcgm) HwType() string { return dcgmHwType } -func (d *GPUDcgm) IsDeviceCollectionSupported() bool { +func (d *gpuDcgm) IsDeviceCollectionSupported() bool { return d.collectionSupported } -func (d *GPUDcgm) SetDeviceCollectionSupported(supported bool) { +func (d *gpuDcgm) SetDeviceCollectionSupported(supported bool) { d.collectionSupported = supported } -func (d *GPUDcgm) Shutdown() bool { +func (d *gpuDcgm) Shutdown() bool { if d.nvmlInited { nvml.Shutdown() } dcgm.FieldsTerm() if d.deviceGroupName != "" { - dcgm.DestroyGroup(d.deviceGroupHandle) + if err := dcgm.DestroyGroup(d.deviceGroupHandle); err != nil { + klog.Errorf("failed to destroy group %v", err) + } } if d.fieldGroupName != "" { - dcgm.FieldGroupDestroy(d.fieldGroupHandle) + if err := dcgm.FieldGroupDestroy(d.fieldGroupHandle); err != nil { + klog.Errorf("failed to destroy field group %v", err) + } } if d.cleanup != nil { d.cleanup() @@ -304,9 +308,9 @@ func (d *GPUDcgm) Shutdown() bool { return true } -func (d *GPUDcgm) AbsEnergyFromDevice() []uint32 { +func (d *gpuDcgm) AbsEnergyFromDevice() []uint32 { gpuEnergy := []uint32{} - for _, dev := range d.devices { + for _, dev := range d.devs { power, ret := dev.DeviceHandler.(nvml.Device).GetPowerUsage() if ret != nvml.SUCCESS { klog.Errorf("failed to get power usage on device %v: %v\n", dev, nvml.ErrorString(ret)) @@ -320,20 +324,20 @@ func (d *GPUDcgm) AbsEnergyFromDevice() []uint32 { return gpuEnergy } -func (d *GPUDcgm) DevicesByID() map[int]any { - devices := make(map[int]any) - for id, dev := range d.devices { - devices[id] = dev +func (d *gpuDcgm) DevicesByID() map[int]any { + devs := make(map[int]any) + for id, dev := range d.devs { + devs[id] = dev } - return devices + return devs } -func (d *GPUDcgm) DevicesByName() map[string]any { - devices := make(map[string]any) - return devices +func (d *gpuDcgm) DevicesByName() map[string]any { + devs := make(map[string]any) + return devs } -func (d *GPUDcgm) DeviceInstances() map[int]map[int]any { +func (d *gpuDcgm) DeviceInstances() map[int]map[int]any { // LoadMIGDevices d.LoadMIGDevices() @@ -348,19 +352,19 @@ func (d *GPUDcgm) DeviceInstances() map[int]map[int]any { return devInstances } -func (d *GPUDcgm) DeviceUtilizationStats(dev any) (map[any]any, error) { +func (d *gpuDcgm) DeviceUtilizationStats(dev any) (map[any]any, error) { ds := make(map[any]any) // Process Accelerator Metrics return ds, nil } // ProcessResourceUtilizationPerDevice returns the GPU utilization per process. The gpuID can be a MIG instance or the main GPU -func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { - processAcceleratorMetrics := map[uint32]device.GPUProcessUtilizationSample{} +func (d *gpuDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { + processAcceleratorMetrics := map[uint32]GPUProcessUtilizationSample{} pam := make(map[uint32]any) // Check if the device is of type dev.GPUDevice and extract the DeviceHandler switch d := dev.(type) { - case device.GPUDevice: + case GPUDevice: if d.DeviceHandler == nil { return pam, nil } @@ -387,7 +391,7 @@ func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Durati } } } - processAcceleratorMetrics[p.Pid] = device.GPUProcessUtilizationSample{ + processAcceleratorMetrics[p.Pid] = GPUProcessUtilizationSample{ Pid: p.Pid, TimeStamp: uint64(time.Now().UnixNano()), // TODO: It does not make sense to use the whole GPU utilization since a GPU might have more than one PID @@ -406,10 +410,10 @@ func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Durati if val.FieldId == ratioFields { migUtilization := ToFloat64(val, 100) // ratio of active multiprocessors to total multiprocessors - // the MIG metrics represent the utilization of the MIG device. We need to normalize the metric to represent the overall GPU utilization + // the MIG metrics represent the utilization of the MIG We need to normalize the metric to represent the overall GPU utilization // FIXME: the MIG device could have multiple processes, such as using MPS, how to split the MIG utilization between the processes? gpuUtilization := migUtilization * d.MIGSMRatio - processAcceleratorMetrics[p.Pid] = device.GPUProcessUtilizationSample{ + processAcceleratorMetrics[p.Pid] = GPUProcessUtilizationSample{ Pid: p.Pid, TimeStamp: uint64(time.Now().UnixNano()), ComputeUtil: uint32(gpuUtilization), @@ -425,13 +429,13 @@ func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Durati return pam, nil default: - klog.Error("expected device.GPUDevice but got come other type") + klog.Error("expected GPUDevice but got come other type") return pam, errors.New("invalid device type") } } // helper functions -func (d *GPUDcgm) initNVML() error { +func (d *gpuDcgm) initNVML() error { if ret := nvml.Init(); ret != nvml.SUCCESS { d.collectionSupported = false d.Shutdown() @@ -440,7 +444,7 @@ func (d *GPUDcgm) initNVML() error { return nil } -func (d *GPUDcgm) createDeviceGroup() error { +func (d *gpuDcgm) createDeviceGroup() error { deviceGroupName := "dev-grp-" + time.Now().Format("2006-01-02-15-04-05") deviceGroup, err := dcgm.CreateGroup(deviceGroupName) if err != nil { @@ -452,8 +456,8 @@ func (d *GPUDcgm) createDeviceGroup() error { return nil } -func (d *GPUDcgm) addDevicesToGroup() error { - for gpuID := range d.devices { +func (d *gpuDcgm) addDevicesToGroup() error { + for gpuID := range d.devs { err := dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU, uint(gpuID)) if err != nil { klog.Errorf("failed to add device %d to group %q: %v", gpuID, d.deviceGroupName, err) @@ -468,7 +472,7 @@ func (d *GPUDcgm) addDevicesToGroup() error { return nil } -func (d *GPUDcgm) createFieldGroup() error { +func (d *gpuDcgm) createFieldGroup() error { fieldGroupName := "fld-grp-" + time.Now().Format("2006-01-02-15-04-05") fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields) if err != nil { @@ -479,7 +483,7 @@ func (d *GPUDcgm) createFieldGroup() error { return nil } -func (d *GPUDcgm) setupWatcher() error { +func (d *gpuDcgm) setupWatcher() error { // watch interval has an impact on cpu usage, set it carefully err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(1000)*1000, 0.0, 1) if err != nil { @@ -493,7 +497,6 @@ func (d *GPUDcgm) setupWatcher() error { func ToFloat64(value *dcgm.FieldValue_v1, multiplyFactor float64) float64 { defaultValue := float64(0) switch v := value.FieldType; v { - // Floating-point case dcgm.DCGM_FT_DOUBLE: switch v := value.Float64(); v { @@ -537,13 +540,3 @@ func ToFloat64(value *dcgm.FieldValue_v1, multiplyFactor float64) float64 { return defaultValue } } - -func nvmlErrorString(errno nvml.Return) string { - switch errno { - case nvml.SUCCESS: - return "SUCCESS" - case nvml.ERROR_LIBRARY_NOT_FOUND: - return "ERROR_LIBRARY_NOT_FOUND" - } - return fmt.Sprintf("Error %d", errno) -} diff --git a/pkg/sensors/accelerator/device/device.go b/pkg/sensors/accelerator/devices/device.go similarity index 64% rename from pkg/sensors/accelerator/device/device.go rename to pkg/sensors/accelerator/devices/device.go index b46ef37f04..e3b6f84a35 100644 --- a/pkg/sensors/accelerator/device/device.go +++ b/pkg/sensors/accelerator/devices/device.go @@ -1,22 +1,39 @@ -package device +/* +Copyright 2024. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package devices import ( + "errors" "sync" "time" + "github.com/sustainable-computing-io/kepler/pkg/config" "golang.org/x/exp/maps" "k8s.io/klog/v2" ) const ( - DUMMY DeviceType = iota + MOCK DeviceType = iota HABANA DCGM NVML ) var ( - globalRegistry *Registry + deviceRegistry *Registry once sync.Once ) @@ -29,7 +46,7 @@ type ( ) func (d DeviceType) String() string { - return [...]string{"DUMMY", "HABANA", "DCGM", "NVML"}[d] + return [...]string{"MOCK", "HABANA", "DCGM", "NVML"}[d] } type Device interface { @@ -66,18 +83,33 @@ type Device interface { // Registry gets the default device Registry instance func GetRegistry() *Registry { once.Do(func() { - globalRegistry = &Registry{ - Registry: map[string]map[DeviceType]deviceStartupFunc{}, - } + deviceRegistry = newRegistry() + registerDevices(deviceRegistry) }) - return globalRegistry + return deviceRegistry +} + +// NewRegistry creates a new instance of Registry without registering devices +func newRegistry() *Registry { + return &Registry{ + Registry: map[string]map[DeviceType]deviceStartupFunc{}, + } } // SetRegistry replaces the global registry instance // NOTE: All plugins will need to be manually registered // after this function is called. func SetRegistry(registry *Registry) { - globalRegistry = registry + deviceRegistry = registry + registerDevices(deviceRegistry) +} + +// Register all available devices in the global registry +func registerDevices(r *Registry) { + // Call individual device check functions + dcgmCheck(r) + habanaCheck(r) + nvmlCheck(r) } func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStartupFunc) { @@ -103,41 +135,44 @@ func (r *Registry) Unregister(d DeviceType) { klog.Errorf("Device with type %s doesn't exist", d) } -// AddDeviceInterface adds a supported device interface, prints a fatal error in case of double registration. -func AddDeviceInterface(dtype DeviceType, accType string, deviceStartup deviceStartupFunc) { +// GetAllDeviceTypes returns a slice with all the registered devices. +func (r *Registry) GetAllDeviceTypes() []string { + devices := append([]string{}, maps.Keys(r.Registry)...) + return devices +} + +func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, deviceStartup deviceStartupFunc) error { switch accType { - case "GPU", "DUMMY": - // Handle GPU|Dummy device startup function registration - if existingDevice := GetRegistry().Registry[accType][dtype]; existingDevice != nil { + case config.GPU: + // Check if device is already registered + if existingDevice := registry.Registry[accType][dtype]; existingDevice != nil { klog.Errorf("Multiple Devices attempting to register with name %q", dtype.String()) - return + return errors.New("multiple Devices attempting to register with name") } if dtype == DCGM { // Remove "nvml" if "dcgm" is being registered - GetRegistry().Unregister(NVML) + registry.Unregister(NVML) } else if dtype == NVML { // Do not register "nvml" if "dcgm" is already registered - if _, ok := GetRegistry().Registry["GPU"][DCGM]; ok { - return + if _, ok := registry.Registry[config.GPU][DCGM]; ok { + return errors.New("DCGM already registered. Skipping NVML") } } + klog.V(5).Infof("Try to Register %s", dtype) - GetRegistry().MustRegister(accType, dtype, deviceStartup) + registry.MustRegister(accType, dtype, deviceStartup) default: - klog.Errorf("Unsupported device type %q", dtype) + klog.V(5).Infof("Try to Register %s", dtype) + registry.MustRegister(accType, dtype, deviceStartup) } klog.V(5).Infof("Registered %s", dtype) -} -// GetAllDeviceTypes returns a slice with all the registered devices. -func GetAllDeviceTypes() []string { - devices := append([]string{}, maps.Keys(GetRegistry().Registry)...) - return devices + return nil } -// Startup initializes and returns a new Device according to the given DeviceType [NVML|DCGM|DUMMY|HABANA]. +// Startup initializes and returns a new Device according to the given DeviceType [NVML|DCGM|HABANA]. func Startup(a string) Device { // Retrieve the global registry registry := GetRegistry() diff --git a/pkg/sensors/accelerator/devices/fakehabana.go b/pkg/sensors/accelerator/devices/fakehabana.go new file mode 100644 index 0000000000..95e16d79d5 --- /dev/null +++ b/pkg/sensors/accelerator/devices/fakehabana.go @@ -0,0 +1,25 @@ +//go:build !habana +// +build !habana + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package devices + +import "k8s.io/klog/v2" + +func habanaCheck(r *Registry) { + klog.V(5).Info("Error initializing habana: ERROR_LIBRARY_NOT_FOUND") +} diff --git a/pkg/sensors/accelerator/device/sources/habana.go b/pkg/sensors/accelerator/devices/habana.go similarity index 65% rename from pkg/sensors/accelerator/device/sources/habana.go rename to pkg/sensors/accelerator/devices/habana.go index 7cfefb3ce6..e03320a787 100644 --- a/pkg/sensors/accelerator/device/sources/habana.go +++ b/pkg/sensors/accelerator/devices/habana.go @@ -16,44 +16,49 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -package sources +package devices import ( + "errors" + "os" "time" hlml "github.com/HabanaAI/gohlml" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" "k8s.io/klog/v2" "github.com/sustainable-computing-io/kepler/pkg/config" ) const ( - habanaAccType = "GPU" + habanaAccType = config.GPU + libhlmlpath = "/usr/lib/habanalabs/libhlml.so" ) var ( - habanaAccImpl = GPUHabana{} - habanaType device.DeviceType + habanaAccImpl = gpuHabana{} + habanaType DeviceType ) -type GPUHabana struct { +type gpuHabana struct { collectionSupported bool devices map[int]interface{} } -func init() { +func habanaCheck(r *Registry) { if err := habanaAccImpl.InitLib(); err != nil { - klog.Infof("Error initializing %s: %v", habanaAccImpl.Name(), err) + klog.V(5).Infof("Error initializing %s: %v", habanaAccImpl.Name(), err) return } - habanaType = device.HABANA + habanaType = HABANA klog.V(5).Infof("Register %s with device startup register", habanaType) - device.AddDeviceInterface(habanaType, habanaAccType, habanaDeviceStartup) - klog.Infof("Using %s to obtain processor power", habanaAccImpl.Name()) + if err := addDeviceInterface(r, habanaType, habanaAccType, habanaDeviceStartup); err == nil { + klog.Infof("Using %s to obtain processor power", habanaAccImpl.Name()) + } else { + klog.V(5).Infof("Error registering habana: %v", err) + } } -func habanaDeviceStartup() device.Device { +func habanaDeviceStartup() Device { a := habanaAccImpl if err := a.Init(); err != nil { @@ -64,23 +69,26 @@ func habanaDeviceStartup() device.Device { return &a } -func (g *GPUHabana) Name() string { +func (g *gpuHabana) Name() string { return habanaType.String() } -func (g *GPUHabana) DevType() device.DeviceType { +func (g *gpuHabana) DevType() DeviceType { return habanaType } -func (g *GPUHabana) HwType() string { +func (g *gpuHabana) HwType() string { return habanaAccType } -func (g *GPUHabana) InitLib() error { +func (g *gpuHabana) InitLib() error { + if _, err := os.Stat(libhlmlpath); errors.Is(err, os.ErrNotExist) { + return err + } return nil } -func (g *GPUHabana) Init() error { +func (g *gpuHabana) Init() error { ret := hlml.Initialize() if ret != nil { klog.Error("ERROR initializing hlml") @@ -93,17 +101,17 @@ func (g *GPUHabana) Init() error { return ret } -func (g *GPUHabana) Shutdown() bool { +func (g *gpuHabana) Shutdown() bool { if ret := hlml.Shutdown(); ret != nil { return false } return true } -func (g *GPUHabana) AbsEnergyFromDevice() []uint32 { +func (g *gpuHabana) AbsEnergyFromDevice() []uint32 { gpuEnergy := []uint32{} for _, dev := range g.devices { - power, ret := dev.(device.GPUDevice).DeviceHandler.(hlml.Device).PowerUsage() + power, ret := dev.(GPUDevice).DeviceHandler.(hlml.Device).PowerUsage() if ret != nil { klog.Errorf("failed to get power usage on device %v: %v\n", dev, ret) continue @@ -111,14 +119,14 @@ func (g *GPUHabana) AbsEnergyFromDevice() []uint32 { energy := uint32(uint64(power) * config.SamplePeriodSec()) gpuEnergy = append(gpuEnergy, energy) - dname, _ := dev.(device.GPUDevice).DeviceHandler.(hlml.Device).Name() + dname, _ := dev.(GPUDevice).DeviceHandler.(hlml.Device).Name() klog.V(5).Infof("AbsEnergyFromDevice power usage on device %v: %v\n", dname, gpuEnergy) } return gpuEnergy } -func (g *GPUHabana) DevicesByID() map[int]interface{} { +func (g *gpuHabana) DevicesByID() map[int]interface{} { // Get the count of available devices count, ret := hlml.DeviceCount() if ret != nil { @@ -133,7 +141,7 @@ func (g *GPUHabana) DevicesByID() map[int]interface{} { for i := 0; i < int(count); i++ { // Get the device handle for the current index if h, ret := hlml.DeviceHandleByIndex(uint(i)); ret == nil { - devices[i] = device.GPUDevice{ + devices[i] = GPUDevice{ DeviceHandler: h, } } @@ -141,30 +149,30 @@ func (g *GPUHabana) DevicesByID() map[int]interface{} { return devices } -func (g *GPUHabana) DevicesByName() map[string]any { +func (g *gpuHabana) DevicesByName() map[string]any { devices := make(map[string]interface{}) return devices } -func (g *GPUHabana) DeviceInstances() map[int]map[int]interface{} { +func (g *gpuHabana) DeviceInstances() map[int]map[int]interface{} { var devices map[int]map[int]interface{} return devices } -func (g *GPUHabana) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { +func (g *gpuHabana) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { ds := make(map[any]interface{}) // Process Accelerator Metrics return ds, nil } -func (g *GPUHabana) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]interface{}, error) { +func (g *gpuHabana) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]interface{}, error) { pam := make(map[uint32]interface{}) // Process Accelerator Metrics return pam, nil } -func (g *GPUHabana) IsDeviceCollectionSupported() bool { +func (g *gpuHabana) IsDeviceCollectionSupported() bool { return g.collectionSupported } -func (g *GPUHabana) SetDeviceCollectionSupported(supported bool) { +func (g *gpuHabana) SetDeviceCollectionSupported(supported bool) { g.collectionSupported = supported } diff --git a/pkg/sensors/accelerator/devices/mock_device.go b/pkg/sensors/accelerator/devices/mock_device.go new file mode 100644 index 0000000000..78c6204995 --- /dev/null +++ b/pkg/sensors/accelerator/devices/mock_device.go @@ -0,0 +1,120 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package devices + +import ( + "time" + + "k8s.io/klog/v2" +) + +var ( + mockDevice = MOCK +) + +type MockDevice struct { + mockDevice DeviceType + name string + collectionSupported bool +} + +func RegisterMockDevice() { + r := GetRegistry() + if err := addDeviceInterface(r, mockDevice, mockDevice.String(), MockDeviceDeviceStartup); err != nil { + klog.Errorf("couldn't register mock device %v", err) + } +} + +func MockDeviceDeviceStartup() Device { + d := MockDevice{ + mockDevice: mockDevice, + name: mockDevice.String(), + collectionSupported: true, + } + + return &d +} + +func (d *MockDevice) Name() string { + return d.name +} + +func (d *MockDevice) DevType() DeviceType { + return d.mockDevice +} + +func (d *MockDevice) HwType() string { + return d.mockDevice.String() +} + +func (d *MockDevice) InitLib() error { + return nil +} + +func (d *MockDevice) Init() error { + return nil +} + +func (d *MockDevice) Shutdown() bool { + GetRegistry().Unregister(d.DevType()) + return true +} + +func (d *MockDevice) AbsEnergyFromDevice() []uint32 { + return nil +} + +func (d *MockDevice) DevicesByID() map[int]any { + return nil +} + +func (d *MockDevice) DevicesByName() map[string]any { + return nil +} + +func (d *MockDevice) DeviceInstances() map[int]map[int]any { + return nil +} + +func (d *MockDevice) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { + ds := make(map[any]interface{}) // Process Accelerator Metrics + return ds, nil +} + +func (d *MockDevice) ProcessResourceUtilizationPerDevice(dev any, _ time.Duration) (map[uint32]any, error) { + processAcceleratorMetrics := map[uint32]GPUProcessUtilizationSample{} + pam := make(map[uint32]interface{}) + processAcceleratorMetrics[0] = GPUProcessUtilizationSample{ + Pid: 0, + TimeStamp: uint64(time.Now().UnixNano()), + ComputeUtil: 10, + MemUtil: 10, + EncUtil: 10, + DecUtil: 10, + } + for k, v := range processAcceleratorMetrics { + pam[k] = v + } + return pam, nil +} + +func (d *MockDevice) IsDeviceCollectionSupported() bool { + return d.collectionSupported +} + +func (d *MockDevice) SetDeviceCollectionSupported(supported bool) { + d.collectionSupported = supported +} diff --git a/pkg/sensors/accelerator/device/sources/nvml.go b/pkg/sensors/accelerator/devices/nvml.go similarity index 76% rename from pkg/sensors/accelerator/device/sources/nvml.go rename to pkg/sensors/accelerator/devices/nvml.go index 04b9931c75..f9bf80d3ae 100644 --- a/pkg/sensors/accelerator/device/sources/nvml.go +++ b/pkg/sensors/accelerator/devices/nvml.go @@ -1,8 +1,5 @@ -//go:build nvml -// +build nvml - /* -Copyright 2021. +Copyright 2021-2024 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -package sources +package devices import ( "errors" @@ -27,38 +24,39 @@ import ( "k8s.io/klog/v2" "github.com/sustainable-computing-io/kepler/pkg/config" - "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device" ) const ( - nvmlHwType = "GPU" + nvmlHwType = config.GPU ) var ( - nvmlAccImpl = GPUNvml{} - nvmlType device.DeviceType + nvmlAccImpl = gpuNvml{} + nvmlType DeviceType ) -type GPUNvml struct { +type gpuNvml struct { libInited bool collectionSupported bool - devices map[int]device.GPUDevice // List of GPU identifiers for the device - processUtilizationSupported bool // bool to check if the process utilization collection is supported + devices map[int]GPUDevice // List of GPU identifiers for the device + processUtilizationSupported bool // bool to check if the process utilization collection is supported } -func init() { +func nvmlCheck(r *Registry) { if err := nvml.Init(); err != nvml.SUCCESS { - klog.Errorf("Error initializing nvml: %v", err) + klog.V(5).Infof("Error initializing nvml: %v", nvmlErrorString(err)) return } klog.Info("Initializing nvml Successful") - nvmlType = device.NVML - device.AddDeviceInterface(nvmlType, nvmlHwType, nvmlDeviceStartup) - klog.Infof("Using %s to obtain processor power", nvmlAccImpl.Name()) - + nvmlType = NVML + if err := addDeviceInterface(r, nvmlType, nvmlHwType, nvmlDeviceStartup); err == nil { + klog.Infof("Using %s to obtain processor power", nvmlAccImpl.Name()) + } else { + klog.V(5).Infof("Error registering nvml: %v", err) + } } -func nvmlDeviceStartup() device.Device { +func nvmlDeviceStartup() Device { a := nvmlAccImpl if err := a.InitLib(); err != nil { klog.Errorf("Error initializing %s: %v", nvmlType.String(), err) @@ -72,30 +70,30 @@ func nvmlDeviceStartup() device.Device { return &a } -func (n *GPUNvml) Name() string { +func (n *gpuNvml) Name() string { return nvmlType.String() } -func (n *GPUNvml) DevType() device.DeviceType { +func (n *gpuNvml) DevType() DeviceType { return nvmlType } -func (n *GPUNvml) HwType() string { +func (n *gpuNvml) HwType() string { return nvmlHwType } -func (n *GPUNvml) IsDeviceCollectionSupported() bool { +func (n *gpuNvml) IsDeviceCollectionSupported() bool { return n.collectionSupported } -func (n *GPUNvml) SetDeviceCollectionSupported(supported bool) { +func (n *gpuNvml) SetDeviceCollectionSupported(supported bool) { n.collectionSupported = supported } // Init initizalize and start the GPU metric collector // the nvml only works if the container has support to GPU, e.g., it is using nvidia-docker2 // otherwise it will fail to load the libnvidia-ml.so.1 -func (n *GPUNvml) InitLib() (err error) { +func (n *gpuNvml) InitLib() (err error) { defer func() { if r := recover(); r != nil { err = fmt.Errorf("could not init nvml: %v", r) @@ -110,7 +108,7 @@ func (n *GPUNvml) InitLib() (err error) { return nil } -func (n *GPUNvml) Init() (err error) { +func (n *gpuNvml) Init() (err error) { if !n.libInited { if err := n.InitLib(); err != nil { return err @@ -125,7 +123,7 @@ func (n *GPUNvml) Init() (err error) { return err } klog.Infof("found %d gpu devices\n", count) - n.devices = make(map[int]device.GPUDevice, count) + n.devices = make(map[int]GPUDevice, count) for gpuID := 0; gpuID < count; gpuID++ { nvmlDeviceHandler, ret := nvml.DeviceGetHandleByIndex(gpuID) if ret != nvml.SUCCESS { @@ -137,7 +135,7 @@ func (n *GPUNvml) Init() (err error) { name, _ := nvmlDeviceHandler.GetName() uuid, _ := nvmlDeviceHandler.GetUUID() klog.Infof("GPU %v %q %q", gpuID, name, uuid) - dev := device.GPUDevice{ + dev := GPUDevice{ DeviceHandler: nvmlDeviceHandler, ID: gpuID, IsSubdevice: false, @@ -149,13 +147,13 @@ func (n *GPUNvml) Init() (err error) { } // Shutdown stops the GPU metric collector -func (n *GPUNvml) Shutdown() bool { +func (n *gpuNvml) Shutdown() bool { n.libInited = false return nvml.Shutdown() == nvml.SUCCESS } // GetGpus returns a map with gpu device -func (n *GPUNvml) DevicesByID() map[int]interface{} { +func (n *gpuNvml) DevicesByID() map[int]interface{} { devices := make(map[int]interface{}) for id, dev := range n.devices { devices[id] = dev @@ -163,18 +161,18 @@ func (n *GPUNvml) DevicesByID() map[int]interface{} { return devices } -func (n *GPUNvml) DevicesByName() map[string]any { +func (n *gpuNvml) DevicesByName() map[string]any { devices := make(map[string]interface{}) return devices } -func (n *GPUNvml) DeviceInstances() map[int]map[int]interface{} { +func (n *gpuNvml) DeviceInstances() map[int]map[int]interface{} { var devices map[int]map[int]interface{} return devices } // GetAbsEnergyFromGPU returns a map with mJ in each gpu device -func (n *GPUNvml) AbsEnergyFromDevice() []uint32 { +func (n *gpuNvml) AbsEnergyFromDevice() []uint32 { gpuEnergy := []uint32{} for _, dev := range n.devices { power, ret := dev.DeviceHandler.(nvml.Device).GetPowerUsage() @@ -190,7 +188,7 @@ func (n *GPUNvml) AbsEnergyFromDevice() []uint32 { return gpuEnergy } -func (n *GPUNvml) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { +func (n *gpuNvml) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { ds := make(map[any]interface{}) // Process Accelerator Metrics return ds, nil } @@ -198,18 +196,18 @@ func (n *GPUNvml) DeviceUtilizationStats(dev any) (map[any]interface{}, error) { // GetProcessResourceUtilization returns a map of GPUProcessUtilizationSample where the key is the process pid // GPUProcessUtilizationSample.SmUtil represents the process Streaming Multiprocessors - SM (3D/Compute) utilization in percentage. // GPUProcessUtilizationSample.MemUtil represents the process Frame Buffer Memory utilization Value. -func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { - processAcceleratorMetrics := map[uint32]device.GPUProcessUtilizationSample{} +func (n *gpuNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) { + processAcceleratorMetrics := map[uint32]GPUProcessUtilizationSample{} pam := make(map[uint32]interface{}) lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000) switch d := dev.(type) { - case device.GPUDevice: + case GPUDevice: if d.DeviceHandler == nil { return pam, nil } if n.processUtilizationSupported { - GPUProcessUtilizationSample, ret := d.DeviceHandler.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp) + gpuProcessUtilizationSample, ret := d.DeviceHandler.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp) if ret != nvml.SUCCESS { if ret == nvml.ERROR_NOT_FOUND { // Ignore the error if there is no process running in the GPU @@ -217,10 +215,10 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati } n.processUtilizationSupported = false } else { - for _, pinfo := range GPUProcessUtilizationSample { + for _, pinfo := range gpuProcessUtilizationSample { // pid 0 means no data if pinfo.Pid != 0 { - processAcceleratorMetrics[pinfo.Pid] = device.GPUProcessUtilizationSample{ + processAcceleratorMetrics[pinfo.Pid] = GPUProcessUtilizationSample{ Pid: pinfo.Pid, TimeStamp: pinfo.TimeStamp, ComputeUtil: pinfo.SmUtil, @@ -234,7 +232,7 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati } if !n.processUtilizationSupported { // If processUtilizationSupported is false, try deviceGetMPSComputeRunningProcesses_v3 to use memory usage to ratio power usage - config.GPUUsageMetric = config.GPUMemUtilization + config.SetGPUUsageMetric(config.GPUMemUtilization) processInfo, ret := d.DeviceHandler.(nvml.Device).GetComputeRunningProcesses() if ret != nvml.SUCCESS { if ret == nvml.ERROR_NOT_FOUND { @@ -251,7 +249,7 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati for _, pinfo := range processInfo { // pid 0 means no data if pinfo.Pid != 0 { - processAcceleratorMetrics[pinfo.Pid] = device.GPUProcessUtilizationSample{ + processAcceleratorMetrics[pinfo.Pid] = GPUProcessUtilizationSample{ Pid: pinfo.Pid, MemUtil: uint32(pinfo.UsedGpuMemory * 100 / memoryInfo.Total), } @@ -266,7 +264,7 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati return pam, nil default: - klog.Error("expected device.GPUDevice but got come other type") + klog.Error("expected GPUDevice but got come other type") return pam, errors.New("invalid device type") } } diff --git a/pkg/sensors/accelerator/device/type.go b/pkg/sensors/accelerator/devices/type.go similarity index 98% rename from pkg/sensors/accelerator/device/type.go rename to pkg/sensors/accelerator/devices/type.go index f14dcb455c..c0bfc92104 100644 --- a/pkg/sensors/accelerator/device/type.go +++ b/pkg/sensors/accelerator/devices/type.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package device +package devices type GPUProcessUtilizationSample struct { Pid uint32