diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go index d70344ad6e..61636c9afa 100644 --- a/cmd/exporter/exporter.go +++ b/cmd/exporter/exporter.go @@ -31,7 +31,6 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/bpf" "github.com/sustainable-computing-io/kepler/pkg/build" - "github.com/sustainable-computing-io/kepler/pkg/collector/stats" "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/manager" "github.com/sustainable-computing-io/kepler/pkg/metrics" @@ -147,8 +146,6 @@ func main() { platform.InitPowerImpl() defer platform.StopPower() - stats.InitAvailableParamAndMetrics() - if config.EnabledGPU() { r := accelerator.GetRegistry() if a, err := accelerator.New(accelerator.GPU, true); err == nil { diff --git a/pkg/collector/metric_collector.go b/pkg/collector/metric_collector.go index dad5c5a8ca..895b62337c 100644 --- a/pkg/collector/metric_collector.go +++ b/pkg/collector/metric_collector.go @@ -79,8 +79,6 @@ func (c *Collector) Initialize() error { // model component decide whether/how to init model.CreatePowerEstimatorModels( stats.GetProcessFeatureNames(c.bpfSupportedMetrics), - stats.NodeMetadataFeatureNames(), - stats.NodeMetadataFeatureValues(), c.bpfSupportedMetrics, ) diff --git a/pkg/collector/metric_collector_test.go b/pkg/collector/metric_collector_test.go index 9eb3c9a8f7..30f101ebf4 100644 --- a/pkg/collector/metric_collector_test.go +++ b/pkg/collector/metric_collector_test.go @@ -45,7 +45,7 @@ var _ = Describe("Test Collector Unit", func() { metricCollector := newMockCollector(bpfExporter) // The default estimator model is the ratio bpfSupportedMetrics := bpfExporter.SupportedMetrics() - model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames(), stats.NodeMetadataFeatureValues(), bpfSupportedMetrics) + model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), bpfSupportedMetrics) // update container and node metrics metricCollector.UpdateProcessEnergyUtilizationMetrics() metricCollector.AggregateProcessEnergyUtilizationMetrics() diff --git a/pkg/collector/stats/benchmark_test.go b/pkg/collector/stats/benchmark_test.go index c5083de4ea..df0482e830 100644 --- a/pkg/collector/stats/benchmark_test.go +++ b/pkg/collector/stats/benchmark_test.go @@ -39,7 +39,7 @@ func benchmarkNtesting(b *testing.B, processNumber int) { // The default estimator model is the ratio bpfSupportedMetrics := bpf.DefaultSupportedMetrics() - model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames(), stats.NodeMetadataFeatureValues(), bpfSupportedMetrics) + model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), bpfSupportedMetrics) // update container and node metrics b.ReportAllocs() diff --git a/pkg/collector/stats/node_stats.go b/pkg/collector/stats/node_stats.go index 201202bda8..5b125bd8ee 100644 --- a/pkg/collector/stats/node_stats.go +++ b/pkg/collector/stats/node_stats.go @@ -1,5 +1,5 @@ /* -Copyright 2021. +Copyright 2021-2024. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/bpf" "github.com/sustainable-computing-io/kepler/pkg/config" + "github.com/sustainable-computing-io/kepler/pkg/node" acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator" "github.com/sustainable-computing-io/kepler/pkg/utils" ) @@ -30,32 +31,16 @@ type NodeStats struct { // IdleResUtilization is used to determine idle pmap[string]eriods IdleResUtilization map[string]uint64 -} - -// NodeCPUArchitecture returns the CPU architecture -func NodeCPUArchitecture() string { - return getCPUArch() -} - -// NodeCPUPackageMap returns the CPU package map -func NodeCPUPackageMap() map[int32]string { - return getCPUPackageMap() -} - -// NodeMetadataFeatureNames returns the feature names for metadata -func NodeMetadataFeatureNames() []string { - return []string{"cpu_architecture"} -} -// NodeMetadataFeatureValues returns the feature values for metadata -func NodeMetadataFeatureValues() []string { - return []string{NodeCPUArchitecture()} + // nodeInfo allows access to node information + nodeInfo node.Node } func NewNodeStats(bpfSupportedMetrics bpf.SupportedMetrics) *NodeStats { return &NodeStats{ Stats: *NewStats(bpfSupportedMetrics), IdleResUtilization: map[string]uint64{}, + nodeInfo: node.NewNodeInfo(), } } @@ -151,3 +136,19 @@ func (ne *NodeStats) String() string { "%v\n", ne.Stats.String(), ) } + +func (ne *NodeStats) MetadataFeatureNames() []string { + return ne.nodeInfo.MetadataFeatureNames() +} + +func (ne *NodeStats) MetadataFeatureValues() []string { + return ne.nodeInfo.MetadataFeatureValues() +} + +func (ne *NodeStats) CPUArchitecture() string { + return ne.nodeInfo.CPUArchitecture() +} + +func (ne *NodeStats) NodeName() string { + return ne.nodeInfo.Name() +} diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go index 232e0cd30e..733fab92d8 100644 --- a/pkg/collector/stats/stats.go +++ b/pkg/collector/stats/stats.go @@ -27,132 +27,152 @@ import ( "k8s.io/klog/v2" ) -var ( - // AvailableAbsEnergyMetrics holds a list of absolute energy metrics - AvailableAbsEnergyMetrics []string - // AvailableDynEnergyMetrics holds a list of dynamic energy metrics - AvailableDynEnergyMetrics []string - // AvailableIdleEnergyMetrics holds a list of idle energy metrics - AvailableIdleEnergyMetrics []string -) +var defaultBPFMetrics = []string{ + config.CPUCycle, config.CPURefCycle, config.CPUInstruction, config.CacheMiss, config.CPUTime, + config.PageCacheHit, config.IRQNetTXLabel, config.IRQNetRXLabel, config.IRQBlockLabel, +} +// metricSets stores different sets of metrics for energy and resource usage. +type metricSets struct { + absEnergyMetrics []string + dynEnergyMetrics []string + idleEnergyMetrics []string + bpfMetrics []string +} + +// Stats stores resource and energy usage statistics. type Stats struct { - ResourceUsage map[string]types.UInt64StatCollection - EnergyUsage map[string]types.UInt64StatCollection + ResourceUsage map[string]types.UInt64StatCollection + EnergyUsage map[string]types.UInt64StatCollection + availableMetrics *metricSets +} + +// newMetricSets initializes and returns a new metricSets instance. +func newMetricSets() *metricSets { + return &metricSets{ + absEnergyMetrics: []string{ + config.AbsEnergyInCore, config.AbsEnergyInDRAM, config.AbsEnergyInUnCore, config.AbsEnergyInPkg, + config.AbsEnergyInGPU, config.AbsEnergyInOther, config.AbsEnergyInPlatform, + }, + dynEnergyMetrics: []string{ + config.DynEnergyInCore, config.DynEnergyInDRAM, config.DynEnergyInUnCore, config.DynEnergyInPkg, + config.DynEnergyInGPU, config.DynEnergyInOther, config.DynEnergyInPlatform, + }, + idleEnergyMetrics: []string{ + config.IdleEnergyInCore, config.IdleEnergyInDRAM, config.IdleEnergyInUnCore, config.IdleEnergyInPkg, + config.IdleEnergyInGPU, config.IdleEnergyInOther, config.IdleEnergyInPlatform, + }, + bpfMetrics: defaultBPFMetrics, + } } // NewStats creates a new Stats instance func NewStats(bpfSupportedMetrics bpf.SupportedMetrics) *Stats { - m := &Stats{ - ResourceUsage: make(map[string]types.UInt64StatCollection), - EnergyUsage: make(map[string]types.UInt64StatCollection), + stats := &Stats{ + ResourceUsage: make(map[string]types.UInt64StatCollection), + EnergyUsage: make(map[string]types.UInt64StatCollection), + availableMetrics: newMetricSets(), } - // initialize the energy metrics in the map - energyMetrics := []string{} - energyMetrics = append(energyMetrics, AvailableDynEnergyMetrics...) - energyMetrics = append(energyMetrics, AvailableAbsEnergyMetrics...) - energyMetrics = append(energyMetrics, AvailableIdleEnergyMetrics...) + // Initialize the energy metrics in the map. + energyMetrics := append([]string{}, stats.AbsEnergyMetrics()...) + energyMetrics = append(energyMetrics, stats.DynEnergyMetrics()...) + energyMetrics = append(energyMetrics, stats.IdleEnergyMetrics()...) for _, metricName := range energyMetrics { - m.EnergyUsage[metricName] = types.NewUInt64StatCollection() + stats.EnergyUsage[metricName] = types.NewUInt64StatCollection() } - // initialize the resource utilization metrics in the map - resMetrics := []string{} - for metricName := range bpfSupportedMetrics.HardwareCounters { - resMetrics = append(resMetrics, metricName) - } - for metricName := range bpfSupportedMetrics.SoftwareCounters { - resMetrics = append(resMetrics, metricName) - } - for _, metricName := range resMetrics { - m.ResourceUsage[metricName] = types.NewUInt64StatCollection() + // Initialize the resource utilization metrics in the map. + for _, metricName := range stats.BPFMetrics() { + stats.ResourceUsage[metricName] = types.NewUInt64StatCollection() } if config.EnabledGPU() { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { - m.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection() - m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() - m.ResourceUsage[config.IdleEnergyInGPU] = types.NewUInt64StatCollection() + stats.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection() + stats.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() + stats.ResourceUsage[config.IdleEnergyInGPU] = types.NewUInt64StatCollection() } } - return m + return stats } -// ResetDeltaValues reset all current value to 0 -func (m *Stats) ResetDeltaValues() { - for _, stat := range m.ResourceUsage { +// ResetDeltaValues resets all current values to 0. +func (s *Stats) ResetDeltaValues() { + for _, stat := range s.ResourceUsage { stat.ResetDeltaValues() } - for metric, stat := range m.EnergyUsage { + for metric, stat := range s.EnergyUsage { if strings.Contains(metric, "idle") { - continue // do not reset the idle power metrics + continue // Do not reset the idle power metrics. } stat.ResetDeltaValues() } } -func (m *Stats) String() string { +func (s *Stats) String() string { return fmt.Sprintf( "\tDyn ePkg (mJ): %s (eCore: %s eDram: %s eUncore: %s) eGPU (mJ): %s eOther (mJ): %s platform (mJ): %s \n"+ "\tIdle ePkg (mJ): %s (eCore: %s eDram: %s eUncore: %s) eGPU (mJ): %s eOther (mJ): %s platform (mJ): %s \n"+ "\tResUsage: %v\n", - m.EnergyUsage[config.DynEnergyInPkg], - m.EnergyUsage[config.DynEnergyInCore], - m.EnergyUsage[config.DynEnergyInDRAM], - m.EnergyUsage[config.DynEnergyInUnCore], - m.EnergyUsage[config.DynEnergyInGPU], - m.EnergyUsage[config.DynEnergyInOther], - m.EnergyUsage[config.DynEnergyInPlatform], - m.EnergyUsage[config.IdleEnergyInPkg], - m.EnergyUsage[config.IdleEnergyInCore], - m.EnergyUsage[config.IdleEnergyInDRAM], - m.EnergyUsage[config.IdleEnergyInUnCore], - m.EnergyUsage[config.IdleEnergyInGPU], - m.EnergyUsage[config.IdleEnergyInOther], - m.EnergyUsage[config.IdleEnergyInPlatform], - m.ResourceUsage) -} - -// UpdateDynEnergy calculates the dynamic energy -func (m *Stats) UpdateDynEnergy() { - for pkgID := range m.EnergyUsage[config.AbsEnergyInPkg] { - m.CalcDynEnergy(config.AbsEnergyInPkg, config.IdleEnergyInPkg, config.DynEnergyInPkg, pkgID) - m.CalcDynEnergy(config.AbsEnergyInCore, config.IdleEnergyInCore, config.DynEnergyInCore, pkgID) - m.CalcDynEnergy(config.AbsEnergyInUnCore, config.IdleEnergyInUnCore, config.DynEnergyInUnCore, pkgID) - m.CalcDynEnergy(config.AbsEnergyInDRAM, config.IdleEnergyInDRAM, config.DynEnergyInDRAM, pkgID) + s.EnergyUsage[config.DynEnergyInPkg], + s.EnergyUsage[config.DynEnergyInCore], + s.EnergyUsage[config.DynEnergyInDRAM], + s.EnergyUsage[config.DynEnergyInUnCore], + s.EnergyUsage[config.DynEnergyInGPU], + s.EnergyUsage[config.DynEnergyInOther], + s.EnergyUsage[config.DynEnergyInPlatform], + s.EnergyUsage[config.IdleEnergyInPkg], + s.EnergyUsage[config.IdleEnergyInCore], + s.EnergyUsage[config.IdleEnergyInDRAM], + s.EnergyUsage[config.IdleEnergyInUnCore], + s.EnergyUsage[config.IdleEnergyInGPU], + s.EnergyUsage[config.IdleEnergyInOther], + s.EnergyUsage[config.IdleEnergyInPlatform], + s.ResourceUsage, + ) +} + +// UpdateDynEnergy calculates the dynamic energy. +func (s *Stats) UpdateDynEnergy() { + for pkgID := range s.EnergyUsage[config.AbsEnergyInPkg] { + s.CalcDynEnergy(config.AbsEnergyInPkg, config.IdleEnergyInPkg, config.DynEnergyInPkg, pkgID) + s.CalcDynEnergy(config.AbsEnergyInCore, config.IdleEnergyInCore, config.DynEnergyInCore, pkgID) + s.CalcDynEnergy(config.AbsEnergyInUnCore, config.IdleEnergyInUnCore, config.DynEnergyInUnCore, pkgID) + s.CalcDynEnergy(config.AbsEnergyInDRAM, config.IdleEnergyInDRAM, config.DynEnergyInDRAM, pkgID) } - for sensorID := range m.EnergyUsage[config.AbsEnergyInPlatform] { - m.CalcDynEnergy(config.AbsEnergyInPlatform, config.IdleEnergyInPlatform, config.DynEnergyInPlatform, sensorID) + for sensorID := range s.EnergyUsage[config.AbsEnergyInPlatform] { + s.CalcDynEnergy(config.AbsEnergyInPlatform, config.IdleEnergyInPlatform, config.DynEnergyInPlatform, sensorID) } - // gpu metric + // GPU metric if config.EnabledGPU() { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { - for gpuID := range m.EnergyUsage[config.AbsEnergyInGPU] { - m.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID) + for gpuID := range s.EnergyUsage[config.AbsEnergyInGPU] { + s.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID) } } } } -// CalcDynEnergy calculate the difference between the absolute and idle energy/power -func (m *Stats) CalcDynEnergy(absM, idleM, dynM, id string) { - if _, exist := m.EnergyUsage[absM][id]; !exist { +// CalcDynEnergy calculates the difference between the absolute and idle energy/power. +func (s *Stats) CalcDynEnergy(absM, idleM, dynM, id string) { + if _, exist := s.EnergyUsage[absM][id]; !exist { return } - totalPower := m.EnergyUsage[absM][id].GetDelta() - klog.V(6).Infof("Absolute Energy stat: %v (%s)", m.EnergyUsage[absM], id) + totalPower := s.EnergyUsage[absM][id].GetDelta() + klog.V(6).Infof("Absolute Energy stat: %v (%s)", s.EnergyUsage[absM], id) idlePower := uint64(0) - if idleStat, found := m.EnergyUsage[idleM][id]; found { + if idleStat, found := s.EnergyUsage[idleM][id]; found { idlePower = idleStat.GetDelta() - klog.V(6).Infof("Idle Energy stat: %v (%s)", m.EnergyUsage[idleM], id) + klog.V(6).Infof("Idle Energy stat: %v (%s)", s.EnergyUsage[idleM], id) } dynPower := calcDynEnergy(totalPower, idlePower) - m.EnergyUsage[dynM].SetDeltaStat(id, dynPower) - klog.V(6).Infof("Dynamic Energy stat: %v (%s)", m.EnergyUsage[dynM], id) + s.EnergyUsage[dynM].SetDeltaStat(id, dynPower) + klog.V(6).Infof("Dynamic Energy stat: %v (%s)", s.EnergyUsage[dynM], id) } +// calcDynEnergy calculates the dynamic energy. func calcDynEnergy(totalE, idleE uint64) uint64 { if (totalE == 0) || (idleE == 0) || (totalE < idleE) { return 0 @@ -160,6 +180,7 @@ func calcDynEnergy(totalE, idleE uint64) uint64 { return totalE - idleE } +// normalize normalizes the value if required. func normalize(val float64, shouldNormalize bool) float64 { if shouldNormalize { return val / float64(config.SamplePeriodSec()) @@ -167,77 +188,79 @@ func normalize(val float64, shouldNormalize bool) float64 { return val } -// ToEstimatorValues return values regarding metricNames. +// ToEstimatorValues returns values for the specified metric names, normalized if required. // The metrics can be related to resource utilization or power consumption. -// Since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, and the power models are trained to estimate power in 1 second interval, -// it is necessary to normalize the resource utilization by the SamplePeriodSec. Note that this is important because the power curve can be different for higher or lower resource usage within 1 second interval. -func (m *Stats) ToEstimatorValues(featuresName []string, shouldNormalize bool) []float64 { +// Since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, +// and the power models are trained to estimate power in 1 second interval. It is necessary to +// normalize the resource utilization by the SamplePeriodSec. This is important because the power +// curve can be different for higher or lower resource usage within 1 second interval. +func (s *Stats) ToEstimatorValues(featuresName []string, shouldNormalize bool) []float64 { featureValues := []float64{} for _, feature := range featuresName { - // verify all metrics that are part of the node resource usage metrics - if value, exists := m.ResourceUsage[feature]; exists { + // Verify all metrics that are part of the node resource usage metrics. + if value, exists := s.ResourceUsage[feature]; exists { featureValues = append(featureValues, normalize(float64(value.SumAllDeltaValues()), shouldNormalize)) continue } - // some features are not related to resource utilization, such as power metrics + // Some features are not related to resource utilization, such as power metrics. switch feature { - case config.GeneralUsageMetric(): // is an empty string for UNCORE and OTHER resource usage + case config.GeneralUsageMetric(): // Is an empty string for UNCORE and OTHER resource usage. featureValues = append(featureValues, 0) - case config.DynEnergyInPkg: // for dynamic PKG power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInPkg].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInPkg: // For dynamic PKG power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInPkg].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInCore: // for dynamic CORE power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInCore].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInCore: // For dynamic CORE power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInCore].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInDRAM: // for dynamic PKG power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInDRAM].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInDRAM: // For dynamic DRAM power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInDRAM].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInUnCore: // for dynamic UNCORE power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInUnCore].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInUnCore: // For dynamic UNCORE power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInUnCore].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInOther: // for dynamic OTHER power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInOther].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInOther: // For dynamic OTHER power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInOther].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInPlatform: // for dynamic PLATFORM power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInPlatform].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInPlatform: // For dynamic PLATFORM power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInPlatform].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInGPU: // for dynamic GPU power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInGPU].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInGPU: // For dynamic GPU power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInGPU].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInPkg: // for idle PKG power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInPkg].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInPkg: // For idle PKG power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInPkg].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInCore: // for idle CORE power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInCore].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInCore: // For idle CORE power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInCore].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInDRAM: // for idle PKG power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInDRAM].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInDRAM: // For idle DRAM power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInDRAM].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInUnCore: // for idle UNCORE power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInUnCore].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInUnCore: // For idle UNCORE power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInUnCore].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInOther: // for idle OTHER power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInOther].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInOther: // For idle OTHER power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInOther].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInPlatform: // for idle PLATFORM power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInPlatform].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInPlatform: // For idle PLATFORM power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInPlatform].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInGPU: // for idle GPU power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInGPU].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInGPU: // For idle GPU power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInGPU].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) default: @@ -247,3 +270,19 @@ func (m *Stats) ToEstimatorValues(featuresName []string, shouldNormalize bool) [ } return featureValues } + +func (s *Stats) AbsEnergyMetrics() []string { + return s.availableMetrics.absEnergyMetrics +} + +func (s *Stats) DynEnergyMetrics() []string { + return s.availableMetrics.dynEnergyMetrics +} + +func (s *Stats) IdleEnergyMetrics() []string { + return s.availableMetrics.idleEnergyMetrics +} + +func (s *Stats) BPFMetrics() []string { + return s.availableMetrics.bpfMetrics +} diff --git a/pkg/collector/stats/stats_test.go b/pkg/collector/stats/stats_test.go index b95a7140a8..2546272f8a 100644 --- a/pkg/collector/stats/stats_test.go +++ b/pkg/collector/stats/stats_test.go @@ -12,7 +12,6 @@ var _ = Describe("Stats", func() { config.GetConfig() config.SetEnabledHardwareCounterMetrics(false) supportedMetrics := bpf.DefaultSupportedMetrics() - InitAvailableParamAndMetrics() exp := []string{} Expect(len(GetProcessFeatureNames(supportedMetrics)) >= len(exp)).To(BeTrue()) }) diff --git a/pkg/collector/stats/test_utils.go b/pkg/collector/stats/test_utils.go index 07ea429d8e..83f95b3c27 100644 --- a/pkg/collector/stats/test_utils.go +++ b/pkg/collector/stats/test_utils.go @@ -37,20 +37,6 @@ func SetMockedCollectorMetrics() { err := gpu.Device().Init() // create structure instances that will be accessed to create a processMetric klog.Fatalln(err) } - - // initialize the Available metrics since they are used to create a new processMetrics instance - AvailableAbsEnergyMetrics = []string{ - config.AbsEnergyInCore, config.AbsEnergyInDRAM, config.AbsEnergyInUnCore, config.AbsEnergyInPkg, - config.AbsEnergyInGPU, config.AbsEnergyInOther, config.AbsEnergyInPlatform, - } - AvailableDynEnergyMetrics = []string{ - config.DynEnergyInCore, config.DynEnergyInDRAM, config.DynEnergyInUnCore, config.DynEnergyInPkg, - config.DynEnergyInGPU, config.DynEnergyInOther, config.DynEnergyInPlatform, - } - AvailableIdleEnergyMetrics = []string{ - config.IdleEnergyInCore, config.IdleEnergyInDRAM, config.IdleEnergyInUnCore, config.IdleEnergyInPkg, - config.IdleEnergyInGPU, config.IdleEnergyInOther, config.IdleEnergyInPlatform, - } } // CreateMockedProcessStats adds two containers with all metrics initialized diff --git a/pkg/collector/stats/utils.go b/pkg/collector/stats/utils.go index f019009139..0fa2d962db 100644 --- a/pkg/collector/stats/utils.go +++ b/pkg/collector/stats/utils.go @@ -17,57 +17,13 @@ limitations under the License. package stats import ( - "fmt" - "os" - "os/exec" - "regexp" - "runtime" - "strconv" - "strings" - "k8s.io/klog/v2" "github.com/sustainable-computing-io/kepler/pkg/bpf" "github.com/sustainable-computing-io/kepler/pkg/config" acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator" - - cpuidv2 "github.com/klauspost/cpuid/v2" - "gopkg.in/yaml.v3" ) -const ( - CPUModelDataPath = "/var/lib/kepler/data/cpus.yaml" - CPUPmuNamePath = "/sys/devices/cpu/caps/pmu_name" - CPUTopologyPath = "/sys/devices/system/cpu/cpu%d/topology/physical_package_id" -) - -type CPUModelData struct { - Core string `yaml:"core"` - Uarch string `yaml:"uarch"` - Family string `yaml:"family"` - Model string `yaml:"model"` - Stepping string `yaml:"stepping"` -} - -type CPUS struct { - cpusInfo []CPUModelData -} - -func InitAvailableParamAndMetrics() { - AvailableAbsEnergyMetrics = []string{ - config.AbsEnergyInCore, config.AbsEnergyInDRAM, config.AbsEnergyInUnCore, config.AbsEnergyInPkg, - config.AbsEnergyInGPU, config.AbsEnergyInOther, config.AbsEnergyInPlatform, - } - AvailableDynEnergyMetrics = []string{ - config.DynEnergyInCore, config.DynEnergyInDRAM, config.DynEnergyInUnCore, config.DynEnergyInPkg, - config.DynEnergyInGPU, config.DynEnergyInOther, config.DynEnergyInPlatform, - } - AvailableIdleEnergyMetrics = []string{ - config.IdleEnergyInCore, config.IdleEnergyInDRAM, config.IdleEnergyInUnCore, config.IdleEnergyInPkg, - config.IdleEnergyInGPU, config.IdleEnergyInOther, config.IdleEnergyInPlatform, - } -} - func GetProcessFeatureNames(bpfSupportedMetrics bpf.SupportedMetrics) []string { var metrics []string // bpf software counter metrics @@ -91,253 +47,3 @@ func GetProcessFeatureNames(bpfSupportedMetrics bpf.SupportedMetrics) []string { return metrics } - -func NodeName() string { - if nodeName := os.Getenv("NODE_NAME"); nodeName != "" { - return nodeName - } - nodeName, err := os.Hostname() - if err != nil { - klog.Fatalf("could not get the node name: %s", err) - } - return nodeName -} - -func getCPUArch() string { - arch, err := getCPUArchitecture() - if err == nil { - klog.V(3).Infof("Current CPU architecture: %s", arch) - return arch - } - klog.Errorf("getCPUArch failure: %s", err) - return "unknown" -} - -// getX86Architecture() uses "cpuid" tool to detect the current X86 CPU architecture. -// Per "cpuid" source code, the "uarch" section format in output is as follows: -// -// " (uarch synth) = {}, " -// -// Take Intel Xeon 4th Gen Scalable Processor(Codename: Sapphire Rapids) as example: -// $ cpuid -1 |grep uarch -// -// (uarch synth) = Intel Sapphire Rapids {Golden Cove}, Intel 7 -// -// In this example, the expected return string should be: "Sapphire Rapids". -func getX86Architecture() (string, error) { - // use cpuid to get CPUArchitecture - grep := exec.Command("grep", "uarch") - output := exec.Command("cpuid", "-1") - pipe, err := output.StdoutPipe() - if err != nil { - return "", err - } - defer pipe.Close() - - grep.Stdin = pipe - err = output.Start() - if err != nil { - klog.Errorf("cpuid command start failure: %s", err) - return "", err - } - res, err := grep.Output() - if err != nil { - klog.Errorf("grep cpuid command output failure: %s", err) - return "", err - } - - // format the CPUArchitecture result, the raw "uarch synth" string is like this: - // (uarch synth) = {}, - // example 1: "(uarch synth) = Intel Sapphire Rapids {Golden Cove}, Intel 7" - // example 2: "(uarch synth) = AMD Zen 2, 7nm", here the info is missing. - uarchSection := strings.Split(string(res), "=") - if len(uarchSection) != 2 { - return "", fmt.Errorf("cpuid grep output is unexpected") - } - // get the string contains only vendor/uarch/family info - // example 1: "Intel Sapphire Rapids {Golden Cove}" - // example 2: "AMD Zen 2" - vendorUarchFamily := strings.Split(strings.TrimSpace(uarchSection[1]), ",")[0] - - // remove the family info if necessary - var vendorUarch string - if strings.Contains(vendorUarchFamily, "{") { - vendorUarch = strings.TrimSpace(strings.Split(vendorUarchFamily, "{")[0]) - } else { - vendorUarch = vendorUarchFamily - } - - // get the uarch finally, e.g. "Sapphire Rapids", "Zen 2". - start := strings.Index(vendorUarch, " ") + 1 - uarch := vendorUarch[start:] - - if err = output.Wait(); err != nil { - klog.Errorf("cpuid command is not properly completed: %s", err) - } - - return uarch, err -} - -func getS390xArchitecture() (string, error) { - // use lscpu to get CPUArchitecture - grep := exec.Command("grep", "Machine type:") - output := exec.Command("lscpu") - pipe, err := output.StdoutPipe() - if err != nil { - return "", err - } - defer pipe.Close() - - grep.Stdin = pipe - err = output.Start() - if err != nil { - klog.Errorf("lscpu command start failure: %s", err) - return "", err - } - res, err := grep.Output() - if err != nil { - klog.Errorf("grep lscpu command output failure: %s", err) - return "", err - } - - // format the CPUArchitecture result - uarch := strings.Split(string(res), ":") - if len(uarch) != 2 { - return "", fmt.Errorf("lscpu grep output is unexpected") - } - - if err = output.Wait(); err != nil { - klog.Errorf("lscpu command is not properly completed: %s", err) - } - - return fmt.Sprintf("zSystems model %s", strings.TrimSpace(uarch[1])), err -} - -// readCPUModelData() reads the CPU model data from CPUModelDataPath and if -// the file does not exist, reads the data from ./data/cpus.yaml. -func readCPUModelData() ([]byte, error) { - data, err := os.ReadFile(CPUModelDataPath) - if err == nil { - return data, nil - } - if !os.IsNotExist(err) { - return nil, err - } - - klog.Infof("%s not found, reading local cpus.yaml", CPUModelDataPath) - return os.ReadFile("./data/cpus.yaml") -} - -// There are three options for Kepler to detect CPU microarchitecture: -// 1. Use tools such as 'cpuid', 'archspec', etc, to directly fetch CPU microarchitecture. -// -// 2. Use Golang libraries to detect the CPU's 'family'/'model'/'stepping' information, -// then use it to fetch the microarchitecture from community's CPU models data file. -// -// 3. Read Linux SYSFS file '/sys/devices/cpu/pmu_name', use the content as uarch. -// -// Option #1 is the first choice in Kepler, if the tools are N/A on platforms, -// getCPUMicroarchitecture() provides option #2 and #3 as the alternative solution. - -func getCPUMicroArchitecture(family, model, stepping string) (string, error) { - yamlBytes, err := readCPUModelData() - if err != nil { - klog.Errorf("failed to read cpus.yaml: %v", err) - return "", err - } - cpus := &CPUS{ - cpusInfo: []CPUModelData{}, - } - err = yaml.Unmarshal(yamlBytes, &cpus.cpusInfo) - if err != nil { - klog.Errorf("failed to parse cpus.yaml: %v", err) - return "", err - } - - for _, info := range cpus.cpusInfo { - // if family matches - if info.Family == family { - var reModel *regexp.Regexp - reModel, err = regexp.Compile(info.Model) - if err != nil { - return "", err - } - // if model matches - if reModel.FindString(model) == model { - // if there is a stepping - if info.Stepping != "" { - var reStepping *regexp.Regexp - reStepping, err = regexp.Compile(info.Stepping) - if err != nil { - return "", err - } - // if stepping does NOT match - if reStepping.FindString(stepping) == "" { - // no match - continue - } - } - return info.Uarch, nil - } - } - } - klog.V(3).Infof("CPU match not found for family %s, model %s, stepping %s. Use pmu_name as uarch.", family, model, stepping) - // fallback to option #3 - return getCPUPmuName() -} - -func getCPUPmuName() (pmuName string, err error) { - var data []byte - if data, err = os.ReadFile(CPUPmuNamePath); err != nil { - klog.V(3).Infoln(err) - return - } - pmuName = string(data) - return -} - -func getCPUArchitecture() (string, error) { - // check if there is a CPU architecture override - cpuArchOverride := config.CPUArchOverride() - if cpuArchOverride != "" { - klog.V(2).Infof("cpu arch override: %v\n", cpuArchOverride) - return cpuArchOverride, nil - } - - var ( - myCPUArch string - err error - ) - // get myCPUArch for x86-64 and ARM CPUs, for s390x CPUs, directly return result. - if runtime.GOARCH == "amd64" { - myCPUArch, err = getX86Architecture() - } else if runtime.GOARCH == "s390x" { - return getS390xArchitecture() - } else { - myCPUArch, err = getCPUPmuName() - } - - if err == nil { - return myCPUArch, nil - } else { - f, m, s := strconv.Itoa(cpuidv2.CPU.Family), strconv.Itoa(cpuidv2.CPU.Model), strconv.Itoa(cpuidv2.CPU.Stepping) - return getCPUMicroArchitecture(f, m, s) - } -} - -func getCPUPackageMap() (cpuPackageMap map[int32]string) { - cpuPackageMap = make(map[int32]string) - // check if mapping available - numCPU := int32(runtime.NumCPU()) - for cpu := int32(0); cpu < numCPU; cpu++ { - targetFileName := fmt.Sprintf(CPUTopologyPath, cpu) - value, err := os.ReadFile(targetFileName) - if err != nil { - klog.Errorf("cannot get CPU-Package map: %v", err) - return - } - cpuPackageMap[cpu] = strings.TrimSpace(string(value)) - } - klog.V(3).Infof("CPU-Package Map: %v\n", cpuPackageMap) - return -} diff --git a/pkg/collector/stats/vm_stats.go b/pkg/collector/stats/vm_stats.go index 2855bfb705..2da8c53865 100644 --- a/pkg/collector/stats/vm_stats.go +++ b/pkg/collector/stats/vm_stats.go @@ -18,17 +18,6 @@ package stats import "github.com/sustainable-computing-io/kepler/pkg/bpf" -var ( - // VMMetricNames holds the list of names of the vm metric - VMMetricNames []string - // VMFloatFeatureNames holds the feature name of the vm float stats. This is specific for the machine-learning based models. - VMFloatFeatureNames []string = []string{} - // VMUintFeaturesNames holds the feature name of the vm utint stats. This is specific for the machine-learning based models. - VMUintFeaturesNames []string - // VMFeaturesNames holds all the feature name of the vm stats. This is specific for the machine-learning based models. - VMFeaturesNames []string -) - type VMStats struct { Stats PID uint64 diff --git a/pkg/kubernetes/watcher.go b/pkg/kubernetes/watcher.go index 65cd1f1155..ad7e5023da 100644 --- a/pkg/kubernetes/watcher.go +++ b/pkg/kubernetes/watcher.go @@ -37,6 +37,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/bpf" "github.com/sustainable-computing-io/kepler/pkg/collector/stats" "github.com/sustainable-computing-io/kepler/pkg/config" + "github.com/sustainable-computing-io/kepler/pkg/node" ) const ( @@ -109,7 +110,7 @@ func NewObjListWatcher(bpfSupportedMetrics bpf.SupportedMetrics) (*ObjListWatche return w, nil } optionsModifier := func(options *metav1.ListOptions) { - options.FieldSelector = fields.Set{"spec.nodeName": stats.NodeName()}.AsSelector().String() // to filter events per node + options.FieldSelector = fields.Set{"spec.nodeName": node.Name()}.AsSelector().String() // to filter events per node } objListWatcher := cache.NewFilteredListWatchFromClient( w.k8sCli.CoreV1().RESTClient(), diff --git a/pkg/metrics/node/metrics.go b/pkg/metrics/node/metrics.go index 95e2f43d60..050642acd9 100644 --- a/pkg/metrics/node/metrics.go +++ b/pkg/metrics/node/metrics.go @@ -85,7 +85,7 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { // update node info ch <- c.collectors["info"].MustMetric(1, - stats.NodeCPUArchitecture(), + c.NodeStats.CPUArchitecture(), components.GetSourceName(), platform.GetSourceName(), ) diff --git a/pkg/metrics/prometheus_collector_test.go b/pkg/metrics/prometheus_collector_test.go index 0d6e5173f1..dd6a547b1a 100644 --- a/pkg/metrics/prometheus_collector_test.go +++ b/pkg/metrics/prometheus_collector_test.go @@ -88,10 +88,7 @@ var _ = Describe("Test Prometheus Collector Unit", func() { nodeStats.UpdateDynEnergy() - model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), - stats.NodeMetadataFeatureNames(), - stats.NodeMetadataFeatureValues(), - bpfSupportedMetrics) + model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), bpfSupportedMetrics) model.UpdateProcessEnergy(processStats, &nodeStats) // get metrics from prometheus diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go index 43f8eb43db..9d66ee1515 100644 --- a/pkg/metrics/utils/utils.go +++ b/pkg/metrics/utils/utils.go @@ -150,7 +150,7 @@ func collectEnergy(ch chan<- prometheus.Metric, instance interface{}, metricName if _, exist := node.EnergyUsage[metricName]; exist { for deviceID, utilization := range node.EnergyUsage[metricName] { value = float64(utilization.GetAggr()) / JouleMillijouleConversionFactor - labelValues = []string{deviceID, stats.NodeName(), mode} + labelValues = []string{deviceID, v.NodeName(), mode} collect(ch, collector, value, labelValues) } } @@ -208,7 +208,7 @@ func CollectResUtil(ch chan<- prometheus.Metric, instance interface{}, metricNam if _, exist := node.ResourceUsage[metricName]; exist { for deviceID, utilization := range node.ResourceUsage[metricName] { value = float64(utilization.GetAggr()) - labelValues = []string{deviceID, stats.NodeName()} + labelValues = []string{deviceID, v.NodeName()} collect(ch, collector, value, labelValues) } } diff --git a/pkg/model/benchmark_test.go b/pkg/model/benchmark_test.go index d4f8a16f47..74c81e68bd 100644 --- a/pkg/model/benchmark_test.go +++ b/pkg/model/benchmark_test.go @@ -44,7 +44,7 @@ func benchmarkNtesting(b *testing.B, processNumber int) { // The default estimator model is the ratio bpfSupportedMetrics := bpf.DefaultSupportedMetrics() - model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames(), stats.NodeMetadataFeatureValues(), bpfSupportedMetrics) + model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), bpfSupportedMetrics) // update container and node metrics b.ReportAllocs() diff --git a/pkg/model/model.go b/pkg/model/model.go index 7aa7cc35ab..417a951106 100644 --- a/pkg/model/model.go +++ b/pkg/model/model.go @@ -71,12 +71,12 @@ type PowerModelInterface interface { } // CreatePowerEstimatorModels checks validity of power model and set estimate functions -func CreatePowerEstimatorModels(processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string, bpfSupportedMetrics bpf.SupportedMetrics) { +func CreatePowerEstimatorModels(processFeatureNames []string, bpfSupportedMetrics bpf.SupportedMetrics) { config.InitModelConfigMap() - CreateProcessPowerEstimatorModel(processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues, bpfSupportedMetrics) + CreateProcessPowerEstimatorModel(processFeatureNames, bpfSupportedMetrics) // Node power estimator uses the process features to estimate node power, expect for the Ratio power model that contains additional metrics. - CreateNodePlatformPowerEstimatorModel(processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues) - CreateNodeComponentPowerEstimatorModel(processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues) + CreateNodePlatformPoweEstimatorModel(processFeatureNames) + CreateNodeComponentPowerEstimatorModel(processFeatureNames) } // createPowerModelEstimator called by CreatePowerEstimatorModels to initiate estimate function for each power model. diff --git a/pkg/model/node_component_energy.go b/pkg/model/node_component_energy.go index 78d2c38f23..d5e78992f2 100644 --- a/pkg/model/node_component_energy.go +++ b/pkg/model/node_component_energy.go @@ -22,6 +22,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/collector/stats" "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/model/types" + "github.com/sustainable-computing-io/kepler/pkg/node" "github.com/sustainable-computing-io/kepler/pkg/sensors/components" "github.com/sustainable-computing-io/kepler/pkg/sensors/components/source" "k8s.io/klog/v2" @@ -31,7 +32,9 @@ import ( var nodeComponentPowerModel PowerModelInterface // createNodeComponentPowerModelConfig: the node component power model url must be set by default. -func createNodeComponentPowerModelConfig(nodeFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string) *types.ModelConfig { +func createNodeComponentPowerModelConfig(nodeFeatureNames []string) *types.ModelConfig { + systemMetaDataFeatureNames := node.MetadataFeatureNames() + systemMetaDataFeatureValues := node.MetadataFeatureValues() modelConfig := CreatePowerModelConfig(config.NodeComponentsPowerKey()) if modelConfig.InitModelURL == "" { modelConfig.InitModelFilepath = config.GetDefaultPowerModelURL(modelConfig.ModelOutputType.String(), types.ComponentEnergySource) @@ -44,25 +47,21 @@ func createNodeComponentPowerModelConfig(nodeFeatureNames, systemMetaDataFeature } // CreateNodeComponentPowerEstimatorModel only create a new power model estimator if node components power metrics are not available -func CreateNodeComponentPowerEstimatorModel(nodeFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string) { - if components.IsSystemCollectionSupported() { - klog.Infof("Skipping creation of Node Component Power Model since the system collection is supported") - return - } - - modelConfig := createNodeComponentPowerModelConfig(nodeFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues) - // init func for NodeComponentPower +func CreateNodeComponentPowerEstimatorModel(nodeFeatureNames []string) { var err error - nodeComponentPowerModel, err = createPowerModelEstimator(modelConfig) - if err != nil { - klog.Errorf("Failed to create %s/%s Model from %s to estimate Node Component Power: %v", - modelConfig.ModelType, modelConfig.ModelOutputType, - modelConfig.SourceURL(), err) + if !components.IsSystemCollectionSupported() { + modelConfig := createNodeComponentPowerModelConfig(nodeFeatureNames) + // init func for NodeComponentPower + nodeComponentPowerModel, err = createPowerModelEstimator(modelConfig) + if err == nil { + klog.V(1).Infof("Using the %s Power Model to estimate Node Component Power", modelConfig.ModelType.String()+"/"+modelConfig.ModelOutputType.String()) + } else { + klog.Infof("Failed to create %s Power Model to estimate Node Component Power: %v\n", modelConfig.ModelType.String()+"/"+modelConfig.ModelOutputType.String(), err) + } + } else { + klog.Infof("Skipping creation of Node Component Power Model since the system collection is supported") return } - - klog.V(1).Infof("Using the %s/%s Model from %s to estimate Node Component Power", - modelConfig.ModelType, modelConfig.ModelOutputType, modelConfig.SourceURL()) } // IsNodeComponentPowerModelEnabled returns if the estimator has been enabled or not @@ -100,12 +99,12 @@ func GetNodeComponentPowers(nodeMetrics *stats.NodeStats, isIdlePower bool) (nod // UpdateNodeComponentEnergy sets the power model samples, get absolute powers, and set gauge value for each component energy func UpdateNodeComponentEnergy(nodeMetrics *stats.NodeStats) { - addEnergy(nodeMetrics, stats.AvailableAbsEnergyMetrics, absPower) + addEnergy(nodeMetrics, nodeMetrics.AbsEnergyMetrics(), absPower) } // UpdateNodeComponentIdleEnergy sets the power model samples to zeros, get idle powers, and set gauge value for each component idle energy func UpdateNodeComponentIdleEnergy(nodeMetrics *stats.NodeStats) { - addEnergy(nodeMetrics, stats.AvailableIdleEnergyMetrics, idlePower) + addEnergy(nodeMetrics, nodeMetrics.IdleEnergyMetrics(), idlePower) } func addEnergy(nodeMetrics *stats.NodeStats, metrics []string, isIdle bool) { diff --git a/pkg/model/node_platform_energy.go b/pkg/model/node_platform_energy.go index 1617aa3d17..346f82266b 100644 --- a/pkg/model/node_platform_energy.go +++ b/pkg/model/node_platform_energy.go @@ -22,6 +22,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/collector/stats" "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/model/types" + "github.com/sustainable-computing-io/kepler/pkg/node" "github.com/sustainable-computing-io/kepler/pkg/sensors/platform" "k8s.io/klog/v2" ) @@ -32,32 +33,28 @@ const ( var nodePlatformPowerModel PowerModelInterface -// CreateNodeComponentPoweEstimatorModel only create a new power model estimator if node platform power metrics are not available -func CreateNodePlatformPowerEstimatorModel(nodeFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string) { - if platform.IsSystemCollectionSupported() { - klog.Infof("Skipping creation of Node Platform Power Model since the system collection is supported") +// CreateNodeComponentPowerEstimatorModel only create a new power model estimator if node platform power metrics are not available +func CreateNodePlatformPoweEstimatorModel(nodeFeatureNames []string) { + systemMetaDataFeatureNames := node.MetadataFeatureNames() + systemMetaDataFeatureValues := node.MetadataFeatureValues() + if !platform.IsSystemCollectionSupported() { + modelConfig := CreatePowerModelConfig(config.NodePlatformPowerKey()) + if modelConfig.InitModelURL == "" { + modelConfig.InitModelFilepath = config.GetDefaultPowerModelURL(modelConfig.ModelOutputType.String(), types.PlatformEnergySource) + } + modelConfig.NodeFeatureNames = nodeFeatureNames + modelConfig.SystemMetaDataFeatureNames = systemMetaDataFeatureNames + modelConfig.SystemMetaDataFeatureValues = systemMetaDataFeatureValues + modelConfig.IsNodePowerModel = true + // init func for NodeTotalPower + var err error + nodePlatformPowerModel, err = createPowerModelEstimator(modelConfig) + if err == nil { + klog.V(1).Infof("Using the %s Power Model to estimate Node Platform Power", modelConfig.ModelType.String()+"/"+modelConfig.ModelOutputType.String()) + } else { + klog.Infof("Failed to create %s Power Model to estimate Node Platform Power: %v\n", modelConfig.ModelType.String()+"/"+modelConfig.ModelOutputType.String(), err) + } } - - modelConfig := CreatePowerModelConfig(config.NodePlatformPowerKey()) - if modelConfig.InitModelURL == "" { - modelConfig.InitModelFilepath = config.GetDefaultPowerModelURL(modelConfig.ModelOutputType.String(), types.PlatformEnergySource) - } - modelConfig.NodeFeatureNames = nodeFeatureNames - modelConfig.SystemMetaDataFeatureNames = systemMetaDataFeatureNames - modelConfig.SystemMetaDataFeatureValues = systemMetaDataFeatureValues - modelConfig.IsNodePowerModel = true - // - // init func for NodeTotalPower - var err error - nodePlatformPowerModel, err = createPowerModelEstimator(modelConfig) - if err != nil { - klog.Errorf("Failed to create %s/%s Model from %s to estimate Node Platform Power: %v", - modelConfig.ModelType, modelConfig.ModelOutputType, - modelConfig.SourceURL(), err) - return - } - klog.V(1).Infof("Using the %s/%s Model from %s to estimate Node Platform Power", - modelConfig.ModelType, modelConfig.ModelOutputType, modelConfig.SourceURL()) } // IsNodePlatformPowerModelEnabled returns if the estimator has been enabled or not diff --git a/pkg/model/process_energy.go b/pkg/model/process_energy.go index 98fda79737..cee07ef56f 100644 --- a/pkg/model/process_energy.go +++ b/pkg/model/process_energy.go @@ -23,6 +23,7 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/collector/stats" "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/model/types" + "github.com/sustainable-computing-io/kepler/pkg/node" acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator" "github.com/sustainable-computing-io/kepler/pkg/sensors/components/source" "github.com/sustainable-computing-io/kepler/pkg/utils" @@ -35,7 +36,9 @@ var ( ) // createProcessPowerModelConfig: the process component power model must be set by default. -func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string, energySource string, bpfSupportedMetrics bpf.SupportedMetrics) (modelConfig *types.ModelConfig) { +func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames []string, energySource string, bpfSupportedMetrics bpf.SupportedMetrics) (modelConfig *types.ModelConfig) { + systemMetaDataFeatureNames := node.MetadataFeatureNames() + systemMetaDataFeatureValues := node.MetadataFeatureValues() modelConfig = CreatePowerModelConfig(powerSourceTarget) if modelConfig == nil { return nil @@ -105,9 +108,9 @@ func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames return modelConfig } -func CreateProcessPowerEstimatorModel(processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string, bpfSupportedMetrics bpf.SupportedMetrics) { +func CreateProcessPowerEstimatorModel(processFeatureNames []string, bpfSupportedMetrics bpf.SupportedMetrics) { var err error - modelConfig := createProcessPowerModelConfig(config.ProcessPlatformPowerKey(), processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues, types.PlatformEnergySource, bpfSupportedMetrics) + modelConfig := createProcessPowerModelConfig(config.ProcessPlatformPowerKey(), processFeatureNames, types.PlatformEnergySource, bpfSupportedMetrics) modelConfig.IsNodePowerModel = false processPlatformPowerModel, err = createPowerModelEstimator(modelConfig) if err == nil { @@ -117,7 +120,7 @@ func CreateProcessPowerEstimatorModel(processFeatureNames, systemMetaDataFeature klog.Infof("Failed to create %s Power Model to estimate Process Platform Power: %v\n", modelConfig.ModelType.String()+"/"+modelConfig.ModelOutputType.String(), err) } - modelConfig = createProcessPowerModelConfig(config.ProcessComponentsPowerKey(), processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues, types.ComponentEnergySource, bpfSupportedMetrics) + modelConfig = createProcessPowerModelConfig(config.ProcessComponentsPowerKey(), processFeatureNames, types.ComponentEnergySource, bpfSupportedMetrics) modelConfig.IsNodePowerModel = false processComponentPowerModel, err = createPowerModelEstimator(modelConfig) if err == nil { diff --git a/pkg/model/process_energy_test.go b/pkg/model/process_energy_test.go index d5ccf9a11d..f6e355274a 100644 --- a/pkg/model/process_energy_test.go +++ b/pkg/model/process_energy_test.go @@ -33,9 +33,6 @@ var _ = Describe("ProcessPower", func() { var ( processStats map[uint64]*stats.ProcessStats nodeStats stats.NodeStats - - systemMetaDataFeatureNames = []string{"cpu_architecture"} - systemMetaDataFeatureValues = []string{"Sandy Bridge"} ) Context("with manually defined node power", func() { @@ -69,7 +66,7 @@ var _ = Describe("ProcessPower", func() { // getEstimatorMetrics bpfSupportedMetrics := bpf.DefaultSupportedMetrics() - CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), systemMetaDataFeatureNames, systemMetaDataFeatureValues, bpfSupportedMetrics) + CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), bpfSupportedMetrics) // initialize the node energy with aggregated energy, which will be used to calculate delta energy // add first values to be the idle power @@ -105,7 +102,7 @@ var _ = Describe("ProcessPower", func() { // getEstimatorMetrics bpfSupportedMetrics := bpf.DefaultSupportedMetrics() - CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), systemMetaDataFeatureNames, systemMetaDataFeatureValues, bpfSupportedMetrics) + CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), bpfSupportedMetrics) // initialize the node energy with aggregated energy, which will be used to calculate delta energy // add first values to be the idle power diff --git a/pkg/node/node.go b/pkg/node/node.go new file mode 100644 index 0000000000..4b98fd1c7d --- /dev/null +++ b/pkg/node/node.go @@ -0,0 +1,293 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package node + +import ( + "fmt" + "os" + "os/exec" + "regexp" + "runtime" + "strconv" + "strings" + + cpuidv2 "github.com/klauspost/cpuid/v2" + "gopkg.in/yaml.v3" + "k8s.io/klog/v2" + + "github.com/sustainable-computing-io/kepler/pkg/config" +) + +const ( + cpuModelDataPath = "/var/lib/kepler/data/cpus.yaml" + cpuPmuNamePath = "/sys/devices/cpu/caps/pmu_name" +) + +type nodeInfo struct { + name string + cpuArchitecture string + metadataFeatureNames []string + metadataFeatureValues []string +} + +type Node interface { + Name() string + CPUArchitecture() string + MetadataFeatureNames() []string + MetadataFeatureValues() []string +} + +type cpuModelData struct { + uarch string `yaml:"uarch"` + family string `yaml:"family"` + model string `yaml:"model"` + stepping string `yaml:"stepping"` +} + +type cpuInfo struct { + cpusInfo []cpuModelData +} + +func Name() string { + return nodeName() +} + +func CPUArchitecture() string { + return cpuArch() +} + +func MetadataFeatureNames() []string { + return []string{"cpu_architecture"} +} + +func MetadataFeatureValues() []string { + cpuArchitecture := cpuArch() + return []string{cpuArchitecture} +} + +func NewNodeInfo() Node { + cpuArchitecture := cpuArch() + return &nodeInfo{ + name: nodeName(), + cpuArchitecture: cpuArchitecture, + metadataFeatureNames: []string{"cpu_architecture"}, + metadataFeatureValues: []string{cpuArchitecture}, + } +} + +func (ni *nodeInfo) Name() string { + return ni.name +} + +func (ni *nodeInfo) CPUArchitecture() string { + return ni.cpuArchitecture +} + +func (ni *nodeInfo) MetadataFeatureNames() []string { + return ni.metadataFeatureNames +} + +func (ni *nodeInfo) MetadataFeatureValues() []string { + return ni.metadataFeatureValues +} + +func nodeName() string { + if nodeName := os.Getenv("NODE_NAME"); nodeName != "" { + return nodeName + } + nodeName, err := os.Hostname() + if err != nil { + klog.Fatalf("could not get the node name: %s", err) + } + return nodeName +} + +func cpuArch() string { + if arch, err := getCPUArchitecture(); err == nil { + klog.V(3).Infof("Current CPU architecture: %s", arch) + return arch + } else { + klog.Errorf("getCPUArch failure: %s", err) + return "unknown" + } +} + +func x86Architecture() (string, error) { + grep := exec.Command("grep", "uarch") + output := exec.Command("cpuid", "-1") + pipe, err := output.StdoutPipe() + if err != nil { + return "", err + } + defer pipe.Close() + + grep.Stdin = pipe + err = output.Start() + if err != nil { + klog.Errorf("cpuid command start failure: %s", err) + return "", err + } + res, err := grep.Output() + if err != nil { + klog.Errorf("grep cpuid command output failure: %s", err) + return "", err + } + + uarchSection := strings.Split(string(res), "=") + if len(uarchSection) != 2 { + return "", fmt.Errorf("cpuid grep output is unexpected") + } + vendorUarchFamily := strings.Split(strings.TrimSpace(uarchSection[1]), ",")[0] + + var vendorUarch string + if strings.Contains(vendorUarchFamily, "{") { + vendorUarch = strings.TrimSpace(strings.Split(vendorUarchFamily, "{")[0]) + } else { + vendorUarch = vendorUarchFamily + } + + start := strings.Index(vendorUarch, " ") + 1 + uarch := vendorUarch[start:] + + if err = output.Wait(); err != nil { + klog.Errorf("cpuid command is not properly completed: %s", err) + } + + return uarch, err +} + +func s390xArchitecture() (string, error) { + grep := exec.Command("grep", "Machine type:") + output := exec.Command("lscpu") + pipe, err := output.StdoutPipe() + if err != nil { + return "", err + } + defer pipe.Close() + + grep.Stdin = pipe + err = output.Start() + if err != nil { + klog.Errorf("lscpu command start failure: %s", err) + return "", err + } + res, err := grep.Output() + if err != nil { + klog.Errorf("grep lscpu command output failure: %s", err) + return "", err + } + + uarch := strings.Split(string(res), ":") + if len(uarch) != 2 { + return "", fmt.Errorf("lscpu grep output is unexpected") + } + + if err = output.Wait(); err != nil { + klog.Errorf("lscpu command is not properly completed: %s", err) + } + + return fmt.Sprintf("zSystems model %s", strings.TrimSpace(uarch[1])), err +} + +func readCPUModelData() ([]byte, error) { + data, err := os.ReadFile(cpuModelDataPath) + if err == nil { + return data, nil + } + if !os.IsNotExist(err) { + return nil, err + } + + klog.Infof("%s not found, reading local cpus.yaml", cpuModelDataPath) + return os.ReadFile("./data/cpus.yaml") +} + +func cpuMicroArchitecture(family, model, stepping string) (string, error) { + yamlBytes, err := readCPUModelData() + if err != nil { + klog.Errorf("failed to read cpus.yaml: %v", err) + return "", err + } + cpus := &cpuInfo{ + cpusInfo: []cpuModelData{}, + } + err = yaml.Unmarshal(yamlBytes, &cpus.cpusInfo) + if err != nil { + klog.Errorf("failed to parse cpus.yaml: %v", err) + return "", err + } + + for _, info := range cpus.cpusInfo { + if info.family == family { + var reModel *regexp.Regexp + reModel, err = regexp.Compile(info.model) + if err != nil { + return "", err + } + if reModel.FindString(model) == model { + if info.stepping != "" { + var reStepping *regexp.Regexp + reStepping, err = regexp.Compile(info.stepping) + if err != nil { + return "", err + } + if reStepping.FindString(stepping) == "" { + continue + } + } + return info.uarch, nil + } + } + } + klog.V(3).Infof("CPU match not found for family %s, model %s, stepping %s. Use pmu_name as uarch.", family, model, stepping) + return cpuPmuName() +} + +func cpuPmuName() (string, error) { + data, err := os.ReadFile(cpuPmuNamePath) + if err != nil { + klog.V(3).Infoln(err) + return "", err + } + return string(data), nil +} + +func getCPUArchitecture() (string, error) { + cpuArchOverride := config.CPUArchOverride() + if cpuArchOverride != "" { + klog.V(2).Infof("cpu arch override: %v\n", cpuArchOverride) + return cpuArchOverride, nil + } + + var ( + myCPUArch string + err error + ) + if runtime.GOARCH == "amd64" { + myCPUArch, err = x86Architecture() + } else if runtime.GOARCH == "s390x" { + return s390xArchitecture() + } else { + myCPUArch, err = cpuPmuName() + } + + if err == nil { + return myCPUArch, nil + } else { + f, m, s := strconv.Itoa(cpuidv2.CPU.Family), strconv.Itoa(cpuidv2.CPU.Model), strconv.Itoa(cpuidv2.CPU.Stepping) + return cpuMicroArchitecture(f, m, s) + } +} diff --git a/pkg/nodecred/csv_cred.go b/pkg/nodecred/csv_cred.go index 81ce46b9c0..1a05c3f676 100644 --- a/pkg/nodecred/csv_cred.go +++ b/pkg/nodecred/csv_cred.go @@ -21,7 +21,7 @@ import ( "fmt" "os" - metric_util "github.com/sustainable-computing-io/kepler/pkg/collector/stats" + "github.com/sustainable-computing-io/kepler/pkg/node" "k8s.io/klog/v2" ) @@ -61,7 +61,7 @@ func (c csvNodeCred) IsSupported(info map[string]string) bool { if filePath == "" { return false } else { - nodeName := metric_util.NodeName() + nodeName := node.Name() // read file from filePath userName, password, host, err := readCSVFile(filePath, nodeName) if err != nil { diff --git a/pkg/nodecred/csv_cred_test.go b/pkg/nodecred/csv_cred_test.go index d0de354396..9f77500132 100644 --- a/pkg/nodecred/csv_cred_test.go +++ b/pkg/nodecred/csv_cred_test.go @@ -20,7 +20,7 @@ import ( "os" "testing" - metric_util "github.com/sustainable-computing-io/kepler/pkg/collector/stats" + "github.com/sustainable-computing-io/kepler/pkg/node" ) func TestGetNodeCredByNodeName(t *testing.T) { @@ -95,7 +95,7 @@ func TestIsSupported(t *testing.T) { // set ENV variable NODE_NAME to "node1" os.Setenv("NODE_NAME", "node1") // check if getNodeName() returns "node1" - nodeName := metric_util.NodeName() + nodeName := node.Name() if nodeName != "node1" { t.Errorf("Expected nodeName: node1, got: %v", nodeName) }