diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go index e4f699944d..5951d037a6 100644 --- a/pkg/collector/stats/stats.go +++ b/pkg/collector/stats/stats.go @@ -26,143 +26,152 @@ import ( "k8s.io/klog/v2" ) -var ( - // Default metric sets. In addition, each metric set is used by the model - // package to estimate different power usage metrics. - // AvailableAbsEnergyMetrics holds a list of absolute energy metrics - AvailableAbsEnergyMetrics = []string{ - config.AbsEnergyInCore, config.AbsEnergyInDRAM, config.AbsEnergyInUnCore, config.AbsEnergyInPkg, - config.AbsEnergyInGPU, config.AbsEnergyInOther, config.AbsEnergyInPlatform, - } - // AvailableDynEnergyMetrics holds a list of dynamic energy metrics - AvailableDynEnergyMetrics = []string{ - config.DynEnergyInCore, config.DynEnergyInDRAM, config.DynEnergyInUnCore, config.DynEnergyInPkg, - config.DynEnergyInGPU, config.DynEnergyInOther, config.DynEnergyInPlatform, - } - // AvailableIdleEnergyMetrics holds a list of idle energy metrics - AvailableIdleEnergyMetrics = []string{ - config.IdleEnergyInCore, config.IdleEnergyInDRAM, config.IdleEnergyInUnCore, config.IdleEnergyInPkg, - config.IdleEnergyInGPU, config.IdleEnergyInOther, config.IdleEnergyInPlatform, - } - // AvailableBPFMetrics holds a list of reasonable default bpf metrics - AvailableBPFMetrics = []string{ - config.CPUCycle, config.CPURefCycle, config.CPUInstruction, config.CacheMiss, config.CPUTime, - config.PageCacheHit, config.IRQNetTXLabel, config.IRQNetRXLabel, config.IRQBlockLabel, - } -) +var defaultBPFMetrics = []string{ + config.CPUCycle, config.CPURefCycle, config.CPUInstruction, config.CacheMiss, config.CPUTime, + config.PageCacheHit, config.IRQNetTXLabel, config.IRQNetRXLabel, config.IRQBlockLabel, +} +// MetricSets stores different sets of metrics for energy and resource usage. 
+type MetricSets struct { + AbsEnergyMetrics []string + DynEnergyMetrics []string + IdleEnergyMetrics []string + BPFMetrics []string +} + +// Stats stores resource and energy usage statistics. type Stats struct { - ResourceUsage map[string]types.UInt64StatCollection - EnergyUsage map[string]types.UInt64StatCollection + ResourceUsage map[string]types.UInt64StatCollection + EnergyUsage map[string]types.UInt64StatCollection + AvailableMetrics *MetricSets +} + +// newMetricSets initializes and returns a new MetricSets instance. +func newMetricSets() *MetricSets { + return &MetricSets{ + AbsEnergyMetrics: []string{ + config.AbsEnergyInCore, config.AbsEnergyInDRAM, config.AbsEnergyInUnCore, config.AbsEnergyInPkg, + config.AbsEnergyInGPU, config.AbsEnergyInOther, config.AbsEnergyInPlatform, + }, + DynEnergyMetrics: []string{ + config.DynEnergyInCore, config.DynEnergyInDRAM, config.DynEnergyInUnCore, config.DynEnergyInPkg, + config.DynEnergyInGPU, config.DynEnergyInOther, config.DynEnergyInPlatform, + }, + IdleEnergyMetrics: []string{ + config.IdleEnergyInCore, config.IdleEnergyInDRAM, config.IdleEnergyInUnCore, config.IdleEnergyInPkg, + config.IdleEnergyInGPU, config.IdleEnergyInOther, config.IdleEnergyInPlatform, + }, + BPFMetrics: defaultBPFMetrics, + } } -// NewStats creates a new Stats instance +// NewStats creates a new Stats instance. func NewStats() *Stats { - m := &Stats{ - ResourceUsage: make(map[string]types.UInt64StatCollection), - EnergyUsage: make(map[string]types.UInt64StatCollection), + stats := &Stats{ + ResourceUsage: make(map[string]types.UInt64StatCollection), + EnergyUsage: make(map[string]types.UInt64StatCollection), + AvailableMetrics: newMetricSets(), } - // initialize the energy metrics in the map - energyMetrics := []string{} - energyMetrics = append(energyMetrics, AvailableDynEnergyMetrics...) - energyMetrics = append(energyMetrics, AvailableAbsEnergyMetrics...) - energyMetrics = append(energyMetrics, AvailableIdleEnergyMetrics...) 
+ // Initialize the energy metrics in the map. + energyMetrics := append([]string{}, stats.AvailableMetrics.AbsEnergyMetrics...) + energyMetrics = append(energyMetrics, stats.AvailableMetrics.DynEnergyMetrics...) + energyMetrics = append(energyMetrics, stats.AvailableMetrics.IdleEnergyMetrics...) for _, metricName := range energyMetrics { - m.EnergyUsage[metricName] = types.NewUInt64StatCollection() + stats.EnergyUsage[metricName] = types.NewUInt64StatCollection() } - // initialize the resource utilization metrics in the map - resMetrics := []string{} - resMetrics = append(resMetrics, AvailableBPFMetrics...) - for _, metricName := range resMetrics { - m.ResourceUsage[metricName] = types.NewUInt64StatCollection() + // Initialize the resource utilization metrics in the map. + for _, metricName := range stats.AvailableMetrics.BPFMetrics { + stats.ResourceUsage[metricName] = types.NewUInt64StatCollection() } if config.EnabledGPU { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { - m.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection() - m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() - m.ResourceUsage[config.IdleEnergyInGPU] = types.NewUInt64StatCollection() + stats.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection() + stats.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() + stats.ResourceUsage[config.IdleEnergyInGPU] = types.NewUInt64StatCollection() } } - return m + return stats } -// ResetDeltaValues reset all current value to 0 -func (m *Stats) ResetDeltaValues() { - for _, stat := range m.ResourceUsage { +// ResetDeltaValues resets all current values to 0. 
+func (s *Stats) ResetDeltaValues() { + for _, stat := range s.ResourceUsage { stat.ResetDeltaValues() } - for metric, stat := range m.EnergyUsage { + for metric, stat := range s.EnergyUsage { if strings.Contains(metric, "idle") { - continue // do not reset the idle power metrics + continue // Do not reset the idle power metrics. } stat.ResetDeltaValues() } } -func (m *Stats) String() string { +func (s *Stats) String() string { return fmt.Sprintf( "\tDyn ePkg (mJ): %s (eCore: %s eDram: %s eUncore: %s) eGPU (mJ): %s eOther (mJ): %s platform (mJ): %s \n"+ "\tIdle ePkg (mJ): %s (eCore: %s eDram: %s eUncore: %s) eGPU (mJ): %s eOther (mJ): %s platform (mJ): %s \n"+ "\tResUsage: %v\n", - m.EnergyUsage[config.DynEnergyInPkg], - m.EnergyUsage[config.DynEnergyInCore], - m.EnergyUsage[config.DynEnergyInDRAM], - m.EnergyUsage[config.DynEnergyInUnCore], - m.EnergyUsage[config.DynEnergyInGPU], - m.EnergyUsage[config.DynEnergyInOther], - m.EnergyUsage[config.DynEnergyInPlatform], - m.EnergyUsage[config.IdleEnergyInPkg], - m.EnergyUsage[config.IdleEnergyInCore], - m.EnergyUsage[config.IdleEnergyInDRAM], - m.EnergyUsage[config.IdleEnergyInUnCore], - m.EnergyUsage[config.IdleEnergyInGPU], - m.EnergyUsage[config.IdleEnergyInOther], - m.EnergyUsage[config.IdleEnergyInPlatform], - m.ResourceUsage) + s.EnergyUsage[config.DynEnergyInPkg], + s.EnergyUsage[config.DynEnergyInCore], + s.EnergyUsage[config.DynEnergyInDRAM], + s.EnergyUsage[config.DynEnergyInUnCore], + s.EnergyUsage[config.DynEnergyInGPU], + s.EnergyUsage[config.DynEnergyInOther], + s.EnergyUsage[config.DynEnergyInPlatform], + s.EnergyUsage[config.IdleEnergyInPkg], + s.EnergyUsage[config.IdleEnergyInCore], + s.EnergyUsage[config.IdleEnergyInDRAM], + s.EnergyUsage[config.IdleEnergyInUnCore], + s.EnergyUsage[config.IdleEnergyInGPU], + s.EnergyUsage[config.IdleEnergyInOther], + s.EnergyUsage[config.IdleEnergyInPlatform], + s.ResourceUsage, + ) } -// UpdateDynEnergy calculates the dynamic energy -func (m *Stats) UpdateDynEnergy() 
{ - for pkgID := range m.EnergyUsage[config.AbsEnergyInPkg] { - m.CalcDynEnergy(config.AbsEnergyInPkg, config.IdleEnergyInPkg, config.DynEnergyInPkg, pkgID) - m.CalcDynEnergy(config.AbsEnergyInCore, config.IdleEnergyInCore, config.DynEnergyInCore, pkgID) - m.CalcDynEnergy(config.AbsEnergyInUnCore, config.IdleEnergyInUnCore, config.DynEnergyInUnCore, pkgID) - m.CalcDynEnergy(config.AbsEnergyInDRAM, config.IdleEnergyInDRAM, config.DynEnergyInDRAM, pkgID) +// UpdateDynEnergy calculates the dynamic energy. +func (s *Stats) UpdateDynEnergy() { + for pkgID := range s.EnergyUsage[config.AbsEnergyInPkg] { + s.CalcDynEnergy(config.AbsEnergyInPkg, config.IdleEnergyInPkg, config.DynEnergyInPkg, pkgID) + s.CalcDynEnergy(config.AbsEnergyInCore, config.IdleEnergyInCore, config.DynEnergyInCore, pkgID) + s.CalcDynEnergy(config.AbsEnergyInUnCore, config.IdleEnergyInUnCore, config.DynEnergyInUnCore, pkgID) + s.CalcDynEnergy(config.AbsEnergyInDRAM, config.IdleEnergyInDRAM, config.DynEnergyInDRAM, pkgID) } - for sensorID := range m.EnergyUsage[config.AbsEnergyInPlatform] { - m.CalcDynEnergy(config.AbsEnergyInPlatform, config.IdleEnergyInPlatform, config.DynEnergyInPlatform, sensorID) + for sensorID := range s.EnergyUsage[config.AbsEnergyInPlatform] { + s.CalcDynEnergy(config.AbsEnergyInPlatform, config.IdleEnergyInPlatform, config.DynEnergyInPlatform, sensorID) } - // gpu metric + // GPU metric if config.EnabledGPU { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { - for gpuID := range m.EnergyUsage[config.AbsEnergyInGPU] { - m.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID) + for gpuID := range s.EnergyUsage[config.AbsEnergyInGPU] { + s.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID) } } } } -// CalcDynEnergy calculate the difference between the absolute and idle energy/power -func (m *Stats) CalcDynEnergy(absM, idleM, dynM, id string) { - if _, exist := m.EnergyUsage[absM][id]; !exist 
{ +// CalcDynEnergy calculates the difference between the absolute and idle energy/power. +func (s *Stats) CalcDynEnergy(absM, idleM, dynM, id string) { + if _, exist := s.EnergyUsage[absM][id]; !exist { return } - totalPower := m.EnergyUsage[absM][id].GetDelta() - klog.V(6).Infof("Absolute Energy stat: %v (%s)", m.EnergyUsage[absM], id) + totalPower := s.EnergyUsage[absM][id].GetDelta() + klog.V(6).Infof("Absolute Energy stat: %v (%s)", s.EnergyUsage[absM], id) idlePower := uint64(0) - if idleStat, found := m.EnergyUsage[idleM][id]; found { + if idleStat, found := s.EnergyUsage[idleM][id]; found { idlePower = idleStat.GetDelta() - klog.V(6).Infof("Idle Energy stat: %v (%s)", m.EnergyUsage[idleM], id) + klog.V(6).Infof("Idle Energy stat: %v (%s)", s.EnergyUsage[idleM], id) } dynPower := calcDynEnergy(totalPower, idlePower) - m.EnergyUsage[dynM].SetDeltaStat(id, dynPower) - klog.V(6).Infof("Dynamic Energy stat: %v (%s)", m.EnergyUsage[dynM], id) + s.EnergyUsage[dynM].SetDeltaStat(id, dynPower) + klog.V(6).Infof("Dynamic Energy stat: %v (%s)", s.EnergyUsage[dynM], id) } +// calcDynEnergy calculates the dynamic energy. func calcDynEnergy(totalE, idleE uint64) uint64 { if (totalE == 0) || (idleE == 0) || (totalE < idleE) { return 0 @@ -170,6 +179,7 @@ func calcDynEnergy(totalE, idleE uint64) uint64 { return totalE - idleE } +// normalize normalizes the value if required. func normalize(val float64, shouldNormalize bool) float64 { if shouldNormalize { return val / float64(config.SamplePeriodSec) @@ -177,77 +187,79 @@ func normalize(val float64, shouldNormalize bool) float64 { return val } -// ToEstimatorValues return values regarding metricNames. +// ToEstimatorValues returns values for the specified metric names, normalized if required. // The metrics can be related to resource utilization or power consumption. 
-// Since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, and the power models are trained to estimate power in 1 second interval, -// it is necessary to normalize the resource utilization by the SamplePeriodSec. Note that this is important because the power curve can be different for higher or lower resource usage within 1 second interval. -func (m *Stats) ToEstimatorValues(featuresName []string, shouldNormalize bool) []float64 { +// Since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, +// and the power models are trained to estimate power in 1 second interval, it is necessary to +// normalize the resource utilization by the SamplePeriodSec. This is important because the power +// curve can be different for higher or lower resource usage within 1 second interval. +func (s *Stats) ToEstimatorValues(featuresName []string, shouldNormalize bool) []float64 { featureValues := []float64{} for _, feature := range featuresName { - // verify all metrics that are part of the node resource usage metrics - if value, exists := m.ResourceUsage[feature]; exists { + // Verify all metrics that are part of the node resource usage metrics. + if value, exists := s.ResourceUsage[feature]; exists { featureValues = append(featureValues, normalize(float64(value.SumAllDeltaValues()), shouldNormalize)) continue } - // some features are not related to resource utilization, such as power metrics + // Some features are not related to resource utilization, such as power metrics. switch feature { - case config.GeneralUsageMetric: // is an empty string for UNCORE and OTHER resource usage + case config.GeneralUsageMetric: // Is an empty string for UNCORE and OTHER resource usage.
featureValues = append(featureValues, 0) - case config.DynEnergyInPkg: // for dynamic PKG power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInPkg].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInPkg: // For dynamic PKG power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInPkg].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInCore: // for dynamic CORE power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInCore].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInCore: // For dynamic CORE power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInCore].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInDRAM: // for dynamic PKG power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInDRAM].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInDRAM: // For dynamic DRAM power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInDRAM].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInUnCore: // for dynamic UNCORE power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInUnCore].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInUnCore: // For dynamic UNCORE power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInUnCore].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInOther: // for dynamic OTHER power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInOther].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInOther: // For dynamic OTHER power consumption. 
+ value := normalize(float64(s.EnergyUsage[config.DynEnergyInOther].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInPlatform: // for dynamic PLATFORM power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInPlatform].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInPlatform: // For dynamic PLATFORM power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInPlatform].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.DynEnergyInGPU: // for dynamic GPU power consumption - value := normalize(float64(m.EnergyUsage[config.DynEnergyInGPU].SumAllDeltaValues()), shouldNormalize) + case config.DynEnergyInGPU: // For dynamic GPU power consumption. + value := normalize(float64(s.EnergyUsage[config.DynEnergyInGPU].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInPkg: // for idle PKG power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInPkg].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInPkg: // For idle PKG power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInPkg].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInCore: // for idle CORE power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInCore].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInCore: // For idle CORE power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInCore].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInDRAM: // for idle PKG power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInDRAM].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInDRAM: // For idle DRAM power consumption. 
+ value := normalize(float64(s.EnergyUsage[config.IdleEnergyInDRAM].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInUnCore: // for idle UNCORE power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInUnCore].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInUnCore: // For idle UNCORE power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInUnCore].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInOther: // for idle OTHER power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInOther].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInOther: // For idle OTHER power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInOther].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInPlatform: // for idle PLATFORM power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInPlatform].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInPlatform: // For idle PLATFORM power consumption. + value := normalize(float64(s.EnergyUsage[config.IdleEnergyInPlatform].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) - case config.IdleEnergyInGPU: // for idle GPU power consumption - value := normalize(float64(m.EnergyUsage[config.IdleEnergyInGPU].SumAllDeltaValues()), shouldNormalize) + case config.IdleEnergyInGPU: // For idle GPU power consumption. 
+ value := normalize(float64(s.EnergyUsage[config.IdleEnergyInGPU].SumAllDeltaValues()), shouldNormalize) featureValues = append(featureValues, value) default: diff --git a/pkg/collector/stats/utils.go b/pkg/collector/stats/utils.go index cabf33d623..fd1394d8e5 100644 --- a/pkg/collector/stats/utils.go +++ b/pkg/collector/stats/utils.go @@ -53,13 +53,13 @@ type CPUS struct { } func RegisterBPFStats(counters []string) { - AvailableBPFMetrics = counters + defaultBPFMetrics = counters } func GetProcessFeatureNames() []string { var metrics []string // bpf counter metrics - metrics = append(metrics, AvailableBPFMetrics...) + metrics = append(metrics, defaultBPFMetrics...) klog.V(3).Infof("Available ebpf counters: %v", metrics) // gpu metric diff --git a/pkg/model/node_component_energy.go b/pkg/model/node_component_energy.go index c1cca5d6c6..10712e0272 100644 --- a/pkg/model/node_component_energy.go +++ b/pkg/model/node_component_energy.go @@ -95,12 +95,12 @@ func GetNodeComponentPowers(nodeMetrics *stats.NodeStats, isIdlePower bool) (nod // UpdateNodeComponentEnergy sets the power model samples, get absolute powers, and set gauge value for each component energy func UpdateNodeComponentEnergy(nodeMetrics *stats.NodeStats) { - addEnergy(nodeMetrics, stats.AvailableAbsEnergyMetrics, absPower) + addEnergy(nodeMetrics, nodeMetrics.AvailableMetrics.AbsEnergyMetrics, absPower) } // UpdateNodeComponentIdleEnergy sets the power model samples to zeros, get idle powers, and set gauge value for each component idle energy func UpdateNodeComponentIdleEnergy(nodeMetrics *stats.NodeStats) { - addEnergy(nodeMetrics, stats.AvailableIdleEnergyMetrics, idlePower) + addEnergy(nodeMetrics, nodeMetrics.AvailableMetrics.IdleEnergyMetrics, idlePower) } func addEnergy(nodeMetrics *stats.NodeStats, metrics []string, isIdle bool) {