Skip to content

Commit

Permalink
feat(metrics): expose platform and components power source (#1133)
Browse files Browse the repository at this point in the history
This commit adds the `platform_power_source` and `components_power_source`
labels to `kepler_node_info`. This allows users to filter other metrics by a
particular power source by joining them with `kepler_node_info`.

Signed-off-by: Sunil Thaha <[email protected]>
  • Loading branch information
sthaha authored Dec 19, 2023
1 parent 7c99198 commit 365d56f
Show file tree
Hide file tree
Showing 16 changed files with 106 additions and 36 deletions.
2 changes: 1 addition & 1 deletion cmd/validator/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ func main() {
}
}
if platform.IsSystemCollectionSupported() {
powerSource := platform.GetPowerSource()
powerSource := platform.GetSourceName()
switch powerSource {
case "hmc":
hmcEnable = true
Expand Down
12 changes: 10 additions & 2 deletions pkg/metrics/node/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (
"github.com/sustainable-computing-io/kepler/pkg/config"
"github.com/sustainable-computing-io/kepler/pkg/metrics/metricfactory"
"github.com/sustainable-computing-io/kepler/pkg/metrics/utils"
"github.com/sustainable-computing-io/kepler/pkg/sensors/components"
"github.com/sustainable-computing-io/kepler/pkg/sensors/platform"
)

const (
Expand Down Expand Up @@ -70,7 +72,9 @@ func (c *collector) initMetrics() {
}

// TODO: prometheus metric should be "node_info"
desc := metricfactory.MetricsPromDesc(context, "", "info", "os", []string{"cpu_architecture"})
desc := metricfactory.MetricsPromDesc(context, "", "info", "os", []string{
"cpu_architecture", "components_power_source", "platform_power_source",
})
c.descriptions["info"] = desc
c.collectors["info"] = metricfactory.NewPromCounter(desc)
}
Expand All @@ -91,5 +95,9 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
c.Mx.Unlock()

// update node info
ch <- c.collectors["info"].MustMetric(1, stats.NodeCPUArchitecture)
ch <- c.collectors["info"].MustMetric(1,
stats.NodeCPUArchitecture,
components.GetSourceName(),
platform.GetSourceName(),
)
}
3 changes: 3 additions & 0 deletions pkg/sensors/accelerator/gpu/power.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ var (
)

type acceleratorInterface interface {
// GetName returns the name of the collector
GetName() string

// Init initializes and starts the GPU metric collector
Init() error
// Shutdown stops the GPU metric collector
Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/accelerator/gpu/source/gpu_dummy.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ type GPUDummy struct {
collectionSupported bool
}

// GetName returns the name of this collector, which is always "dummy".
func (d *GPUDummy) GetName() string {
return "dummy"
}

// todo: refactor logic at invoking side, if gpu is not set?
func (d *GPUDummy) Init() error {
d.collectionSupported = false
Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/accelerator/gpu/source/gpu_nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ type GPUNvml struct {
collectionSupported bool
}

// GetName returns the name of this collector, which is always "nvidia-nvml".
func (GPUNvml) GetName() string {
return "nvidia-nvml"
}

// Init initializes and starts the GPU metric collector
// the nvml only works if the container has support to GPU, e.g., it is using nvidia-docker2
// otherwise it will fail to load the libnvidia-ml.so.1
Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/accelerator/qat/source/qat_telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ type QATTelemetry struct {
collectionSupported bool
}

// GetName returns the name of this collector, which is always "qat".
func (QATTelemetry) GetName() string {
return "qat"
}

// Init initializes and starts the QAT metric collector
func (q *QATTelemetry) Init() (err error) {
defer func() {
Expand Down
18 changes: 12 additions & 6 deletions pkg/sensors/components/power.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (
)

type powerInterface interface {
// GetName() returns the name of the source / impl used for estimation
GetName() string
// GetAbsEnergyFromDram returns mJ in DRAM. Absolute energy is the sum of Idle + Dynamic energy.
GetAbsEnergyFromDram() (uint64, error)
// GetAbsEnergyFromCore returns mJ in CPU cores
Expand All @@ -41,37 +43,41 @@ type powerInterface interface {
}

var (
estimateImpl = &source.PowerEstimate{}
sysfsImpl = &source.PowerSysfs{}
msrImpl = &source.PowerMSR{}
apmXgeneSysfsImpl = &source.ApmXgeneSysfs{}
powerImpl powerInterface = sysfsImpl
enabled = true
powerImpl powerInterface = &source.PowerSysfs{}
enabled = true
)

// InitPowerImpl selects the component power source used by this package and
// stores it in powerImpl.
//
// Candidates are tried in priority order and the first one that reports
// support wins: the sysfs source, the MSR source (only when config.EnabledMSR
// is set), then the Ampere Xgene sysfs source. When none of them is
// supported, the estimator is installed as the fallback.
func InitPowerImpl() {
	sysfsImpl := &source.PowerSysfs{}
	if sysfsImpl.IsSystemCollectionSupported() {
		klog.V(1).Infoln("use sysfs to obtain power")
		powerImpl = sysfsImpl
		return
	}

	// Test the configuration flag first: the MSR support check runs
	// InitUnits, which must not be triggered when MSR has been disabled.
	msrImpl := &source.PowerMSR{}
	if config.EnabledMSR && msrImpl.IsSystemCollectionSupported() {
		klog.V(1).Infoln("use MSR to obtain power")
		powerImpl = msrImpl
		return
	}

	apmXgeneSysfsImpl := &source.ApmXgeneSysfs{}
	if apmXgeneSysfsImpl.IsSystemCollectionSupported() {
		klog.V(1).Infoln("use Ampere Xgene sysfs to obtain power")
		powerImpl = apmXgeneSysfsImpl
		return
	}

	// No measured source is available; fall back to estimation.
	klog.V(1).Infoln("Unable to obtain power, use estimate method")
	estimateImpl := &source.PowerEstimate{}
	powerImpl = estimateImpl
}

// GetSourceName returns the name reported by the currently selected
// component power implementation (powerImpl).
func GetSourceName() string {
return powerImpl.GetName()
}

// GetAbsEnergyFromDram forwards to the currently selected power
// implementation and returns its DRAM energy reading (documented on the
// interface as mJ, absolute energy being the sum of idle and dynamic energy)
// together with any error it reports.
func GetAbsEnergyFromDram() (uint64, error) {
return powerImpl.GetAbsEnergyFromDram()
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/components/source/apm_xgene_sysfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ type ApmXgeneSysfs struct {
currTime time.Time
}

// GetName returns the name of this power source, which is always
// "ampere-xgene-hwmon".
func (ApmXgeneSysfs) GetName() string {
return "ampere-xgene-hwmon"
}

func (r *ApmXgeneSysfs) IsSystemCollectionSupported() bool {
labelFiles, err := filepath.Glob(powerLabelPathTemplate)
if err != nil {
Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/components/source/dummy.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ var (

// PowerDummy is a stateless component power source.
type PowerDummy struct{}

// GetName returns the name of this power source, which is always "dummy".
func (PowerDummy) GetName() string { return "dummy" }

// IsSystemCollectionSupported returns the current value of
// SystemCollectionSupported.
func (r *PowerDummy) IsSystemCollectionSupported() bool {
return SystemCollectionSupported
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/components/source/estimate.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ type PowerEstimateData struct {
PerGBWatts float64 `csv:"GB/Chip"`
}

// GetName returns the name of this power source, which is always "estimator".
func (PowerEstimate) GetName() string {
return "estimator"
}

// IsSystemCollectionSupported always returns false: if the estimated power is
// being used, it means that the system does not support components power
// measurement.
func (r *PowerEstimate) IsSystemCollectionSupported() bool {
return false
}
4 changes: 4 additions & 0 deletions pkg/sensors/components/source/rapl_msr.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ package source

// PowerMSR is a stateless component power source.
type PowerMSR struct{}

// GetName returns the name of this power source, which is always "rapl-msr".
func (PowerMSR) GetName() string { return "rapl-msr" }

// IsSystemCollectionSupported reports whether InitUnits completes without
// returning an error. Note that calling this method runs InitUnits.
func (r *PowerMSR) IsSystemCollectionSupported() bool {
return InitUnits() == nil
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/components/source/rapl_sysfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ func getMaxEnergyRange(eventName string) (uint64, error) {

// PowerSysfs is a stateless component power source.
type PowerSysfs struct{}

// GetName returns the name of this power source, which is always "rapl-sysfs".
func (PowerSysfs) GetName() string { return "rapl-sysfs" }

func (r *PowerSysfs) IsSystemCollectionSupported() bool {
path := fmt.Sprintf(packageNamePathTemplate, 0)
_, err := os.ReadFile(path + energyFile)
Expand Down
63 changes: 36 additions & 27 deletions pkg/sensors/platform/power.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (
)

type powerInterface interface {
// GetName() returns the name of the platform power source
GetName() string
// GetAbsEnergyFromPlatform returns mJ in DRAM. Absolute energy is the sum of Idle + Dynamic energy.
GetAbsEnergyFromPlatform() (map[string]float64, error)
// StopPower stops the collection
Expand All @@ -33,48 +35,57 @@ type powerInterface interface {
IsSystemCollectionSupported() bool
}

// dummy satisfies the powerInterface and can be used as the default NOP
// source: it has no state, never reports support and never yields readings.
type dummy struct{}

// GetName returns "none", the name used when no platform source is active.
func (dummy) GetName() string { return "none" }

// IsSystemCollectionSupported always returns false.
func (dummy) IsSystemCollectionSupported() bool { return false }

// StopPower does nothing.
func (dummy) StopPower() {}

// GetAbsEnergyFromPlatform never produces readings; it always returns a nil
// map together with a "dummy power source" error.
func (dummy) GetAbsEnergyFromPlatform() (map[string]float64, error) {
	return nil, fmt.Errorf("dummy power source")
}

var (
powerImpl powerInterface
redfishImpl *source.RedFishClient
hmcImpl = &source.PowerHMC{}
powerSource = "none"
enabled = true
powerImpl powerInterface = &dummy{}
enabled = true
)

func InitPowerImpl() {
// switch the platform power collector source to hmc if the system architecture is s390x
// TODO: add redfish or ipmi as well.
if runtime.GOARCH == "s390x" {
klog.V(1).Infoln("use hmc to obtain power")
powerImpl = hmcImpl
powerSource = "hmc"
} else if redfishImpl = source.NewRedfishClient(); redfishImpl != nil && redfishImpl.IsSystemCollectionSupported() {
klog.V(1).Infoln("use redfish to obtain power")
powerImpl = redfishImpl
powerSource = "redfish"
} else if powerImpl = source.NewACPIPowerMeter(); powerImpl != nil && powerImpl.IsSystemCollectionSupported() {
klog.V(1).Infoln("use acpi to obtain power")
powerSource = "acpi"
powerImpl = &source.PowerHMC{}
} else if redfish := source.NewRedfishClient(); redfish != nil && redfish.IsSystemCollectionSupported() {
powerImpl = redfish
} else if acpi := source.NewACPIPowerMeter(); acpi != nil && acpi.CollectEnergy {
powerImpl = acpi
}

klog.V(1).Infof("using %s to obtain power", powerImpl.GetName())
}

func GetPowerSource() string {
return powerSource
func GetSourceName() string {
return powerImpl.GetName()
}

// GetAbsEnergyFromPlatform returns the absolute energy, which is the sum of Idle + Dynamic energy.
func GetAbsEnergyFromPlatform() (map[string]float64, error) {
if powerImpl != nil {
return powerImpl.GetAbsEnergyFromPlatform()
}
return nil, fmt.Errorf("powerImpl is nil")
return powerImpl.GetAbsEnergyFromPlatform()
}

func IsSystemCollectionSupported() bool {
if powerImpl != nil && enabled {
return powerImpl.IsSystemCollectionSupported()
if !enabled {
return false
}
return false
return powerImpl.IsSystemCollectionSupported()
}

// SetIsSystemCollectionSupported is used to enable or disable the system power collection.
Expand All @@ -84,7 +95,5 @@ func SetIsSystemCollectionSupported(enable bool) {
}

func StopPower() {
if powerImpl != nil {
powerImpl.StopPower()
}
powerImpl.StopPower()
}
4 changes: 4 additions & 0 deletions pkg/sensors/platform/source/acpi.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ func findACPIPowerPath() string {
return powerPath
}

// GetName returns the name of this power source, which is always "acpi".
func (ACPI) GetName() string {
return "acpi"
}

// StopPower does nothing for the ACPI source; its body is intentionally empty.
func (a *ACPI) StopPower() {
}

Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/platform/source/hmc.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ package source

// PowerHMC is a stateless platform power source.
type PowerHMC struct{}

// GetName returns the name of this power source, which is always "hmc".
func (*PowerHMC) GetName() string { return "hmc" }

// StopPower does nothing for this source.
func (*PowerHMC) StopPower() {}

Expand Down
4 changes: 4 additions & 0 deletions pkg/sensors/platform/source/redfish.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ func NewRedfishClient() *RedFishClient {
return nil
}

// GetName returns the name of this power source, which is always "redfish".
func (*RedFishClient) GetName() string {
return "redfish"
}

func (rf *RedFishClient) IsSystemCollectionSupported() bool {
// goroutine for collecting power info from Redfish already exists
if rf.ticker != nil {
Expand Down

0 comments on commit 365d56f

Please sign in to comment.