diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go index 5a1933897f..d70344ad6e 100644 --- a/cmd/exporter/exporter.go +++ b/cmd/exporter/exporter.go @@ -98,9 +98,9 @@ func healthProbe(w http.ResponseWriter, req *http.Request) { func main() { start := time.Now() klog.InitFlags(nil) - appConfig := newAppConfig() - flag.Parse() - + appConfig := newAppConfig() // Initialize appConfig and define flags + flag.Parse() // Parse command-line flags + config.GetConfig() // Initialize the configuration klog.Infof("Kepler running on version: %s", build.Version) registry := metrics.GetRegistry() @@ -149,7 +149,7 @@ func main() { stats.InitAvailableParamAndMetrics() - if config.EnabledGPU { + if config.EnabledGPU() { r := accelerator.GetRegistry() if a, err := accelerator.New(accelerator.GPU, true); err == nil { r.MustRegister(a) // Register the accelerator with the registry diff --git a/pkg/bpf/exporter.go b/pkg/bpf/exporter.go index ded8a88887..14643370c5 100644 --- a/pkg/bpf/exporter.go +++ b/pkg/bpf/exporter.go @@ -91,7 +91,7 @@ func (e *exporter) attach() error { // Set program global variables err = specs.RewriteConstants(map[string]interface{}{ - "SAMPLE_RATE": int32(config.BPFSampleRate), + "SAMPLE_RATE": int32(config.GetBPFSampleRate()), }) if err != nil { return fmt.Errorf("error rewriting program constants: %v", err) @@ -112,7 +112,7 @@ func (e *exporter) attach() error { } e.enabledSoftwareCounters[config.CPUTime] = struct{}{} - if config.ExposeIRQCounterMetrics { + if config.ExposeIRQCounterMetrics() { e.irqLink, err = link.AttachTracing(link.TracingOptions{ Program: e.bpfObjects.KeplerIrqTrace, AttachType: ebpf.AttachTraceRawTp, @@ -148,7 +148,7 @@ func (e *exporter) attach() error { } // Return early if hardware counters are not enabled - if !config.ExposeHardwareCounterMetrics { + if !config.ExposeHardwareCounterMetrics() { klog.Infof("Hardware counter metrics are disabled") return nil } diff --git a/pkg/bpf/test_utils.go b/pkg/bpf/test_utils.go 
index 4105f64b96..4575f90a33 100644 --- a/pkg/bpf/test_utils.go +++ b/pkg/bpf/test_utils.go @@ -23,7 +23,7 @@ func defaultHardwareCounters() sets.Set[string] { func defaultSoftwareCounters() sets.Set[string] { swCounters := sets.New(config.CPUTime, config.PageCacheHit) - if config.ExposeIRQCounterMetrics { + if config.ExposeIRQCounterMetrics() { swCounters.Insert(config.IRQNetTXLabel, config.IRQNetRXLabel, config.IRQBlockLabel) } return swCounters diff --git a/pkg/collector/energy/node_energy_collector.go b/pkg/collector/energy/node_energy_collector.go index cc40f03905..e2b0e450a6 100644 --- a/pkg/collector/energy/node_energy_collector.go +++ b/pkg/collector/energy/node_energy_collector.go @@ -66,7 +66,7 @@ func UpdateNodeComponentsEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) // UpdateNodeGPUEnergy updates each GPU power consumption. Right now we don't support other types of accelerators func UpdateNodeGPUEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) { defer wg.Done() - if config.EnabledGPU { + if config.EnabledGPU() { if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { gpuEnergy := gpu.Device().AbsEnergyFromDevice() for gpu, energy := range gpuEnergy { diff --git a/pkg/collector/metric_collector.go b/pkg/collector/metric_collector.go index e272f8ddfe..dad5c5a8ca 100644 --- a/pkg/collector/metric_collector.go +++ b/pkg/collector/metric_collector.go @@ -79,8 +79,8 @@ func (c *Collector) Initialize() error { // model component decide whether/how to init model.CreatePowerEstimatorModels( stats.GetProcessFeatureNames(c.bpfSupportedMetrics), - stats.NodeMetadataFeatureNames, - stats.NodeMetadataFeatureValues, + stats.NodeMetadataFeatureNames(), + stats.NodeMetadataFeatureValues(), c.bpfSupportedMetrics, ) @@ -161,7 +161,7 @@ func (c *Collector) updateProcessResourceUtilizationMetrics(wg *sync.WaitGroup) // update process metrics regarding the resource utilization to be used to calculate the energy consumption // we first updates 
the bpf which is responsible to include new processes in the ProcessStats collection resourceBpf.UpdateProcessBPFMetrics(c.bpfExporter, c.ProcessStats) - if config.EnabledGPU { + if config.EnabledGPU() { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { accelerator.UpdateProcessGPUUtilizationMetrics(c.ProcessStats, c.bpfSupportedMetrics) } @@ -184,7 +184,7 @@ func (c *Collector) AggregateProcessResourceUtilizationMetrics() { // aggregate metrics per container if config.IsExposeContainerStatsEnabled() { if process.ContainerID != "" { - c.createContainerStatsIfNotExist(process.ContainerID, process.CGroupID, process.PID, config.EnabledEBPFCgroupID) + c.createContainerStatsIfNotExist(process.ContainerID, process.CGroupID, process.PID, config.EnabledEBPFCgroupID()) c.ContainerStats[process.ContainerID].ResourceUsage[metricName].AddDeltaStat(id, delta) foundContainer[process.ContainerID] = true } @@ -270,7 +270,7 @@ func (c *Collector) AggregateProcessEnergyUtilizationMetrics() { // aggregate metrics per container if config.IsExposeContainerStatsEnabled() { if process.ContainerID != "" { - c.createContainerStatsIfNotExist(process.ContainerID, process.CGroupID, process.PID, config.EnabledEBPFCgroupID) + c.createContainerStatsIfNotExist(process.ContainerID, process.CGroupID, process.PID, config.EnabledEBPFCgroupID()) c.ContainerStats[process.ContainerID].EnergyUsage[metricName].AddDeltaStat(id, delta) } } diff --git a/pkg/collector/metric_collector_test.go b/pkg/collector/metric_collector_test.go index ea5c2d37c6..9eb3c9a8f7 100644 --- a/pkg/collector/metric_collector_test.go +++ b/pkg/collector/metric_collector_test.go @@ -45,7 +45,7 @@ var _ = Describe("Test Collector Unit", func() { metricCollector := newMockCollector(bpfExporter) // The default estimator model is the ratio bpfSupportedMetrics := bpfExporter.SupportedMetrics() - model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames, 
stats.NodeMetadataFeatureValues, bpfSupportedMetrics) + model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames(), stats.NodeMetadataFeatureValues(), bpfSupportedMetrics) // update container and node metrics metricCollector.UpdateProcessEnergyUtilizationMetrics() metricCollector.AggregateProcessEnergyUtilizationMetrics() diff --git a/pkg/collector/resourceutilization/bpf/process_bpf_collector.go b/pkg/collector/resourceutilization/bpf/process_bpf_collector.go index d45bc59c3f..ff6347cc41 100644 --- a/pkg/collector/resourceutilization/bpf/process_bpf_collector.go +++ b/pkg/collector/resourceutilization/bpf/process_bpf_collector.go @@ -99,7 +99,7 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]* } // if the pid is within a container, it will have a container ID - containerID, err := cgroup.GetContainerID(ct.CgroupId, ct.Pid, config.EnabledEBPFCgroupID) + containerID, err := cgroup.GetContainerID(ct.CgroupId, ct.Pid, config.EnabledEBPFCgroupID()) if err != nil { klog.V(6).Infof("failed to resolve container for PID %v (command=%s): %v, set containerID=%s", ct.Pid, comm, err, utils.SystemProcessName) } @@ -114,7 +114,7 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]* } mapKey := ct.Pid - if ct.CgroupId == 1 && config.EnabledEBPFCgroupID { + if ct.CgroupId == 1 && config.EnabledEBPFCgroupID() { // we aggregate all kernel process to minimize overhead // all kernel process has cgroup id as 1 and pid 1 is also a kernel process mapKey = 1 diff --git a/pkg/collector/stats/benchmark_test.go b/pkg/collector/stats/benchmark_test.go index d4433f71bb..c5083de4ea 100644 --- a/pkg/collector/stats/benchmark_test.go +++ b/pkg/collector/stats/benchmark_test.go @@ -39,7 +39,7 @@ func benchmarkNtesting(b *testing.B, processNumber int) { // The default estimator model is the ratio bpfSupportedMetrics := bpf.DefaultSupportedMetrics() - 
model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames, stats.NodeMetadataFeatureValues, bpfSupportedMetrics) + model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames(), stats.NodeMetadataFeatureValues(), bpfSupportedMetrics) // update container and node metrics b.ReportAllocs() diff --git a/pkg/collector/stats/node_stats.go b/pkg/collector/stats/node_stats.go index 792225e374..201202bda8 100644 --- a/pkg/collector/stats/node_stats.go +++ b/pkg/collector/stats/node_stats.go @@ -25,17 +25,6 @@ import ( "github.com/sustainable-computing-io/kepler/pkg/utils" ) -var ( - NodeName = GetNodeName() - NodeCPUArchitecture = getCPUArch() - NodeCPUPackageMap = getCPUPackageMap() - - // NodeMetricNames holds the name of the system metadata information. - NodeMetadataFeatureNames []string = []string{"cpu_architecture"} - // SystemMetadata holds the metadata regarding the system information - NodeMetadataFeatureValues []string = []string{NodeCPUArchitecture} -) - type NodeStats struct { Stats @@ -43,6 +32,26 @@ type NodeStats struct { IdleResUtilization map[string]uint64 } +// NodeCPUArchitecture returns the CPU architecture +func NodeCPUArchitecture() string { + return getCPUArch() +} + +// NodeCPUPackageMap returns the CPU package map +func NodeCPUPackageMap() map[int32]string { + return getCPUPackageMap() +} + +// NodeMetadataFeatureNames returns the feature names for metadata +func NodeMetadataFeatureNames() []string { + return []string{"cpu_architecture"} +} + +// NodeMetadataFeatureValues returns the feature values for metadata +func NodeMetadataFeatureValues() []string { + return []string{NodeCPUArchitecture()} +} + func NewNodeStats(bpfSupportedMetrics bpf.SupportedMetrics) *NodeStats { return &NodeStats{ Stats: *NewStats(bpfSupportedMetrics), @@ -57,7 +66,7 @@ func (ne *NodeStats) ResetDeltaValues() { func (ne *NodeStats) 
UpdateIdleEnergyWithMinValue(isComponentsSystemCollectionSupported bool) { // gpu metric - if config.EnabledGPU { + if config.EnabledGPU() { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { ne.CalcIdleEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.GPUComputeUtilization) } diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go index a588101686..232e0cd30e 100644 --- a/pkg/collector/stats/stats.go +++ b/pkg/collector/stats/stats.go @@ -69,7 +69,7 @@ func NewStats(bpfSupportedMetrics bpf.SupportedMetrics) *Stats { m.ResourceUsage[metricName] = types.NewUInt64StatCollection() } - if config.EnabledGPU { + if config.EnabledGPU() { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { m.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection() m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() @@ -127,7 +127,7 @@ func (m *Stats) UpdateDynEnergy() { m.CalcDynEnergy(config.AbsEnergyInPlatform, config.IdleEnergyInPlatform, config.DynEnergyInPlatform, sensorID) } // gpu metric - if config.EnabledGPU { + if config.EnabledGPU() { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { for gpuID := range m.EnergyUsage[config.AbsEnergyInGPU] { m.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID) @@ -162,7 +162,7 @@ func calcDynEnergy(totalE, idleE uint64) uint64 { func normalize(val float64, shouldNormalize bool) float64 { if shouldNormalize { - return val / float64(config.SamplePeriodSec) + return val / float64(config.SamplePeriodSec()) } return val } @@ -181,7 +181,7 @@ func (m *Stats) ToEstimatorValues(featuresName []string, shouldNormalize bool) [ } // some features are not related to resource utilization, such as power metrics switch feature { - case config.GeneralUsageMetric: // is an empty string for UNCORE and OTHER resource usage + case config.GeneralUsageMetric(): // is an empty string for UNCORE and OTHER resource usage 
featureValues = append(featureValues, 0) case config.DynEnergyInPkg: // for dynamic PKG power consumption diff --git a/pkg/collector/stats/stats_test.go b/pkg/collector/stats/stats_test.go index 7080c1e54c..b95a7140a8 100644 --- a/pkg/collector/stats/stats_test.go +++ b/pkg/collector/stats/stats_test.go @@ -9,7 +9,8 @@ import ( var _ = Describe("Stats", func() { It("Test InitAvailableParamAndMetrics", func() { - config.ExposeHardwareCounterMetrics = false + config.GetConfig() + config.SetEnabledHardwareCounterMetrics(false) supportedMetrics := bpf.DefaultSupportedMetrics() InitAvailableParamAndMetrics() exp := []string{} diff --git a/pkg/collector/stats/test_utils.go b/pkg/collector/stats/test_utils.go index 03d9dfbd01..07ea429d8e 100644 --- a/pkg/collector/stats/test_utils.go +++ b/pkg/collector/stats/test_utils.go @@ -32,6 +32,7 @@ const ( // SetMockedCollectorMetrics adds all metric to a process, otherwise it will not create the right usageMetric with all elements. The usageMetric is used in the Prediction Power Models // TODO: do not use a fixed usageMetric array in the power models, a structured data is more disarable. 
func SetMockedCollectorMetrics() { + config.GetConfig() if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { err := gpu.Device().Init() // create structure instances that will be accessed to create a processMetric klog.Fatalln(err) @@ -50,9 +51,6 @@ func SetMockedCollectorMetrics() { config.IdleEnergyInCore, config.IdleEnergyInDRAM, config.IdleEnergyInUnCore, config.IdleEnergyInPkg, config.IdleEnergyInGPU, config.IdleEnergyInOther, config.IdleEnergyInPlatform, } - - NodeMetadataFeatureNames = []string{"cpu_architecture"} - NodeMetadataFeatureValues = []string{"Sandy Bridge"} } // CreateMockedProcessStats adds two containers with all metrics initialized diff --git a/pkg/collector/stats/utils.go b/pkg/collector/stats/utils.go index 32959ba53d..f019009139 100644 --- a/pkg/collector/stats/utils.go +++ b/pkg/collector/stats/utils.go @@ -81,7 +81,7 @@ func GetProcessFeatureNames(bpfSupportedMetrics bpf.SupportedMetrics) []string { klog.V(3).Infof("Available ebpf counters: %v", metrics) // gpu metric - if config.EnabledGPU { + if config.EnabledGPU() { if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil { gpuMetrics := []string{config.GPUComputeUtilization, config.GPUMemUtilization} metrics = append(metrics, gpuMetrics...) 
@@ -92,7 +92,7 @@ func GetProcessFeatureNames(bpfSupportedMetrics bpf.SupportedMetrics) []string { return metrics } -func GetNodeName() string { +func NodeName() string { if nodeName := os.Getenv("NODE_NAME"); nodeName != "" { return nodeName } @@ -298,7 +298,7 @@ func getCPUPmuName() (pmuName string, err error) { func getCPUArchitecture() (string, error) { // check if there is a CPU architecture override - cpuArchOverride := config.CPUArchOverride + cpuArchOverride := config.CPUArchOverride() if cpuArchOverride != "" { klog.V(2).Infof("cpu arch override: %v\n", cpuArchOverride) return cpuArchOverride, nil diff --git a/pkg/config/config.go b/pkg/config/config.go index eaeb63bc6f..d02a7e4296 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -24,17 +24,16 @@ import ( "regexp" "strconv" "strings" + "sync" - "github.com/sustainable-computing-io/kepler/pkg/model/types" "golang.org/x/sys/unix" "k8s.io/klog/v2" ) -const ( - cGroupIDMinKernelVersion = 4.18 - - // If this file is present, cgroups v2 is enabled on that node. 
- cGroupV2Path = "/sys/fs/cgroup/cgroup.controllers" +var ( + versionRegex = regexp.MustCompile(`^(\d+)\.(\d+).`) + instance *Config + once sync.Once ) type Client interface { @@ -42,132 +41,166 @@ type Client interface { getCgroupV2File() string } -type config struct { +// Configuration structs +type KeplerConfig struct { + KeplerNamespace string + EnabledEBPFCgroupID bool + EnabledGPU bool + EnabledMSR bool + EnableProcessStats bool + ExposeContainerStats bool + ExposeVMStats bool + ExposeHardwareCounterMetrics bool + ExposeIRQCounterMetrics bool + ExposeBPFMetrics bool + ExposeComponentPower bool + ExposeIdlePowerMetrics bool + EnableAPIServer bool + MockACPIPowerPath string + MaxLookupRetry int + KubeConfig string + BPFSampleRate int + EstimatorModel string + EstimatorSelectFilter string + CPUArchOverride string + MachineSpecFilePath string +} +type MetricsConfig struct { + CoreUsageMetric string + DRAMUsageMetric string + UncoreUsageMetric string + GPUUsageMetric string + GeneralUsageMetric string +} + +type RedfishConfig struct { + CredFilePath string + ProbeIntervalInSeconds string + SkipSSLVerify bool +} + +type ModelConfig struct { + ModelServerEnable bool + ModelServerEndpoint string + ModelConfigValues map[string]string + NodePlatformPowerKey string + NodeComponentsPowerKey string + ContainerPlatformPowerKey string + ContainerComponentsPowerKey string + ProcessPlatformPowerKey string + ProcessComponentsPowerKey string +} + +type LibvirtConfig struct { + MetadataURI string + MetadataToken string +} + +type Config struct { + ModelServerService string + KernelVersion float32 + Kepler KeplerConfig + SamplePeriodSec uint64 + Model ModelConfig + Metrics MetricsConfig + Redfish RedfishConfig + Libvirt LibvirtConfig + DCGMHostEngineEndpoint string +} + +// newConfig creates and returns a new Config instance. 
+func newConfig() *Config { + return &Config{ + ModelServerService: fmt.Sprintf("kepler-model-server.%s.svc.cluster.local", getConfig("KEPLER_NAMESPACE", defaultNamespace)), + Kepler: getKeplerConfig(), + SamplePeriodSec: uint64(getIntConfig("SAMPLE_PERIOD_SEC", defaultSamplePeriodSec)), + Model: getModelConfig(), + Metrics: getMetricsConfig(), + Redfish: getRedfishConfig(), + Libvirt: getLibvirtConfig(), + DCGMHostEngineEndpoint: getConfig("DCGM_HOST_ENGINE_ENDPOINT", ""), + KernelVersion: float32(0), + } } -var c config - -const ( - defaultMetricValue = "" - defaultNamespace = "kepler" - defaultModelServerPort = "8100" - defaultModelRequestPath = "/model" - defaultMaxLookupRetry = 500 - // MaxIRQ is the maximum number of IRQs to be monitored - MaxIRQ = 10 - - // defaultSamplePeriodSec is the time in seconds that the reader will wait before reading the metrics again - defaultSamplePeriodSec = 3 -) - -var ( - modelServerService = fmt.Sprintf("kepler-model-server.%s.svc.cluster.local", KeplerNamespace) - - KernelVersion = float32(0) - - KeplerNamespace = getConfig("KEPLER_NAMESPACE", defaultNamespace) - EnabledEBPFCgroupID = getBoolConfig("ENABLE_EBPF_CGROUPID", true) - EnabledGPU = getBoolConfig("ENABLE_GPU", false) - EnabledMSR = getBoolConfig("ENABLE_MSR", false) - EnableProcessStats = getBoolConfig("ENABLE_PROCESS_METRICS", false) - ExposeContainerStats = getBoolConfig("EXPOSE_CONTAINER_METRICS", true) - ExposeVMStats = getBoolConfig("EXPOSE_VM_METRICS", true) - ExposeHardwareCounterMetrics = getBoolConfig("EXPOSE_HW_COUNTER_METRICS", true) - ExposeIRQCounterMetrics = getBoolConfig("EXPOSE_IRQ_COUNTER_METRICS", true) - ExposeBPFMetrics = getBoolConfig("EXPOSE_BPF_METRICS", true) - ExposeComponentPower = getBoolConfig("EXPOSE_COMPONENT_POWER", true) - ExposeIdlePowerMetrics = getBoolConfig("EXPOSE_ESTIMATED_IDLE_POWER_METRICS", false) - MockACPIPowerPath = getConfig("MOCK_ACPI_POWER_PATH", "") - - MetricPathKey = "METRIC_PATH" - BindAddressKey = "BIND_ADDRESS" - 
CPUArchOverride = getConfig("CPU_ARCH_OVERRIDE", "") - MaxLookupRetry = getIntConfig("MAX_LOOKUP_RETRY", defaultMaxLookupRetry) - BPFSampleRate = getIntConfig("EXPERIMENTAL_BPF_SAMPLE_RATE", 0) - - EstimatorModel = getConfig("ESTIMATOR_MODEL", defaultMetricValue) // auto-select - EstimatorSelectFilter = getConfig("ESTIMATOR_SELECT_FILTER", defaultMetricValue) // no filter - CoreUsageMetric = getConfig("CORE_USAGE_METRIC", CPUInstruction) - DRAMUsageMetric = getConfig("DRAM_USAGE_METRIC", CacheMiss) - UncoreUsageMetric = getConfig("UNCORE_USAGE_METRIC", defaultMetricValue) // no metric (evenly divided) - GpuUsageMetric = getConfig("GPU_USAGE_METRIC", GPUComputeUtilization) // no metric (evenly divided) - GeneralUsageMetric = getConfig("GENERAL_USAGE_METRIC", defaultMetricValue) // for uncategorized energy - - SamplePeriodSec = uint64(getIntConfig("SAMPLE_PERIOD_SEC", defaultSamplePeriodSec)) - - versionRegex = regexp.MustCompile(`^(\d+)\.(\d+).`) +// GetConfig returns the singleton Config instance, creating it if necessary. 
+func GetConfig() *Config { + once.Do(func() { + instance = newConfig() + }) + return instance +} + +// SetConfig replaces the global instance +func SetConfig(conf *Config) { + instance = conf +} + +func getKeplerConfig() KeplerConfig { + return KeplerConfig{ + KeplerNamespace: getConfig("KEPLER_NAMESPACE", defaultNamespace), + EnabledEBPFCgroupID: getBoolConfig("ENABLE_EBPF_CGROUPID", true), + EnabledGPU: getBoolConfig("ENABLE_GPU", false), + EnabledMSR: getBoolConfig("ENABLE_MSR", false), + EnableProcessStats: getBoolConfig("ENABLE_PROCESS_METRICS", false), + ExposeContainerStats: getBoolConfig("EXPOSE_CONTAINER_METRICS", true), + ExposeVMStats: getBoolConfig("EXPOSE_VM_METRICS", true), + ExposeHardwareCounterMetrics: getBoolConfig("EXPOSE_HW_COUNTER_METRICS", true), + ExposeIRQCounterMetrics: getBoolConfig("EXPOSE_IRQ_COUNTER_METRICS", true), + ExposeBPFMetrics: getBoolConfig("EXPOSE_BPF_METRICS", true), + ExposeComponentPower: getBoolConfig("EXPOSE_COMPONENT_POWER", true), + ExposeIdlePowerMetrics: getBoolConfig("EXPOSE_ESTIMATED_IDLE_POWER_METRICS", false), + EnableAPIServer: getBoolConfig("ENABLE_API_SERVER", false), + MockACPIPowerPath: getConfig("MOCK_ACPI_POWER_PATH", ""), + MaxLookupRetry: getIntConfig("MAX_LOOKUP_RETRY", defaultMaxLookupRetry), + KubeConfig: getConfig("KUBE_CONFIG", defaultKubeConfig), + BPFSampleRate: getIntConfig("EXPERIMENTAL_BPF_SAMPLE_RATE", defaultBPFSampleRate), + EstimatorModel: getConfig("ESTIMATOR_MODEL", defaultMetricValue), + EstimatorSelectFilter: getConfig("ESTIMATOR_SELECT_FILTER", defaultMetricValue), // no filter + CPUArchOverride: getConfig("CPU_ARCH_OVERRIDE", defaultCPUArchOverride), + } +} - configDir = "/etc/kepler/kepler.config" - - // nvidia dcgm hostengine endpoint - DCGMHostEngineEndpoint = getConfig("NVIDIA_HOSTENGINE_ENDPOINT", "localhost:5555") - - // redfish cred file path - redfishCredFilePath string - redfishProbeIntervalInSeconds = getConfig("REDFISH_PROBE_INTERVAL_IN_SECONDS", "60") - 
redfishSkipSSLVerify = getBoolConfig("REDFISH_SKIP_SSL_VERIFY", true) - - //////////////////////////////////// - DefaultMachineSpecFilePath = "/etc/kepler/models/machine/spec.json" - machineSpecFilePath string - ModelServerEnable = getBoolConfig("MODEL_SERVER_ENABLE", false) - ModelServerEndpoint = SetModelServerReqEndpoint() - // for model config - ModelConfigValues map[string]string - // model_parameter_prefix - NodePlatformPowerKey = "NODE_TOTAL" - NodeComponentsPowerKey = "NODE_COMPONENTS" - ContainerPlatformPowerKey = "CONTAINER_TOTAL" - ContainerComponentsPowerKey = "CONTAINER_COMPONENTS" - ProcessPlatformPowerKey = "PROCESS_TOTAL" - ProcessComponentsPowerKey = "PROCESS_COMPONENTS" - - // model_parameter_attribute - RatioEnabledKey = "RATIO" // the default container power model is RATIO but ESTIMATOR or LINEAR_REGRESSION can be used - EstimatorEnabledKey = "ESTIMATOR" - LocalRegressorEnabledKey = "LOCAL_REGRESSOR" - InitModelURLKey = "INIT_URL" - FixedTrainerNameKey = "TRAINER" - FixedNodeTypeKey = "NODE_TYPE" - ModelFiltersKey = "FILTERS" - DefaultTrainerName = types.LinearRegressionTrainer - //////////////////////////////////// - - // KubeConfig is used to start k8s client with the pod running outside the cluster - KubeConfig = "" - EnableAPIServer = false - - //////////////////////////////////// - - // Libvirt Metadata config - LibvirtMetadataURI = getConfig("LIBVIRT_METADATA_URI", "") - LibvirtMetadataToken = getConfig("LIBVIRT_METADATA_TOKEN", "name") -) +func getMetricsConfig() MetricsConfig { + return MetricsConfig{ + CoreUsageMetric: getConfig("CORE_USAGE_METRIC", CPUInstruction), + DRAMUsageMetric: getConfig("DRAM_USAGE_METRIC", CacheMiss), + UncoreUsageMetric: getConfig("UNCORE_USAGE_METRIC", defaultMetricValue), + GPUUsageMetric: getConfig("GPU_USAGE_METRIC", GPUComputeUtilization), + GeneralUsageMetric: getConfig("GENERAL_USAGE_METRIC", defaultMetricValue), + } +} -// return local path to power model weight -// e.g., 
/var/lib/kepler/data/model_weight/acpi_AbsPowerModel.json -func GetDefaultPowerModelURL(modelOutputType, energySource string) string { - return fmt.Sprintf(`/var/lib/kepler/data/model_weight/%s_%sModel.json`, energySource, modelOutputType) +func getRedfishConfig() RedfishConfig { + return RedfishConfig{ + CredFilePath: getConfig("REDFISH_CRED_FILE_PATH", ""), + ProbeIntervalInSeconds: getConfig("REDFISH_PROBE_INTERVAL_IN_SECONDS", "60"), + SkipSSLVerify: getBoolConfig("REDFISH_SKIP_SSL_VERIFY", true), + } } -func logBoolConfigs() { - if klog.V(5).Enabled() { - klog.V(5).Infof("ENABLE_EBPF_CGROUPID: %t", EnabledEBPFCgroupID) - klog.V(5).Infof("ENABLE_GPU: %t", EnabledGPU) - klog.V(5).Infof("ENABLE_PROCESS_METRICS: %t", EnableProcessStats) - klog.V(5).Infof("EXPOSE_HW_COUNTER_METRICS: %t", ExposeHardwareCounterMetrics) - klog.V(5).Infof("EXPOSE_IRQ_COUNTER_METRICS: %t", ExposeIRQCounterMetrics) - klog.V(5).Infof("EXPOSE_BPF_METRICS: %t", ExposeBPFMetrics) - klog.V(5).Infof("EXPOSE_COMPONENT_POWER: %t", ExposeComponentPower) - klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. 
Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", ExposeIdlePowerMetrics) - klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", BPFSampleRate) +func getModelConfig() ModelConfig { + return ModelConfig{ + ModelServerEnable: getBoolConfig("MODEL_SERVER_ENABLE", false), + ModelServerEndpoint: setModelServerReqEndpoint(), + ModelConfigValues: GetModelConfigMap(), + NodePlatformPowerKey: getConfig("NODE_TOTAL_POWER_KEY", defaultNodePlatformPowerKey), + NodeComponentsPowerKey: getConfig("NODE_COMPONENTS_POWER_KEY", defaultNodeComponentsPowerKey), + ContainerPlatformPowerKey: getConfig("CONTAINER_TOTAL_POWER_KEY", defaultContainerPlatformPowerKey), + ContainerComponentsPowerKey: getConfig("CONTAINER_COMPONENTS_POWER_KEY", defaultContainerComponentsPowerKey), + ProcessPlatformPowerKey: getConfig("PROCESS_TOTAL_POWER_KEY", defaultProcessPlatformPowerKey), + ProcessComponentsPowerKey: getConfig("PROCESS_COMPONENTS_POWER_KEY", defaultProcessComponentsPowerKey), } } -func LogConfigs() { - logBoolConfigs() +func getLibvirtConfig() LibvirtConfig { + return LibvirtConfig{ + MetadataURI: getConfig("LIBVIRT_METADATA_URI", ""), + MetadataToken: getConfig("LIBVIRT_METADATA_TOKEN", "name"), + } } +// Helper functions func getBoolConfig(configKey string, defaultBool bool) bool { defaultValue := "false" if defaultBool { @@ -178,7 +211,7 @@ func getBoolConfig(configKey string, defaultBool bool) bool { func getIntConfig(configKey string, defaultInt int) int { defaultValue := fmt.Sprintf("%d", defaultInt) - value, err := strconv.Atoi((getConfig(configKey, defaultValue))) + value, err := strconv.Atoi(getConfig(configKey, defaultValue)) if err == nil { return value } @@ -202,91 +235,76 @@ func getConfig(key, defaultValue string) string { return defaultValue } -func SetRedfishCredFilePath(credFilePath string) { - redfishCredFilePath = credFilePath -} - -func GetRedfishCredFilePath() string { - return 
redfishCredFilePath -} - -func SetRedfishProbeIntervalInSeconds(interval string) { - redfishProbeIntervalInSeconds = interval -} - -func GetRedfishProbeIntervalInSeconds() int { - // convert string "redfishProbeIntervalInSeconds" to int - probeInterval, err := strconv.Atoi(redfishProbeIntervalInSeconds) - if err != nil { - klog.Warning("failed to convert redfishProbeIntervalInSeconds to int", err) - return 60 +func setModelServerReqEndpoint() string { + modelServerURL := getConfig("MODEL_SERVER_URL", "kepler-model-server") + if modelServerURL == "kepler-model-server" { + modelServerPort := strings.TrimSuffix(getConfig("MODEL_SERVER_PORT", defaultModelServerPort), "\n") + modelServerURL = fmt.Sprintf("http://%s:%s", modelServerURL, modelServerPort) } - return probeInterval -} - -func SetRedfishSkipSSLVerify(skipSSLVerify bool) { - redfishSkipSSLVerify = skipSSLVerify + modelReqPath := getConfig("MODEL_SERVER_MODEL_REQ_PATH", defaultModelRequestPath) + return modelServerURL + modelReqPath } -func GetRedfishSkipSSLVerify() bool { - return redfishSkipSSLVerify +// return local path to power model weight +// e.g., /var/lib/kepler/data/model_weight/acpi_AbsPowerModel.json +func GetDefaultPowerModelURL(modelOutputType, energySource string) string { + return fmt.Sprintf(`/var/lib/kepler/data/model_weight/%s_%sModel.json`, energySource, modelOutputType) } -func SetModelServerReqEndpoint() (modelServerReqEndpoint string) { - modelServerURL := getConfig("MODEL_SERVER_URL", modelServerService) - if modelServerURL == modelServerService { - modelServerPort := getConfig("MODEL_SERVER_PORT", defaultModelServerPort) - modelServerPort = strings.TrimSuffix(modelServerPort, "\n") // trim \n for kustomized manifest - modelServerURL = fmt.Sprintf("http://%s:%s", modelServerURL, modelServerPort) +func logBoolConfigs() { + if klog.V(5).Enabled() { + klog.V(5).Infof("ENABLE_EBPF_CGROUPID: %t", instance.Kepler.EnabledEBPFCgroupID) + klog.V(5).Infof("ENABLE_GPU: %t", 
instance.Kepler.EnabledGPU) + klog.V(5).Infof("ENABLE_PROCESS_METRICS: %t", instance.Kepler.EnableProcessStats) + klog.V(5).Infof("EXPOSE_HW_COUNTER_METRICS: %t", instance.Kepler.ExposeHardwareCounterMetrics) + klog.V(5).Infof("EXPOSE_IRQ_COUNTER_METRICS: %t", instance.Kepler.ExposeIRQCounterMetrics) + klog.V(5).Infof("EXPOSE_BPF_METRICS: %t", instance.Kepler.ExposeBPFMetrics) + klog.V(5).Infof("EXPOSE_COMPONENT_POWER: %t", instance.Kepler.ExposeComponentPower) + klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", instance.Kepler.ExposeIdlePowerMetrics) + klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", instance.Kepler.BPFSampleRate) } - modelReqPath := getConfig("MODEL_SERVER_MODEL_REQ_PATH", defaultModelRequestPath) - modelServerReqEndpoint = modelServerURL + modelReqPath - return } -func GetMockACPIPowerPath() string { - return MockACPIPowerPath +func LogConfigs() { + logBoolConfigs() } -func SetMachineSpecFilePath(specFilePath string) { - machineSpecFilePath = specFilePath +func SetRedfishCredFilePath(credFilePath string) { + ensureConfigInitialized() + instance.Redfish.CredFilePath = credFilePath } -// GetMachineSpec initializes a map of MachineSpecValues from MACHINE_SPEC -func GetMachineSpec() *MachineSpec { - if machineSpecFilePath != "" { - if spec, err := readMachineSpec(machineSpecFilePath); err == nil { - return spec - } else { - klog.Warningf("failed to read spec from %s: %v, use default machine spec", machineSpecFilePath, err) - } - } - return getDefaultMachineSpec() +func SetRedfishProbeIntervalInSeconds(interval string) { + ensureConfigInitialized() + instance.Redfish.ProbeIntervalInSeconds = interval } -// InitModelConfigMap initializes map of config from MODEL_CONFIG -func InitModelConfigMap() { - ModelConfigValues = GetModelConfigMap() 
+func SetRedfishSkipSSLVerify(skipSSLVerify bool) { + ensureConfigInitialized() + instance.Redfish.SkipSSLVerify = skipSSLVerify } +// SetEnabledEBPFCgroupID enables or disables eBPF code to collect cgroup ID +// based on kernel version and cgroup version. // SetEnabledEBPFCgroupID enables the eBPF code to collect cgroup id if the system has kernel version > 4.18 func SetEnabledEBPFCgroupID(enabled bool) { // set to false if any config source set it to false - enabled = enabled && EnabledEBPFCgroupID + enabled = enabled && instance.Kepler.EnabledEBPFCgroupID klog.Infoln("using gCgroup ID in the BPF program:", enabled) - KernelVersion = getKernelVersion(c) - klog.Infoln("kernel version:", KernelVersion) - if (enabled) && (KernelVersion >= cGroupIDMinKernelVersion) && (isCGroupV2(c)) { - EnabledEBPFCgroupID = true + instance.KernelVersion = getKernelVersion(instance) + klog.Infoln("kernel version:", instance.KernelVersion) + if (enabled) && (instance.KernelVersion >= cGroupIDMinKernelVersion) && (isCGroupV2(instance)) { + instance.Kepler.EnabledEBPFCgroupID = true } else { - EnabledEBPFCgroupID = false + instance.Kepler.EnabledEBPFCgroupID = false } } // SetEnabledHardwareCounterMetrics enables the exposure of hardware counter metrics func SetEnabledHardwareCounterMetrics(enabled bool) { + ensureConfigInitialized() // set to false is any config source set it to false - ExposeHardwareCounterMetrics = enabled && ExposeHardwareCounterMetrics + instance.Kepler.ExposeHardwareCounterMetrics = enabled && instance.Kepler.ExposeHardwareCounterMetrics } // SetEnabledIdlePower allows enabling idle power exposure in Kepler's metrics. When direct power metrics access is available, @@ -297,73 +315,93 @@ func SetEnabledHardwareCounterMetrics(enabled bool) { // Idle power prediction is limited to bare-metal or single VM setups. 
// Know the number of running VMs becomes crucial for achieving a fair distribution of idle power, particularly when following the GHG (Greenhouse Gas) protocol. func SetEnabledIdlePower(enabled bool) { + ensureConfigInitialized() // set to true is any config source set it to true or if system power metrics are available - ExposeIdlePowerMetrics = enabled || ExposeIdlePowerMetrics - if ExposeIdlePowerMetrics { + instance.Kepler.ExposeIdlePowerMetrics = enabled || instance.Kepler.ExposeIdlePowerMetrics + if instance.Kepler.ExposeIdlePowerMetrics { klog.Infoln("The Idle power will be exposed. Are you running on Baremetal or using single VM per node?") } } -// IsIdlePowerEnabled always return true if Kepler has access to system power metrics. -// However, if pre-trained power models are being used, Kepler should only expose metrics if the user is aware of the implications. -func IsIdlePowerEnabled() bool { - return ExposeIdlePowerMetrics +// SetEnabledGPU enables the exposure of gpu metrics +func SetEnabledGPU(enabled bool) { + ensureConfigInitialized() + // set to true if any config source set it to true + instance.Kepler.EnabledGPU = enabled || instance.Kepler.EnabledGPU } -// IsExposeProcessStatsEnabled returns false if process metrics are disabled to minimize overhead in the Kepler standalone mode. -func IsExposeProcessStatsEnabled() bool { - return EnableProcessStats +func SetModelServerEnable(enabled bool) { + ensureConfigInitialized() + instance.Model.ModelServerEnable = enabled || instance.Model.ModelServerEnable } -// IsExposeContainerStatsEnabled returns false if container metrics are disabled to minimize overhead in the Kepler standalone mode. 
-func IsExposeContainerStatsEnabled() bool { - return ExposeContainerStats +// SetEnabledMSR enables the exposure of MSR metrics +func SetEnabledMSR(enabled bool) { + ensureConfigInitialized() + // set to true if any config source set it to true + instance.Kepler.EnabledMSR = enabled || instance.Kepler.EnabledMSR } -// IsExposeVMStatsEnabled returns false if VM metrics are disabled to minimize overhead. -func IsExposeVMStatsEnabled() bool { - return ExposeVMStats +// SetKubeConfig set kubeconfig file +func SetKubeConfig(k string) { + ensureConfigInitialized() + instance.Kepler.KubeConfig = k } -// IsExposeBPFMetricsEnabled returns false if BPF Metrics metrics are disabled to minimize overhead. -func IsExposeBPFMetricsEnabled() bool { - return ExposeBPFMetrics +// SetEnableAPIServer enables Kepler to watch apiserver +func SetEnableAPIServer(enabled bool) { + ensureConfigInitialized() + instance.Kepler.EnableAPIServer = enabled } -// IsExposeComponentPowerEnabled returns false if component power metrics are disabled to minimize overhead. 
-func IsExposeComponentPowerEnabled() bool { - return ExposeComponentPower +func SetEstimatorConfig(modelName, selectFilter string) { + ensureConfigInitialized() + instance.Kepler.EstimatorModel = modelName + instance.Kepler.EstimatorSelectFilter = selectFilter } -// SetEnabledGPU enables the exposure of gpu metrics -func SetEnabledGPU(enabled bool) { - // set to true if any config source set it to true - EnabledGPU = enabled || EnabledGPU +func SetModelServerEndpoint(serverEndpoint string) { + ensureConfigInitialized() + instance.Model.ModelServerEndpoint = serverEndpoint } -// SetEnabledMSR enables the exposure of MSR metrics -func SetEnabledMSR(enabled bool) { - // set to true if any config source set it to true - EnabledMSR = enabled || EnabledMSR +func SetMachineSpecFilePath(specFilePath string) { + ensureConfigInitialized() + instance.Kepler.MachineSpecFilePath = specFilePath } -// SetKubeConfig set kubeconfig file -func SetKubeConfig(k string) { - KubeConfig = k +// GetMachineSpec initializes a map of MachineSpecValues from MACHINE_SPEC +func GetMachineSpec() *MachineSpec { + ensureConfigInitialized() + if instance.Kepler.MachineSpecFilePath != "" { + if spec, err := readMachineSpec(instance.Kepler.MachineSpecFilePath); err == nil { + return spec + } else { + klog.Warningf("failed to read spec from %s: %v, use default machine spec", instance.Kepler.MachineSpecFilePath, err) + } + } + return getDefaultMachineSpec() } -// SetEnableAPIServer enables Kepler to watch apiserver -func SetEnableAPIServer(enabled bool) { - EnableAPIServer = enabled +func GetMetricPath(cmdSet string) string { + return getConfig(metricPathKey, cmdSet) +} + +func GetBindAddress(cmdSet string) string { + return getConfig(bindAddressKey, cmdSet) +} + +func SetGPUUsageMetric(metric string) { + instance.Metrics.GPUUsageMetric = metric } -func (c config) getUnixName() (unix.Utsname, error) { +func (c *Config) getUnixName() (unix.Utsname, error) { var utsname unix.Utsname err := 
unix.Uname(&utsname) return utsname, err } -func (c config) getCgroupV2File() string { +func (c *Config) getCgroupV2File() string { return cGroupV2Path } @@ -408,28 +446,80 @@ func isCGroupV2(c Client) bool { // Get cgroup version, return 1 or 2 func GetCGroupVersion() int { - if isCGroupV2(c) { + if isCGroupV2(instance) { return 2 } else { return 1 } } -func SetEstimatorConfig(modelName, selectFilter string) { - EstimatorModel = modelName - EstimatorSelectFilter = selectFilter +// InitModelConfigMap initializes map of config from MODEL_CONFIG +func InitModelConfigMap() { + ensureConfigInitialized() + if instance.Model.ModelConfigValues == nil { + instance.Model.ModelConfigValues = GetModelConfigMap() + } } -func SetModelServerEndpoint(serverEndpoint string) { - ModelServerEndpoint = serverEndpoint +// ensureConfigInitialized checks if the instance is initialized, and if not, initializes it. +func ensureConfigInitialized() { + if instance == nil { + once.Do(func() { + instance = newConfig() + }) + } } -func GetMetricPath(cmdSet string) string { - return getConfig(MetricPathKey, cmdSet) +// IsIdlePowerEnabled always returns true if Kepler has access to system power metrics. +// However, if pre-trained power models are being used, Kepler should only expose metrics if the user is aware of the implications. +func IsIdlePowerEnabled() bool { + ensureConfigInitialized() + return instance.Kepler.ExposeIdlePowerMetrics } -func GetBindAddress(cmdSet string) string { - return getConfig(BindAddressKey, cmdSet) +// IsExposeProcessStatsEnabled returns false if process metrics are disabled to minimize overhead in the Kepler standalone mode. +func IsExposeProcessStatsEnabled() bool { + ensureConfigInitialized() + return instance.Kepler.EnableProcessStats +} + +// IsExposeContainerStatsEnabled returns false if container metrics are disabled to minimize overhead in the Kepler standalone mode.
+func IsExposeContainerStatsEnabled() bool { + ensureConfigInitialized() + return instance.Kepler.ExposeContainerStats +} + +// IsExposeVMStatsEnabled returns false if VM metrics are disabled to minimize overhead. +func IsExposeVMStatsEnabled() bool { + ensureConfigInitialized() + return instance.Kepler.ExposeVMStats +} + +// IsExposeBPFMetricsEnabled returns false if BPF metrics are disabled to minimize overhead. +func IsExposeBPFMetricsEnabled() bool { + ensureConfigInitialized() + return instance.Kepler.ExposeBPFMetrics +} + +// IsExposeComponentPowerEnabled returns false if component power metrics are disabled to minimize overhead. +func IsExposeComponentPowerEnabled() bool { + ensureConfigInitialized() + return instance.Kepler.ExposeComponentPower +} + +func IsEnabledMSR() bool { + ensureConfigInitialized() + return instance.Kepler.EnabledMSR +} + +func IsModelServerEnabled() bool { + ensureConfigInitialized() + return instance.Model.ModelServerEnable +} + +func ModelServerEndpoint() string { + ensureConfigInitialized() + return instance.Model.ModelServerEndpoint } func GetModelConfigMap() map[string]string { @@ -447,14 +537,134 @@ func GetModelConfigMap() map[string]string { return configMap } -func SetGpuUsageMetric(metric string) { - GpuUsageMetric = metric -} - func GetLibvirtMetadataURI() string { - return LibvirtMetadataURI + ensureConfigInitialized() + return instance.Libvirt.MetadataURI } func GetLibvirtMetadataToken() string { - return LibvirtMetadataToken + ensureConfigInitialized() + return instance.Libvirt.MetadataToken +} + +func ExposeIRQCounterMetrics() bool { + ensureConfigInitialized() + return instance.Kepler.ExposeIRQCounterMetrics +} + +func GetBPFSampleRate() int { + ensureConfigInitialized() + return instance.Kepler.BPFSampleRate +} + +func GetRedfishCredFilePath() string { + ensureConfigInitialized() + return instance.Redfish.CredFilePath +} +func GetRedfishProbeIntervalInSeconds() int { + ensureConfigInitialized() + // convert
string "redfishProbeIntervalInSeconds" to int + probeInterval, err := strconv.Atoi(instance.Redfish.ProbeIntervalInSeconds) + if err != nil { + klog.Warning("failed to convert redfishProbeIntervalInSeconds to int", err) + return 60 + } + return probeInterval +} + +func GetRedfishSkipSSLVerify() bool { + ensureConfigInitialized() + return instance.Redfish.SkipSSLVerify +} +func GetMockACPIPowerPath() string { + ensureConfigInitialized() + return instance.Kepler.MockACPIPowerPath +} + +func ExposeHardwareCounterMetrics() bool { + ensureConfigInitialized() + return instance.Kepler.ExposeHardwareCounterMetrics +} + +func EnabledGPU() bool { + ensureConfigInitialized() + return instance.Kepler.EnabledGPU +} + +func SamplePeriodSec() uint64 { + ensureConfigInitialized() + return instance.SamplePeriodSec +} + +func CoreUsageMetric() string { + ensureConfigInitialized() + return instance.Metrics.CoreUsageMetric +} + +func DRAMUsageMetric() string { + ensureConfigInitialized() + return instance.Metrics.DRAMUsageMetric +} + +func GPUUsageMetric() string { + ensureConfigInitialized() + return instance.Metrics.GPUUsageMetric +} + +func CPUArchOverride() string { + ensureConfigInitialized() + return instance.Kepler.CPUArchOverride +} + +func GeneralUsageMetric() string { + ensureConfigInitialized() + return instance.Metrics.GeneralUsageMetric +} + +func KubeConfig() string { + ensureConfigInitialized() + return instance.Kepler.KubeConfig +} + +func EnabledEBPFCgroupID() bool { + ensureConfigInitialized() + return instance.Kepler.EnabledEBPFCgroupID +} + +func NodePlatformPowerKey() string { + ensureConfigInitialized() + return instance.Model.NodePlatformPowerKey +} + +func NodeComponentsPowerKey() string { + ensureConfigInitialized() + return instance.Model.NodeComponentsPowerKey +} + +func ContainerPlatformPowerKey() string { + ensureConfigInitialized() + return instance.Model.ContainerPlatformPowerKey +} + +func ModelConfigValues(k string) string { + ensureConfigInitialized() 
+ return instance.Model.ModelConfigValues[k] +} + +func ContainerComponentsPowerKey() string { + ensureConfigInitialized() + return instance.Model.ContainerComponentsPowerKey +} +func ProcessPlatformPowerKey() string { + ensureConfigInitialized() + return instance.Model.ProcessPlatformPowerKey +} +func ProcessComponentsPowerKey() string { + ensureConfigInitialized() + return instance.Model.ProcessComponentsPowerKey +} + +func APIServerEnabled() bool { + ensureConfigInitialized() + return instance.Kepler.EnableAPIServer } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 50836f91b3..bc338f3e3a 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -115,11 +115,12 @@ var _ = Describe("Test Configuration", func() { Expect(float32(-1)).To(Equal(getKernelVersion(mockc))) }) It("Test real kernel version", func() { + conf := GetConfig() // we assume running on Linux env should be bigger than 3.0 // env now, so make it 3.0 as minimum test: switch runtime.GOOS { case "linux": - Expect(true).To(Equal(getKernelVersion(c) > 3.0)) + Expect(true).To(Equal(getKernelVersion(conf) > 3.0)) default: // no test } diff --git a/pkg/config/types.go b/pkg/config/types.go index dc7a5ee475..961ef1ddc0 100644 --- a/pkg/config/types.go +++ b/pkg/config/types.go @@ -68,4 +68,39 @@ const ( IdleEnergyInGPU = "idle_energy_in_gpu" IdleEnergyInOther = "idle_energy_in_other" IdleEnergyInPlatform = "idle_energy_in_platform" + + cGroupIDMinKernelVersion = 4.18 + // If this file is present, cgroups v2 is enabled on that node. 
+ cGroupV2Path = "/sys/fs/cgroup/cgroup.controllers" + metricPathKey = "METRIC_PATH" + bindAddressKey = "BIND_ADDRESS" + // model_parameter_attributes + EstimatorEnabledKey = "ESTIMATOR" + LocalRegressorEnabledKey = "LOCAL_REGRESSOR" + InitModelURLKey = "INIT_URL" + FixedTrainerNameKey = "TRAINER" + ModelFiltersKey = "FILTERS" + DefaultTrainerName = "SGDRegressorTrainer" + // Local defaults + defaultMetricValue = "" + defaultNamespace = "kepler" + defaultModelServerPort = "8100" + defaultModelRequestPath = "/model" + defaultMaxLookupRetry = 500 + // MaxIRQ is the maximum number of IRQs to be monitored + MaxIRQ = 10 + // defaultSamplePeriodSec is the time in seconds that the reader will wait before reading the metrics again + defaultSamplePeriodSec = 3 + configDir = "/etc/kepler/kepler.config" + defaultKubeConfig = "" + defaultBPFSampleRate = 0 + defaultCPUArchOverride = "" + // model_parameter_prefix + defaultNodePlatformPowerKey = "NODE_TOTAL" + defaultNodeComponentsPowerKey = "NODE_COMPONENTS" + defaultContainerPlatformPowerKey = "CONTAINER_TOTAL" + defaultContainerComponentsPowerKey = "CONTAINER_COMPONENTS" + defaultProcessPlatformPowerKey = "PROCESS_TOTAL" + defaultProcessComponentsPowerKey = "PROCESS_COMPONENTS" + DefaultMachineSpecFilePath = "/etc/kepler/models/machine/spec.json" ) diff --git a/pkg/kubernetes/watcher.go b/pkg/kubernetes/watcher.go index 3d67e7e9b7..65cd1f1155 100644 --- a/pkg/kubernetes/watcher.go +++ b/pkg/kubernetes/watcher.go @@ -76,14 +76,14 @@ type ObjListWatcher struct { func newK8sClient() *kubernetes.Clientset { var restConf *rest.Config var err error - if config.KubeConfig == "" { + if config.KubeConfig() == "" { // creates the in-cluster config restConf, err = rest.InClusterConfig() klog.Infoln("Using in cluster k8s config") } else { // use the current context in kubeconfig - restConf, err = clientcmd.BuildConfigFromFlags("", config.KubeConfig) - klog.Infoln("Using out cluster k8s config: ", config.KubeConfig) + restConf, err = 
clientcmd.BuildConfigFromFlags("", config.KubeConfig()) + klog.Infoln("Using out cluster k8s config: ", config.KubeConfig()) } if err != nil { klog.Infof("failed to get config: %v", err) @@ -105,11 +105,11 @@ func NewObjListWatcher(bpfSupportedMetrics bpf.SupportedMetrics) (*ObjListWatche bpfSupportedMetrics: bpfSupportedMetrics, workqueue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), } - if w.k8sCli == nil || !config.EnableAPIServer { + if w.k8sCli == nil || !config.APIServerEnabled() { return w, nil } optionsModifier := func(options *metav1.ListOptions) { - options.FieldSelector = fields.Set{"spec.nodeName": stats.GetNodeName()}.AsSelector().String() // to filter events per node + options.FieldSelector = fields.Set{"spec.nodeName": stats.NodeName()}.AsSelector().String() // to filter events per node } objListWatcher := cache.NewFilteredListWatchFromClient( w.k8sCli.CoreV1().RESTClient(), diff --git a/pkg/libvirt/resolve_vm_test.go b/pkg/libvirt/resolve_vm_test.go index 657edddeff..308f67ce15 100644 --- a/pkg/libvirt/resolve_vm_test.go +++ b/pkg/libvirt/resolve_vm_test.go @@ -23,6 +23,7 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + "github.com/sustainable-computing-io/kepler/pkg/config" ) const ( @@ -37,6 +38,7 @@ var _ = Describe("Test LibVirt", func() { BeforeEach(func() { mockProcDir = createTempDir() + config.GetConfig() }) AfterEach(func() { diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 5094ce453d..c96c4c2e41 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -27,10 +27,6 @@ import ( "k8s.io/klog/v2" ) -var ( - samplePeriod = time.Duration(config.SamplePeriodSec * 1000 * uint64(time.Millisecond)) -) - type CollectorManager struct { // StatsCollector is responsible to collect resource and energy consumption metrics and calculate them when needed StatsCollector *collector.Collector @@ -70,6 +66,8 @@ func (m *CollectorManager) Start() error { return err } + samplePeriod := time.Duration((config.SamplePeriodSec()) * uint64(time.Second)) + go func() { ticker := time.NewTicker(samplePeriod) for { diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go index 466261d190..aeb68a899f 100644 --- a/pkg/metrics/metricfactory/metric_factory.go +++ b/pkg/metrics/metricfactory/metric_factory.go @@ -86,7 +86,7 @@ func SCMetricsPromDesc(context string, bpfSupportedMetrics bpf.SupportedMetrics) func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) { descriptions = make(map[string]*prometheus.Desc) - if config.EnabledGPU { + if config.EnabledGPU() { if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { for _, name := range consts.GPUMetricNames { descriptions[name] = resMetricsPromDesc(context, name, gpu.Device().Name()) diff --git a/pkg/metrics/node/metrics.go b/pkg/metrics/node/metrics.go index b11eabae5c..95e2f43d60 100644 --- a/pkg/metrics/node/metrics.go +++ b/pkg/metrics/node/metrics.go @@ -85,7 +85,7 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { // update node info ch <- c.collectors["info"].MustMetric(1, - 
stats.NodeCPUArchitecture, + stats.NodeCPUArchitecture(), components.GetSourceName(), platform.GetSourceName(), ) diff --git a/pkg/metrics/prometheus_collector_test.go b/pkg/metrics/prometheus_collector_test.go index 6e48757470..0d6e5173f1 100644 --- a/pkg/metrics/prometheus_collector_test.go +++ b/pkg/metrics/prometheus_collector_test.go @@ -89,8 +89,8 @@ var _ = Describe("Test Prometheus Collector Unit", func() { nodeStats.UpdateDynEnergy() model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), - stats.NodeMetadataFeatureNames, - stats.NodeMetadataFeatureValues, + stats.NodeMetadataFeatureNames(), + stats.NodeMetadataFeatureValues(), bpfSupportedMetrics) model.UpdateProcessEnergy(processStats, &nodeStats) diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go index fd1a124a36..43f8eb43db 100644 --- a/pkg/metrics/utils/utils.go +++ b/pkg/metrics/utils/utils.go @@ -36,7 +36,7 @@ func CollectEnergyMetrics(ch chan<- prometheus.Metric, instance interface{}, col if config.IsExposeComponentPowerEnabled() { // collect the dynamic energy metrics for i, collectorName := range consts.EnergyMetricNames { - if collectorName == config.GPU && !config.EnabledGPU { + if collectorName == config.GPU && !config.EnabledGPU() { continue } collectEnergy(ch, instance, consts.DynEnergyMetricNames[i], "dynamic", collectors[collectorName]) @@ -57,7 +57,7 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac for collectorName := range bpfSupportedMetrics.HardwareCounters { CollectResUtil(ch, instance, collectorName, collectors[collectorName]) } - if config.EnabledGPU { + if config.EnabledGPU() { if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { for _, collectorName := range consts.GPUMetricNames { CollectResUtil(ch, instance, collectorName, collectors[collectorName]) @@ -150,7 +150,7 @@ func collectEnergy(ch chan<- prometheus.Metric, instance interface{}, metricName if _, exist := 
node.EnergyUsage[metricName]; exist { for deviceID, utilization := range node.EnergyUsage[metricName] { value = float64(utilization.GetAggr()) / JouleMillijouleConversionFactor - labelValues = []string{deviceID, stats.NodeName, mode} + labelValues = []string{deviceID, stats.NodeName(), mode} collect(ch, collector, value, labelValues) } } @@ -208,7 +208,7 @@ func CollectResUtil(ch chan<- prometheus.Metric, instance interface{}, metricNam if _, exist := node.ResourceUsage[metricName]; exist { for deviceID, utilization := range node.ResourceUsage[metricName] { value = float64(utilization.GetAggr()) - labelValues = []string{deviceID, stats.NodeName} + labelValues = []string{deviceID, stats.NodeName()} collect(ch, collector, value, labelValues) } } diff --git a/pkg/model/benchmark_test.go b/pkg/model/benchmark_test.go index 71f3950766..d4f8a16f47 100644 --- a/pkg/model/benchmark_test.go +++ b/pkg/model/benchmark_test.go @@ -44,7 +44,7 @@ func benchmarkNtesting(b *testing.B, processNumber int) { // The default estimator model is the ratio bpfSupportedMetrics := bpf.DefaultSupportedMetrics() - model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames, stats.NodeMetadataFeatureValues, bpfSupportedMetrics) + model.CreatePowerEstimatorModels(stats.GetProcessFeatureNames(bpfSupportedMetrics), stats.NodeMetadataFeatureNames(), stats.NodeMetadataFeatureValues(), bpfSupportedMetrics) // update container and node metrics b.ReportAllocs() diff --git a/pkg/model/estimator/local/ratio.go b/pkg/model/estimator/local/ratio.go index 7c8934ee36..42411fffd1 100644 --- a/pkg/model/estimator/local/ratio.go +++ b/pkg/model/estimator/local/ratio.go @@ -40,7 +40,7 @@ const ( DramUsageMetric UncoreUsageMetric OtherUsageMetric - GpuUsageMetric + GPUUsageMetric PkgDynPower CoreDynPower DramDynPower @@ -180,7 +180,7 @@ func (r *RatioPowerModel) GetGPUPower(isIdlePower bool) ([]uint64, error) { if isIdlePower { processPower = 
uint64Division(r.nodeFeatureValues[GpuIdlePower], numProcesses) } else { - processPower = r.getPowerByRatio(processIdx, int(GpuUsageMetric), int(GpuDynPower), numProcesses) + processPower = r.getPowerByRatio(processIdx, int(GPUUsageMetric), int(GpuDynPower), numProcesses) } nodeComponentsPowerOfAllProcesses = append(nodeComponentsPowerOfAllProcesses, processPower) } diff --git a/pkg/model/estimator/local/ratio_model_test.go b/pkg/model/estimator/local/ratio_model_test.go index ab3f3c241e..55e9d7a2c8 100644 --- a/pkg/model/estimator/local/ratio_model_test.go +++ b/pkg/model/estimator/local/ratio_model_test.go @@ -50,37 +50,37 @@ var _ = Describe("Test Ratio Unit", func() { val = pMetric.ResourceUsage[config.CPUTime][stats.MockedSocketID].GetDelta() nodeStats.ResourceUsage[config.CPUTime].AddDeltaStat(stats.MockedSocketID, val) } - Expect(nodeStats.ResourceUsage[config.CoreUsageMetric][utils.GenericSocketID].GetDelta()).Should(BeEquivalentTo(90000)) + Expect(nodeStats.ResourceUsage[config.CoreUsageMetric()][utils.GenericSocketID].GetDelta()).Should(BeEquivalentTo(90000)) // The default estimator model is the ratio model := RatioPowerModel{ ProcessFeatureNames: []string{ - config.CoreUsageMetric, // for PKG resource usage - config.CoreUsageMetric, // for CORE resource usage - config.DRAMUsageMetric, // for DRAM resource usage - config.GeneralUsageMetric, // for UNCORE resource usage - config.GeneralUsageMetric, // for OTHER resource usage - config.GpuUsageMetric, // for GPU resource usage + config.CoreUsageMetric(), // for PKG resource usage + config.CoreUsageMetric(), // for CORE resource usage + config.DRAMUsageMetric(), // for DRAM resource usage + config.GeneralUsageMetric(), // for UNCORE resource usage + config.GeneralUsageMetric(), // for OTHER resource usage + config.GPUUsageMetric(), // for GPU resource usage }, NodeFeatureNames: []string{ - config.CoreUsageMetric, // for PKG resource usage - config.CoreUsageMetric, // for CORE resource usage - 
config.DRAMUsageMetric, // for DRAM resource usage - config.GeneralUsageMetric, // for UNCORE resource usage - config.GeneralUsageMetric, // for OTHER resource usage - config.GpuUsageMetric, // for GPU resource usage - config.DynEnergyInPkg, // for dynamic PKG power consumption - config.DynEnergyInCore, // for dynamic CORE power consumption - config.DynEnergyInDRAM, // for dynamic PKG power consumption - config.DynEnergyInUnCore, // for dynamic UNCORE power consumption - config.DynEnergyInOther, // for dynamic OTHER power consumption - config.DynEnergyInGPU, // for dynamic GPU power consumption - config.IdleEnergyInPkg, // for idle PKG power consumption - config.IdleEnergyInCore, // for idle CORE power consumption - config.IdleEnergyInDRAM, // for idle PKG power consumption - config.IdleEnergyInUnCore, // for idle UNCORE power consumption - config.IdleEnergyInOther, // for idle OTHER power consumption - config.IdleEnergyInGPU, // for idle GPU power consumption + config.CoreUsageMetric(), // for PKG resource usage + config.CoreUsageMetric(), // for CORE resource usage + config.DRAMUsageMetric(), // for DRAM resource usage + config.GeneralUsageMetric(), // for UNCORE resource usage + config.GeneralUsageMetric(), // for OTHER resource usage + config.GPUUsageMetric(), // for GPU resource usage + config.DynEnergyInPkg, // for dynamic PKG power consumption + config.DynEnergyInCore, // for dynamic CORE power consumption + config.DynEnergyInDRAM, // for dynamic PKG power consumption + config.DynEnergyInUnCore, // for dynamic UNCORE power consumption + config.DynEnergyInOther, // for dynamic OTHER power consumption + config.DynEnergyInGPU, // for dynamic GPU power consumption + config.IdleEnergyInPkg, // for idle PKG power consumption + config.IdleEnergyInCore, // for idle CORE power consumption + config.IdleEnergyInDRAM, // for idle PKG power consumption + config.IdleEnergyInUnCore, // for idle UNCORE power consumption + config.IdleEnergyInOther, // for idle OTHER power 
consumption + config.IdleEnergyInGPU, // for idle GPU power consumption }, } model.ResetSampleIdx() diff --git a/pkg/model/estimator/local/regressor/regressor.go b/pkg/model/estimator/local/regressor/regressor.go index b800922d07..2f58ecfbf1 100644 --- a/pkg/model/estimator/local/regressor/regressor.go +++ b/pkg/model/estimator/local/regressor/regressor.go @@ -87,7 +87,7 @@ func (r *Regressor) Start() error { r.enabled = false r.coreRatio = 1 // try getting weight from model server if it is enabled - if config.ModelServerEnable && config.ModelServerEndpoint != "" { + if config.IsModelServerEnabled() && config.ModelServerEndpoint() != "" { weight, err = r.getWeightFromServer() klog.V(3).Infof("Regression Model (%s): getWeightFromServer: %v (error: %v)", outputStr, weight, err) } diff --git a/pkg/model/estimator/local/regressor/regressor_test.go b/pkg/model/estimator/local/regressor/regressor_test.go index ebd9138a9c..8f2d24eb77 100644 --- a/pkg/model/estimator/local/regressor/regressor_test.go +++ b/pkg/model/estimator/local/regressor/regressor_test.go @@ -122,8 +122,9 @@ func genHandlerFunc(curvefit []float64, trainerName string) (handlerFunc func(w } func genRegressor(outputType types.ModelOutputType, energySource, modelServerEndpoint, modelWeightsURL, modelWeightFilepath, trainerName string) Regressor { - config.ModelServerEnable = true - config.ModelServerEndpoint = modelServerEndpoint + config.GetConfig() + config.SetModelServerEnable(true) + config.SetModelServerEndpoint(modelServerEndpoint) return Regressor{ ModelServerEndpoint: modelServerEndpoint, OutputType: outputType, diff --git a/pkg/model/model.go b/pkg/model/model.go index 7c6f4d4a15..7aa7cc35ab 100644 --- a/pkg/model/model.go +++ b/pkg/model/model.go @@ -104,7 +104,7 @@ func createPowerModelEstimator(modelConfig *types.ModelConfig) (PowerModelInterf trainerName = config.DefaultTrainerName } model := ®ressor.Regressor{ - ModelServerEndpoint: config.ModelServerEndpoint, + ModelServerEndpoint: 
config.ModelServerEndpoint(), OutputType: modelConfig.ModelOutputType, EnergySource: modelConfig.EnergySource, TrainerName: trainerName, @@ -190,18 +190,18 @@ func getModelConfigKey(modelItem, attribute string) string { // getPowerModelType return the model type for a given power source, such as platform or components power sources // The default power model type is Ratio func getPowerModelType(powerSourceTarget string) (modelType types.ModelType) { - useEstimatorSidecarStr := config.ModelConfigValues[getModelConfigKey(powerSourceTarget, config.EstimatorEnabledKey)] + useEstimatorSidecarStr := config.ModelConfigValues(getModelConfigKey(powerSourceTarget, config.EstimatorEnabledKey)) if strings.EqualFold(useEstimatorSidecarStr, "true") { modelType = types.EstimatorSidecar return } - useLocalRegressor := config.ModelConfigValues[getModelConfigKey(powerSourceTarget, config.LocalRegressorEnabledKey)] + useLocalRegressor := config.ModelConfigValues(getModelConfigKey(powerSourceTarget, config.LocalRegressorEnabledKey)) if strings.EqualFold(useLocalRegressor, "true") { modelType = types.Regressor return } // set the default node power model as Regressor - if powerSourceTarget == config.NodePlatformPowerKey || powerSourceTarget == config.NodeComponentsPowerKey { + if powerSourceTarget == config.NodePlatformPowerKey() || powerSourceTarget == config.NodeComponentsPowerKey() { modelType = types.Regressor return } @@ -212,20 +212,20 @@ func getPowerModelType(powerSourceTarget string) (modelType types.ModelType) { // getPowerModelTrainerName return the trainer name for a given power source, such as platform or components power sources func getPowerModelTrainerName(powerSourceTarget string) (trainerName string) { - trainerName = config.ModelConfigValues[getModelConfigKey(powerSourceTarget, config.FixedTrainerNameKey)] + trainerName = config.ModelConfigValues(getModelConfigKey(powerSourceTarget, config.FixedTrainerNameKey)) return } // getPowerModelFilter return the model filter 
for a given power source, such as platform or components power sources // The model filter is used to select a model, for example selecting a model with the acceptable error: 'mae:0.5' func getPowerModelFilter(powerSourceTarget string) (selectFilter string) { - selectFilter = config.ModelConfigValues[getModelConfigKey(powerSourceTarget, config.ModelFiltersKey)] + selectFilter = config.ModelConfigValues(getModelConfigKey(powerSourceTarget, config.ModelFiltersKey)) return } // getPowerModelDownloadURL return the url to download the pre-trained power model for a given power source, such as platform or components power sources func getPowerModelDownloadURL(powerSourceTarget string) (url string) { - url = config.ModelConfigValues[getModelConfigKey(powerSourceTarget, config.InitModelURLKey)] + url = config.ModelConfigValues(getModelConfigKey(powerSourceTarget, config.InitModelURLKey)) return } @@ -235,17 +235,17 @@ func getPowerModelDownloadURL(powerSourceTarget string) (url string) { // PlatformEnergySource values. 
Therefore, we must not replace it here func getPowerModelEnergySource(powerSourceTarget string) (energySource string) { switch powerSourceTarget { - case config.ContainerPlatformPowerKey: + case config.ContainerPlatformPowerKey(): return types.PlatformEnergySource - case config.ContainerComponentsPowerKey: + case config.ContainerComponentsPowerKey(): return types.ComponentEnergySource - case config.ProcessPlatformPowerKey: + case config.ProcessPlatformPowerKey(): return types.PlatformEnergySource - case config.ProcessComponentsPowerKey: + case config.ProcessComponentsPowerKey(): return types.ComponentEnergySource - case config.NodePlatformPowerKey: + case config.NodePlatformPowerKey(): return types.PlatformEnergySource - case config.NodeComponentsPowerKey: + case config.NodeComponentsPowerKey(): return types.ComponentEnergySource } return "" @@ -256,13 +256,13 @@ func getPowerModelEnergySource(powerSourceTarget string) (energySource string) { // AbsPower for Node, DynPower for process and process func getPowerModelOutputType(powerSourceTarget string) types.ModelOutputType { switch powerSourceTarget { - case config.ProcessComponentsPowerKey: + case config.ProcessComponentsPowerKey(): return types.DynPower - case config.ProcessPlatformPowerKey: + case config.ProcessPlatformPowerKey(): return types.DynPower - case config.NodePlatformPowerKey: + case config.NodePlatformPowerKey(): return types.AbsPower - case config.NodeComponentsPowerKey: + case config.NodeComponentsPowerKey(): return types.AbsPower } return types.Unsupported @@ -271,9 +271,9 @@ func getPowerModelOutputType(powerSourceTarget string) types.ModelOutputType { // isNodeLevel return the true if current power key is node platform or node components func isNodeLevel(powerSourceTarget string) bool { switch powerSourceTarget { - case config.NodePlatformPowerKey: + case config.NodePlatformPowerKey(): return true - case config.NodeComponentsPowerKey: + case config.NodeComponentsPowerKey(): return true } return 
false diff --git a/pkg/model/node_component_energy.go b/pkg/model/node_component_energy.go index b60464dc87..78d2c38f23 100644 --- a/pkg/model/node_component_energy.go +++ b/pkg/model/node_component_energy.go @@ -32,7 +32,7 @@ var nodeComponentPowerModel PowerModelInterface // createNodeComponentPowerModelConfig: the node component power model url must be set by default. func createNodeComponentPowerModelConfig(nodeFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string) *types.ModelConfig { - modelConfig := CreatePowerModelConfig(config.NodeComponentsPowerKey) + modelConfig := CreatePowerModelConfig(config.NodeComponentsPowerKey()) if modelConfig.InitModelURL == "" { modelConfig.InitModelFilepath = config.GetDefaultPowerModelURL(modelConfig.ModelOutputType.String(), types.ComponentEnergySource) } @@ -111,9 +111,9 @@ func UpdateNodeComponentIdleEnergy(nodeMetrics *stats.NodeStats) { func addEnergy(nodeMetrics *stats.NodeStats, metrics []string, isIdle bool) { for socket, power := range GetNodeComponentPowers(nodeMetrics, isIdle) { strID := fmt.Sprintf("%d", socket) - nodeMetrics.EnergyUsage[metrics[0]].SetDeltaStat(strID, power.Core*config.SamplePeriodSec) - nodeMetrics.EnergyUsage[metrics[1]].SetDeltaStat(strID, power.DRAM*config.SamplePeriodSec) - nodeMetrics.EnergyUsage[metrics[2]].SetDeltaStat(strID, power.Uncore*config.SamplePeriodSec) - nodeMetrics.EnergyUsage[metrics[3]].SetDeltaStat(strID, power.Pkg*config.SamplePeriodSec) + nodeMetrics.EnergyUsage[metrics[0]].SetDeltaStat(strID, power.Core*config.SamplePeriodSec()) + nodeMetrics.EnergyUsage[metrics[1]].SetDeltaStat(strID, power.DRAM*config.SamplePeriodSec()) + nodeMetrics.EnergyUsage[metrics[2]].SetDeltaStat(strID, power.Uncore*config.SamplePeriodSec()) + nodeMetrics.EnergyUsage[metrics[3]].SetDeltaStat(strID, power.Pkg*config.SamplePeriodSec()) } } diff --git a/pkg/model/node_platform_energy.go b/pkg/model/node_platform_energy.go index 363974ada8..1617aa3d17 100644 --- 
a/pkg/model/node_platform_energy.go +++ b/pkg/model/node_platform_energy.go @@ -38,7 +38,7 @@ func CreateNodePlatformPowerEstimatorModel(nodeFeatureNames, systemMetaDataFeatu klog.Infof("Skipping creation of Node Platform Power Model since the system collection is supported") } - modelConfig := CreatePowerModelConfig(config.NodePlatformPowerKey) + modelConfig := CreatePowerModelConfig(config.NodePlatformPowerKey()) if modelConfig.InitModelURL == "" { modelConfig.InitModelFilepath = config.GetDefaultPowerModelURL(modelConfig.ModelOutputType.String(), types.PlatformEnergySource) } @@ -98,7 +98,7 @@ func GetNodePlatformPower(nodeMetrics *stats.NodeStats, isIdlePower bool) (platf func UpdateNodePlatformEnergy(nodeMetrics *stats.NodeStats) { platformPower := GetNodePlatformPower(nodeMetrics, absPower) for sourceID, power := range platformPower { - nodeMetrics.EnergyUsage[config.AbsEnergyInPlatform].SetDeltaStat(sourceID, power*config.SamplePeriodSec) + nodeMetrics.EnergyUsage[config.AbsEnergyInPlatform].SetDeltaStat(sourceID, power*config.SamplePeriodSec()) } } @@ -106,6 +106,6 @@ func UpdateNodePlatformEnergy(nodeMetrics *stats.NodeStats) { func UpdateNodePlatformIdleEnergy(nodeMetrics *stats.NodeStats) { platformPower := GetNodePlatformPower(nodeMetrics, idlePower) for sourceID, power := range platformPower { - nodeMetrics.EnergyUsage[config.IdleEnergyInPlatform].SetDeltaStat(sourceID, power*config.SamplePeriodSec) + nodeMetrics.EnergyUsage[config.IdleEnergyInPlatform].SetDeltaStat(sourceID, power*config.SamplePeriodSec()) } } diff --git a/pkg/model/process_energy.go b/pkg/model/process_energy.go index 68f4807bed..98fda79737 100644 --- a/pkg/model/process_energy.go +++ b/pkg/model/process_energy.go @@ -50,10 +50,10 @@ func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames // Ratio power model has different features than the other estimators. 
// Ratio power model has node resource and power consumption as features, as it is used to calculate the ratio. if modelConfig.ModelType == types.Ratio { - if powerSourceTarget == config.ProcessComponentsPowerKey { - pkgUsageMetric := config.CoreUsageMetric - coreUsageMetric := config.CoreUsageMetric - dramUsageMetric := config.DRAMUsageMetric + if powerSourceTarget == config.ProcessComponentsPowerKey() { + pkgUsageMetric := config.CoreUsageMetric() + coreUsageMetric := config.CoreUsageMetric() + dramUsageMetric := config.DRAMUsageMetric() if !bpfSupportedMetrics.HardwareCounters.Has(config.CPUTime) { // Given that there is no HW counter in some scenarios (e.g. on VMs), we have to use CPUTime data. // Although a busy CPU is more likely to be accessing memory the CPU utilization (CPUTime) does not directly @@ -62,12 +62,12 @@ func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames } // ProcessFeatureNames contains the metrics that represents the process resource utilization modelConfig.ProcessFeatureNames = []string{ - pkgUsageMetric, // for PKG resource usage - coreUsageMetric, // for CORE resource usage - dramUsageMetric, // for DRAM resource usage - config.GeneralUsageMetric, // for UNCORE resource usage - config.GeneralUsageMetric, // for OTHER resource usage - config.GpuUsageMetric, // for GPU resource usage + pkgUsageMetric, // for PKG resource usage + coreUsageMetric, // for CORE resource usage + dramUsageMetric, // for DRAM resource usage + config.GeneralUsageMetric(), // for UNCORE resource usage + config.GeneralUsageMetric(), // for OTHER resource usage + config.GPUUsageMetric(), // for GPU resource usage } // NodeFeatureNames contains the metrics that represents the node resource utilization plus the dynamic and idle power power consumption modelConfig.NodeFeatureNames = modelConfig.ProcessFeatureNames @@ -85,8 +85,8 @@ func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames config.IdleEnergyInOther, // 
for idle OTHER power consumption config.IdleEnergyInGPU, // for idle GPU power consumption }...) - } else if powerSourceTarget == config.ProcessPlatformPowerKey { - platformUsageMetric := config.CoreUsageMetric + } else if powerSourceTarget == config.ProcessPlatformPowerKey() { + platformUsageMetric := config.CoreUsageMetric() if !bpfSupportedMetrics.HardwareCounters.Has(config.CPUTime) { // Given that there is no HW counter in some scenarios (e.g. on VMs), we have to use CPUTime data. platformUsageMetric = config.CPUTime @@ -107,7 +107,7 @@ func createProcessPowerModelConfig(powerSourceTarget string, processFeatureNames func CreateProcessPowerEstimatorModel(processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string, bpfSupportedMetrics bpf.SupportedMetrics) { var err error - modelConfig := createProcessPowerModelConfig(config.ProcessPlatformPowerKey, processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues, types.PlatformEnergySource, bpfSupportedMetrics) + modelConfig := createProcessPowerModelConfig(config.ProcessPlatformPowerKey(), processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues, types.PlatformEnergySource, bpfSupportedMetrics) modelConfig.IsNodePowerModel = false processPlatformPowerModel, err = createPowerModelEstimator(modelConfig) if err == nil { @@ -117,7 +117,7 @@ func CreateProcessPowerEstimatorModel(processFeatureNames, systemMetaDataFeature klog.Infof("Failed to create %s Power Model to estimate Process Platform Power: %v\n", modelConfig.ModelType.String()+"/"+modelConfig.ModelOutputType.String(), err) } - modelConfig = createProcessPowerModelConfig(config.ProcessComponentsPowerKey, processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues, types.ComponentEnergySource, bpfSupportedMetrics) + modelConfig = createProcessPowerModelConfig(config.ProcessComponentsPowerKey(), processFeatureNames, systemMetaDataFeatureNames, systemMetaDataFeatureValues, 
types.ComponentEnergySource, bpfSupportedMetrics) modelConfig.IsNodePowerModel = false processComponentPowerModel, err = createPowerModelEstimator(modelConfig) if err == nil { @@ -195,7 +195,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta klog.V(5).Infoln("Could not estimate the Process Components Power") } // estimate the associated power consumption of GPU for each process - if config.EnabledGPU { + if config.EnabledGPU() { if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil { processGPUPower, errGPU = processComponentPowerModel.GetGPUPower(isIdlePower) if errGPU != nil { @@ -217,7 +217,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta if errComp == nil { // add PKG power consumption // since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, it is necessary to calculate the energy consumption for the entire waiting period - energy = processComponentsPower[i].Pkg * config.SamplePeriodSec + energy = processComponentsPower[i].Pkg * config.SamplePeriodSec() if isIdlePower { processesMetrics[processID].EnergyUsage[config.IdleEnergyInPkg].SetDeltaStat(utils.GenericSocketID, energy) } else { @@ -225,7 +225,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta } // add CORE power consumption - energy = processComponentsPower[i].Core * config.SamplePeriodSec + energy = processComponentsPower[i].Core * config.SamplePeriodSec() if isIdlePower { processesMetrics[processID].EnergyUsage[config.IdleEnergyInCore].SetDeltaStat(utils.GenericSocketID, energy) } else { @@ -233,7 +233,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta } // add DRAM power consumption - energy = processComponentsPower[i].DRAM * config.SamplePeriodSec + energy = processComponentsPower[i].DRAM * config.SamplePeriodSec() if isIdlePower { 
processesMetrics[processID].EnergyUsage[config.IdleEnergyInDRAM].SetDeltaStat(utils.GenericSocketID, energy) } else { @@ -241,7 +241,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta } // add Uncore power consumption - energy = processComponentsPower[i].Uncore * config.SamplePeriodSec + energy = processComponentsPower[i].Uncore * config.SamplePeriodSec() if isIdlePower { processesMetrics[processID].EnergyUsage[config.IdleEnergyInUnCore].SetDeltaStat(utils.GenericSocketID, energy) } else { @@ -250,7 +250,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta // add GPU power consumption if errGPU == nil { - energy = processGPUPower[i] * (config.SamplePeriodSec) + energy = processGPUPower[i] * (config.SamplePeriodSec()) if isIdlePower { processesMetrics[processID].EnergyUsage[config.IdleEnergyInGPU].SetDeltaStat(utils.GenericSocketID, energy) } else { @@ -260,7 +260,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta } if errPlat == nil { - energy = processPlatformPower[i] * config.SamplePeriodSec + energy = processPlatformPower[i] * config.SamplePeriodSec() if isIdlePower { processesMetrics[processID].EnergyUsage[config.IdleEnergyInPlatform].SetDeltaStat(utils.GenericSocketID, energy) } else { @@ -277,7 +277,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta } else { otherPower = processPlatformPower[i] - processComponentsPower[i].Pkg - processComponentsPower[i].DRAM } - energy = otherPower * config.SamplePeriodSec + energy = otherPower * config.SamplePeriodSec() if isIdlePower { processesMetrics[processID].EnergyUsage[config.IdleEnergyInOther].SetDeltaStat(utils.GenericSocketID, energy) } else { diff --git a/pkg/nodecred/csv_cred.go b/pkg/nodecred/csv_cred.go index bfa63db990..81ce46b9c0 100644 --- a/pkg/nodecred/csv_cred.go +++ b/pkg/nodecred/csv_cred.go @@ -61,7 +61,7 @@ func (c csvNodeCred) IsSupported(info map[string]string) 
bool { if filePath == "" { return false } else { - nodeName := metric_util.GetNodeName() + nodeName := metric_util.NodeName() // read file from filePath userName, password, host, err := readCSVFile(filePath, nodeName) if err != nil { diff --git a/pkg/nodecred/csv_cred_test.go b/pkg/nodecred/csv_cred_test.go index be5af4adbb..d0de354396 100644 --- a/pkg/nodecred/csv_cred_test.go +++ b/pkg/nodecred/csv_cred_test.go @@ -95,7 +95,7 @@ func TestIsSupported(t *testing.T) { // set ENV variable NODE_NAME to "node1" os.Setenv("NODE_NAME", "node1") // check if getNodeName() returns "node1" - nodeName := metric_util.GetNodeName() + nodeName := metric_util.NodeName() if nodeName != "node1" { t.Errorf("Expected nodeName: node1, got: %v", nodeName) } diff --git a/pkg/sensors/accelerator/device/sources/nvml.go b/pkg/sensors/accelerator/device/sources/nvml.go index 21b8e8353c..0db0e5570d 100644 --- a/pkg/sensors/accelerator/device/sources/nvml.go +++ b/pkg/sensors/accelerator/device/sources/nvml.go @@ -234,7 +234,7 @@ func (n *GPUNvml) ProcessResourceUtilizationPerDevice(dev any, since time.Durati } if !n.processUtilizationSupported { // If processUtilizationSupported is false, try deviceGetMPSComputeRunningProcesses_v3 to use memory usage to ratio power usage - config.GpuUsageMetric = config.GPUMemUtilization + config.GPUUsageMetric = config.GPUMemUtilization processInfo, ret := d.DeviceHandler.(nvml.Device).GetComputeRunningProcesses() if ret != nvml.SUCCESS { if ret == nvml.ERROR_NOT_FOUND { diff --git a/pkg/sensors/components/power.go b/pkg/sensors/components/power.go index 291a08504f..ad7932a0f2 100644 --- a/pkg/sensors/components/power.go +++ b/pkg/sensors/components/power.go @@ -62,7 +62,7 @@ func InitPowerImpl() { } msrImpl := &source.PowerMSR{} - if msrImpl.IsSystemCollectionSupported() && config.EnabledMSR { + if msrImpl.IsSystemCollectionSupported() && config.IsEnabledMSR() { klog.V(1).Infoln("use MSR to obtain power") powerImpl = msrImpl return diff --git 
a/pkg/sensors/platform/source/acpi.go b/pkg/sensors/platform/source/acpi.go index ba81e70904..1800da2f32 100644 --- a/pkg/sensors/platform/source/acpi.go +++ b/pkg/sensors/platform/source/acpi.go @@ -172,7 +172,7 @@ func (a *ACPI) GetAbsEnergyFromPlatform() (map[string]float64, error) { if err == nil { // since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, it is // necessary to calculate the energy consumption for the entire waiting period - power[sensorIDPrefix+strconv.Itoa(int(i))] = float64(currPower / 1000 * config.SamplePeriodSec) /*miliJoules*/ + power[sensorIDPrefix+strconv.Itoa(int(i))] = float64(currPower / 1000 * config.SamplePeriodSec()) /*miliJoules*/ } else { return power, err } diff --git a/pkg/sensors/platform/source/redfish_test.go b/pkg/sensors/platform/source/redfish_test.go index b62b7c9406..93542a81a5 100644 --- a/pkg/sensors/platform/source/redfish_test.go +++ b/pkg/sensors/platform/source/redfish_test.go @@ -22,9 +22,12 @@ import ( "net/http" "net/http/httptest" "testing" + + "github.com/sustainable-computing-io/kepler/pkg/config" ) func TestRedFishClient_IsPowerSupported(t *testing.T) { + config.GetConfig() // Create a mock HTTP server server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if r.URL.Path == "/redfish/v1/Systems" {