From c84bfd0bbecda576b93d6e7a3ea6d25097d7bcc7 Mon Sep 17 00:00:00 2001 From: Sunil Thaha Date: Wed, 28 Aug 2024 14:02:20 +1000 Subject: [PATCH 1/6] feat(models): log model source url Signed-off-by: Sunil Thaha --- pkg/model/node_component_energy.go | 7 +++++-- pkg/model/node_platform_energy.go | 7 +++++-- pkg/model/types/types.go | 16 ++++++++++++++-- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/pkg/model/node_component_energy.go b/pkg/model/node_component_energy.go index 8f31cef4f9..b60464dc87 100644 --- a/pkg/model/node_component_energy.go +++ b/pkg/model/node_component_energy.go @@ -55,11 +55,14 @@ func CreateNodeComponentPowerEstimatorModel(nodeFeatureNames, systemMetaDataFeat var err error nodeComponentPowerModel, err = createPowerModelEstimator(modelConfig) if err != nil { - klog.Errorf("Failed to create %s/%s Model to estimate Node Component Power: %v", modelConfig.ModelType, modelConfig.ModelOutputType, err) + klog.Errorf("Failed to create %s/%s Model from %s to estimate Node Component Power: %v", + modelConfig.ModelType, modelConfig.ModelOutputType, + modelConfig.SourceURL(), err) return } - klog.V(1).Infof("Using the %s/%s Model to estimate Node Component Power", modelConfig.ModelType, modelConfig.ModelOutputType) + klog.V(1).Infof("Using the %s/%s Model from %s to estimate Node Component Power", + modelConfig.ModelType, modelConfig.ModelOutputType, modelConfig.SourceURL()) } // IsNodeComponentPowerModelEnabled returns if the estimator has been enabled or not diff --git a/pkg/model/node_platform_energy.go b/pkg/model/node_platform_energy.go index 7d772427f7..363974ada8 100644 --- a/pkg/model/node_platform_energy.go +++ b/pkg/model/node_platform_energy.go @@ -51,10 +51,13 @@ func CreateNodePlatformPowerEstimatorModel(nodeFeatureNames, systemMetaDataFeatu var err error nodePlatformPowerModel, err = createPowerModelEstimator(modelConfig) if err != nil { - klog.Errorf("Failed to create %s/%s Model to estimate Node Platform Power: %v", 
modelConfig.ModelType, modelConfig.ModelOutputType, err) + klog.Errorf("Failed to create %s/%s Model from %s to estimate Node Platform Power: %v", + modelConfig.ModelType, modelConfig.ModelOutputType, + modelConfig.SourceURL(), err) return } - klog.V(1).Infof("Using the %s/%s Model to estimate Node Platform Power", modelConfig.ModelType, modelConfig.ModelOutputType) + klog.V(1).Infof("Using the %s/%s Model from %s to estimate Node Platform Power", + modelConfig.ModelType, modelConfig.ModelOutputType, modelConfig.SourceURL()) } // IsNodePlatformPowerModelEnabled returns if the estimator has been enabled or not diff --git a/pkg/model/types/types.go b/pkg/model/types/types.go index a74f0053f1..51b5a215ee 100644 --- a/pkg/model/types/types.go +++ b/pkg/model/types/types.go @@ -16,8 +16,10 @@ limitations under the License. package types -type ModelType int -type ModelOutputType int +type ( + ModelType int + ModelOutputType int +) const ( // Power Model types @@ -25,6 +27,7 @@ const ( Regressor // estimation happens within kepler, but pre-trained model parameters are downloaded externally EstimatorSidecar // estimation happens in the sidecar with a loaded pre-trained power model ) + const ( // Power Model Output types // Absolute Power Model (AbsPower): is the power model trained by measured power (including the idle power) @@ -33,6 +36,7 @@ const ( DynPower Unsupported ) + const ( // Define energy source PlatformEnergySource = "acpi" @@ -99,3 +103,11 @@ type ModelConfig struct { SystemMetaDataFeatureNames []string SystemMetaDataFeatureValues []string } + +func (c *ModelConfig) SourceURL() string { + if c.InitModelURL != "" { + return c.InitModelURL + } + + return c.InitModelFilepath +} From e54a8e6f8bb6d72706fb23397bb358b0fc308b54 Mon Sep 17 00:00:00 2001 From: Sunil Thaha Date: Thu, 29 Aug 2024 10:46:17 +1000 Subject: [PATCH 2/6] chore(models): reformat json Signed-off-by: Sunil Thaha --- data/model_weight/acpi_AbsPowerModel.json | 18 +++++- 
data/model_weight/acpi_DynPowerModel.json | 18 +++++- .../intel_rapl_AbsPowerModel.json | 60 ++++++++++++++++++- .../intel_rapl_DynPowerModel.json | 60 ++++++++++++++++++- 4 files changed, 152 insertions(+), 4 deletions(-) diff --git a/data/model_weight/acpi_AbsPowerModel.json b/data/model_weight/acpi_AbsPowerModel.json index 8f74af34d6..dd4db5a7cd 100644 --- a/data/model_weight/acpi_AbsPowerModel.json +++ b/data/model_weight/acpi_AbsPowerModel.json @@ -1 +1,17 @@ -{"model_name": "SGDRegressorTrainer_0", "platform": {"All_Weights": {"Bias_Weight": 220.9079278650894, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 29.028228361462897}}}}} +{ + "model_name": "SGDRegressorTrainer_0", + "platform": { + "All_Weights": { + "Bias_Weight": 220.9079278650894, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 29.028228361462897 + } + } + } + } +} diff --git a/data/model_weight/acpi_DynPowerModel.json b/data/model_weight/acpi_DynPowerModel.json index 8149bb4456..25bdb48a06 100644 --- a/data/model_weight/acpi_DynPowerModel.json +++ b/data/model_weight/acpi_DynPowerModel.json @@ -1 +1,17 @@ -{"model_name": "SGDRegressorTrainer_0", "platform": {"All_Weights": {"Bias_Weight": 49.56491877218095, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 28.501356366108837}}}}} +{ + "model_name": "SGDRegressorTrainer_0", + "platform": { + "All_Weights": { + "Bias_Weight": 49.56491877218095, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 28.501356366108837 + } + } + } + } +} diff --git a/data/model_weight/intel_rapl_AbsPowerModel.json b/data/model_weight/intel_rapl_AbsPowerModel.json index c77f702950..edef702192 
100644 --- a/data/model_weight/intel_rapl_AbsPowerModel.json +++ b/data/model_weight/intel_rapl_AbsPowerModel.json @@ -1 +1,59 @@ -{"model_name": "SGDRegressorTrainer_0", "package": {"All_Weights": {"Bias_Weight": 69.91739430907396, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 22.16772409328642}}}}, "core": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "uncore": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "dram": {"All_Weights": {"Bias_Weight": 47.142633336743344, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 3.57348245077466}}}}} +{ + "model_name": "SGDRegressorTrainer_0", + "package": { + "All_Weights": { + "Bias_Weight": 69.91739430907396, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 22.16772409328642 + } + } + } + }, + "core": { + "All_Weights": { + "Bias_Weight": 0.0, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 0.0 + } + } + } + }, + "uncore": { + "All_Weights": { + "Bias_Weight": 0.0, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 0.0 + } + } + } + }, + "dram": { + "All_Weights": { + "Bias_Weight": 47.142633336743344, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 3.57348245077466 + } + } 
+ } + } +} diff --git a/data/model_weight/intel_rapl_DynPowerModel.json b/data/model_weight/intel_rapl_DynPowerModel.json index 0ef3801fba..08f5bdf88b 100644 --- a/data/model_weight/intel_rapl_DynPowerModel.json +++ b/data/model_weight/intel_rapl_DynPowerModel.json @@ -1 +1,59 @@ -{"model_name": "SGDRegressorTrainer_0", "package": {"All_Weights": {"Bias_Weight": 38.856412561925055, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 22.258830113477515}}}}, "core": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "uncore": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "dram": {"All_Weights": {"Bias_Weight": 9.080889901856153, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 3.0358946796490924}}}}} +{ + "model_name": "SGDRegressorTrainer_0", + "package": { + "All_Weights": { + "Bias_Weight": 38.856412561925055, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 22.258830113477515 + } + } + } + }, + "core": { + "All_Weights": { + "Bias_Weight": 0.0, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 0.0 + } + } + } + }, + "uncore": { + "All_Weights": { + "Bias_Weight": 0.0, + "Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 0.0 + } + } + } + }, + "dram": { + "All_Weights": { + "Bias_Weight": 9.080889901856153, + 
"Categorical_Variables": {}, + "Numerical_Variables": { + "bpf_cpu_time_ms": { + "scale": 5911.969193263386, + "mean": 0, + "variance": 0, + "weight": 3.0358946796490924 + } + } + } + } +} From fcc8e0a14e847e3f48bacd32cb73138709b287a9 Mon Sep 17 00:00:00 2001 From: Sunil Thaha Date: Thu, 29 Aug 2024 10:48:22 +1000 Subject: [PATCH 3/6] feat(models): update intel-rapl abspower to 0.7.11 Signed-off-by: Sunil Thaha --- .../intel_rapl_AbsPowerModel.json | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/data/model_weight/intel_rapl_AbsPowerModel.json b/data/model_weight/intel_rapl_AbsPowerModel.json index edef702192..c3082992e1 100644 --- a/data/model_weight/intel_rapl_AbsPowerModel.json +++ b/data/model_weight/intel_rapl_AbsPowerModel.json @@ -2,14 +2,16 @@ "model_name": "SGDRegressorTrainer_0", "package": { "All_Weights": { - "Bias_Weight": 69.91739430907396, + "Bias_Weight": 143.81206324506658, "Categorical_Variables": {}, "Numerical_Variables": { "bpf_cpu_time_ms": { - "scale": 5911.969193263386, - "mean": 0, - "variance": 0, - "weight": 22.16772409328642 + "scale": 95877.00000000001, + "weight": 305.148172889668 + }, + "bpf_page_cache_hit": { + "scale": 1.0, + "weight": 0.0 } } } @@ -20,9 +22,11 @@ "Categorical_Variables": {}, "Numerical_Variables": { "bpf_cpu_time_ms": { - "scale": 5911.969193263386, - "mean": 0, - "variance": 0, + "scale": 95877.00000000001, + "weight": 0.0 + }, + "bpf_page_cache_hit": { + "scale": 1.0, "weight": 0.0 } } @@ -34,9 +38,11 @@ "Categorical_Variables": {}, "Numerical_Variables": { "bpf_cpu_time_ms": { - "scale": 5911.969193263386, - "mean": 0, - "variance": 0, + "scale": 95877.00000000001, + "weight": 0.0 + }, + "bpf_page_cache_hit": { + "scale": 1.0, "weight": 0.0 } } @@ -44,14 +50,16 @@ }, "dram": { "All_Weights": { - "Bias_Weight": 47.142633336743344, + "Bias_Weight": 18.616285480502686, "Categorical_Variables": {}, "Numerical_Variables": { "bpf_cpu_time_ms": { - "scale": 5911.969193263386, - 
"mean": 0, - "variance": 0, - "weight": 3.57348245077466 + "scale": 95877.00000000001, + "weight": 8.434209551249596 + }, + "bpf_page_cache_hit": { + "scale": 1.0, + "weight": 0.0 } } } From 7303454d6aeb8dc4d52735bbb24007849243d03f Mon Sep 17 00:00:00 2001 From: Sunil Thaha Date: Thu, 29 Aug 2024 11:10:33 +1000 Subject: [PATCH 4/6] feat(models): update acpi abspower to 0.7.11 Signed-off-by: Sunil Thaha --- data/model_weight/acpi_AbsPowerModel.json | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/data/model_weight/acpi_AbsPowerModel.json b/data/model_weight/acpi_AbsPowerModel.json index dd4db5a7cd..091dd31636 100644 --- a/data/model_weight/acpi_AbsPowerModel.json +++ b/data/model_weight/acpi_AbsPowerModel.json @@ -2,14 +2,16 @@ "model_name": "SGDRegressorTrainer_0", "platform": { "All_Weights": { - "Bias_Weight": 220.9079278650894, + "Bias_Weight": 63.732101547596734, "Categorical_Variables": {}, "Numerical_Variables": { "bpf_cpu_time_ms": { - "scale": 5911.969193263386, - "mean": 0, - "variance": 0, - "weight": 29.028228361462897 + "scale": 3991.690709751875, + "weight": 60.735455390707436 + }, + "bpf_page_cache_hit": { + "scale": 1.0, + "weight": 0.0 } } } From ecd5f545479c62fc3408565e6bc9f747ea530e90 Mon Sep 17 00:00:00 2001 From: Sunil Thaha Date: Thu, 29 Aug 2024 11:11:54 +1000 Subject: [PATCH 5/6] feat(models): update acpi dyn to 0.7.11 Signed-off-by: Sunil Thaha --- data/model_weight/acpi_DynPowerModel.json | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/data/model_weight/acpi_DynPowerModel.json b/data/model_weight/acpi_DynPowerModel.json index 25bdb48a06..be8c659fe1 100644 --- a/data/model_weight/acpi_DynPowerModel.json +++ b/data/model_weight/acpi_DynPowerModel.json @@ -2,14 +2,16 @@ "model_name": "SGDRegressorTrainer_0", "platform": { "All_Weights": { - "Bias_Weight": 49.56491877218095, + "Bias_Weight": 2.2983085534792616, "Categorical_Variables": {}, "Numerical_Variables": { "bpf_cpu_time_ms": { 
- "scale": 5911.969193263386, - "mean": 0, - "variance": 0, - "weight": 28.501356366108837 + "scale": 3991.690709751875, + "weight": 59.30618884592585 + }, + "bpf_page_cache_hit": { + "scale": 1.0, + "weight": 0.0 } } } From 34d27b811248f58a3dbdd8e14caafcab47a5f643 Mon Sep 17 00:00:00 2001 From: Sunil Thaha Date: Thu, 29 Aug 2024 12:31:29 +1000 Subject: [PATCH 6/6] fix: do not probe for power-meters when disabled Previously, when DISABLE_POWER_METER was set, kepler would still probe the system for power-meters, causing kepler_node_info to report incorrect values for components_power_source and platform_power_source. E.g. ``` kepler_node_info{ components_power_source="rapl-sysfs", cpu_architecture="Skylake", instance="kepler-latest:8888", job="latest", platform_power_source="acpi", source="os" } ``` This commit skips the probing when power-meters are disabled and uses the fake power-meters instead (the estimator for components and the dummy source for platform), so that kepler_node_info now shows ``` kepler_node_info{components_power_source="estimator", cpu_architecture="Skylake", instance="kepler-dev:8888", job="dev", platform_power_source="none", source="os" } ``` Signed-off-by: Sunil Thaha --- pkg/sensors/components/power.go | 6 ++++++ pkg/sensors/platform/power.go | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pkg/sensors/components/power.go b/pkg/sensors/components/power.go index 9dc227ddb1..291a08504f 100644 --- a/pkg/sensors/components/power.go +++ b/pkg/sensors/components/power.go @@ -48,6 +48,12 @@ var ( ) func InitPowerImpl() { + if !enabled { + klog.V(1).Infoln("System power collection is disabled, using estimate method") + powerImpl = &source.PowerEstimate{} + return + } + sysfsImpl := &source.PowerSysfs{} if sysfsImpl.IsSystemCollectionSupported() /*&& false*/ { klog.V(1).Infoln("use sysfs to obtain power") diff --git a/pkg/sensors/platform/power.go b/pkg/sensors/platform/power.go index 18b8bdf1bc..bae186eeaf 100644 --- a/pkg/sensors/platform/power.go +++ b/pkg/sensors/platform/power.go @@ -37,8 +37,7 @@ type powerInterface interface { } //
dummy satisfies the powerInterface and can be used as the default NOP source -type dummy struct { -} +type dummy struct{} func (dummy) GetName() string { return "none" @@ -47,6 +46,7 @@ func (dummy) GetName() string { func (dummy) IsSystemCollectionSupported() bool { return false } + func (dummy) StopPower() { } @@ -60,6 +60,12 @@ var ( ) func InitPowerImpl() { + if !enabled { + klog.V(1).Infoln("System power collection is disabled, using dummy method") + powerImpl = &dummy{} + return + } + // switch the platform power collector source to hmc if the system architecture is s390x // TODO: add redfish or ipmi as well. if runtime.GOARCH == "s390x" {