Skip to content

Commit

Permalink
Support local xgboost regressor (#1052)
Browse files Browse the repository at this point in the history
* regressor: support local xgboost regressor

Signed-off-by: Huamin Chen <[email protected]>

* fix lint

Signed-off-by: Huamin Chen <[email protected]>

* update CI

Signed-off-by: Huamin Chen <[email protected]>

* use new xgboost model

Signed-off-by: Huamin Chen <[email protected]>

* use xgboost regressor

Signed-off-by: Huamin Chen <[email protected]>

---------

Signed-off-by: Huamin Chen <[email protected]>
  • Loading branch information
rootfs authored Nov 24, 2023
1 parent 43e2b28 commit c0d6a3f
Show file tree
Hide file tree
Showing 13 changed files with 501 additions and 84 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/developer_local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
go-version-file: go.mod
- name: Prepare environment
run: |
brew install cpuid
brew install cpuid xgboost
cd doc/ && sudo ./dev/prepare_dev_env.sh && cd -
git config --global --add safe.directory /kepler
- name: Run
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/golang.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,12 @@ jobs:
with:
go-version-file: go.mod
- name: Run go vet
run: go vet ./...
run: |
wget https://github.com/sustainable-computing-io/kepler-ci-artifacts/releases/download/v0.26.0/xgboost-2.0.1-Linux.sh.tar.gz
tar -zxvf xgboost-2.0.1-Linux.sh.tar.gz
sudo sh xgboost-2.0.1-Linux.sh --skip-license --prefix=/usr/local
sudo ldconfig
go vet ./...
vulnerability_detect:
runs-on: ubuntu-latest
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/unit_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ jobs:
uses: sustainable-computing-io/[email protected]
with:
ebpfprovider: bcc
xgboost_version: 2.0.1
- name: Prepare environment
run: |
sudo apt-get install -y cpuid
Expand Down
14 changes: 7 additions & 7 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,13 @@ var (
ProcessComponentsPowerKey = "PROCESS_COMPONENTS"

// model_parameter_attribute
RatioEnabledKey = "RATIO" // the default container power model is RATIO but ESTIMATOR or LINEAR_REGRESSION can be used
EstimatorEnabledKey = "ESTIMATOR"
LinearRegressionEnabledKey = "LINEAR_REGRESSION"
InitModelURLKey = "INIT_URL"
FixedTrainerNameKey = "TRAINER"
FixedNodeTypeKey = "NODE_TYPE"
ModelFiltersKey = "FILTERS"
RatioEnabledKey = "RATIO" // the default container power model is RATIO but ESTIMATOR or LOCAL_REGRESSION can be used
EstimatorEnabledKey = "ESTIMATOR"
LocalRegressorEnabledKey = "LOCAL_REGRESSION"
InitModelURLKey = "INIT_URL"
FixedTrainerNameKey = "TRAINER"
FixedNodeTypeKey = "NODE_TYPE"
ModelFiltersKey = "FILTERS"
////////////////////////////////////

// KubeConfig is used to start k8s client with the pod running outside the cluster
Expand Down
132 changes: 89 additions & 43 deletions pkg/model/estimator/local/lr.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,26 +59,34 @@ type ModelRequest struct {
/*
ModelWeights, AllWeight, CategoricalFeature, NormalizedNumericalFeature define structure of model weight
{
"All_Weights":
"All_Weights":
{
"Bias_Weight": 1.0,
"Categorical_Variables": {"cpu_architecture": {"Sky Lake": {"weight": 1.0}}},
"Numerical_Variables": {"cpu_cycles": {"mean": 0, "variance": 1.0, "weight": 1.0}}
}
}
},
"XGBoost_Weights": "base64_encoded_dmlc_xgboost_model_json"
}
*/

// ModelWeights is the top-level weight document of a single model.
// It embeds AllWeights, which is read from the "All_Weights" JSON key.
type ModelWeights struct {
AllWeights `json:"All_Weights"`
}

// AllWeights holds the weights of one model as decoded from JSON, together
// with the state used to evaluate it at prediction time.
type AllWeights struct {
// BiasWeight is the constant term; predict uses it as the starting value of the base power.
BiasWeight float64 `json:"Bias_Weight"`
// CategoricalVariables maps feature name -> category value -> weight,
// e.g. {"cpu_architecture": {"Sky Lake": {"weight": 1.0}}}.
CategoricalVariables map[string]map[string]CategoricalFeature `json:"Categorical_Variables"`
// NumericalVariables maps a numerical feature name to its normalization and weight parameters.
NumericalVariables map[string]NormalizedNumericalFeature `json:"Numerical_Variables"`
// XGBoostWeight is the serialized XGBoost model; when non-empty, initModelWeight
// passes it to XGBoostModel.LoadFromBuffer.
XGBoostWeight string `json:"XGBoost_Weights"`
// XGBoostModel carries no JSON tag; it is loaded by initModelWeight and queried by predict.
XGBoostModel XGBoostModelWeight
// RegressorType selects which regression branch predict executes.
RegressorType types.RegressorType
}

// CategoricalFeature is the weight attached to one value of a categorical feature.
// predict adds the Weight of the matching category value to the base power.
type CategoricalFeature struct {
Weight float64 `json:"weight"`
}

type NormalizedNumericalFeature struct {
Scale float64 `json:"scale"` // to normalize the data
Weight float64 `json:"weight"`
Expand All @@ -96,24 +104,45 @@ func (weights ModelWeights) getIndexedWeights(usageMetrics, systemFeatures []str
return
}

// predict applies normalization and linear regression to usageMetricValues and systemMetaDataFeatureValues
// predict applies normalization and local regression to usageMetricValues and systemMetaDataFeatureValues
func (weights ModelWeights) predict(usageMetricNames []string, usageMetricValues [][]float64, systemMetaDataFeatureNames, systemMetaDataFeatureValues []string) []float64 {
categoricalWeights, numericalWeights := weights.getIndexedWeights(usageMetricNames, systemMetaDataFeatureNames)
basePower := weights.AllWeights.BiasWeight
for index, coeffMap := range categoricalWeights {
basePower += coeffMap[systemMetaDataFeatureValues[index]].Weight
}
var powers []float64
for _, vals := range usageMetricValues {
power := basePower
for index, coeff := range numericalWeights {
if coeff.Weight == 0 {
continue
switch weights.RegressorType {
case types.LinearRegressor:
for _, vals := range usageMetricValues {
power := basePower
for index, coeff := range numericalWeights {
if coeff.Weight == 0 {
continue
}
normalizedX := vals[index] / coeff.Scale
power += coeff.Weight * normalizedX
}
normalizedX := vals[index] / coeff.Scale
power += coeff.Weight * normalizedX
powers = append(powers, power)
}
powers = append(powers, power)
case types.XGBoostRegressor:
for _, vals := range usageMetricValues {
data := make([]float32, len(vals))
for index, coeff := range numericalWeights {
if coeff.Weight == 0 {
continue
}
data[index] = float32(vals[index] / coeff.Scale)
}
power, err := weights.XGBoostModel.PredictFromData(data)
if err != nil {
klog.Errorf("XGBoostModel.PredictFromData failed: %v", err)
return []float64{}
}
powers = append(powers, power[0])
}
default:
klog.Errorf("RegressorType %v is not supported", weights.RegressorType)
}
return powers
}
Expand Down Expand Up @@ -141,10 +170,11 @@ ComponentModelWeights defines structure for multiple (power component's) weights
}
}
*/

// ComponentModelWeights maps a power component's name to the ModelWeights used for it.
type ComponentModelWeights map[string]ModelWeights

// LinearRegressor defines power estimator with linear regression approach
type LinearRegressor struct {
// LocalRegressor defines power estimator with regression approach
type LocalRegressor struct {
ModelServerEndpoint string
OutputType types.ModelOutputType
EnergySource string
Expand All @@ -168,8 +198,25 @@ type LinearRegressor struct {
modelWeight *ComponentModelWeights
}

// initModelWeight installs content as the regressor's model weights and
// prepares every component model for prediction.
//
// For each component, a non-empty XGBoostWeight is loaded into XGBoostModel and
// the component is marked as an XGBoost regressor; otherwise it is marked as a
// linear regressor. The updated value is written back because ranging over the
// map yields copies.
//
// It returns an error if content is nil or if any XGBoost model fails to load.
func (r *LocalRegressor) initModelWeight(content *ComponentModelWeights) error {
	if content == nil {
		return fmt.Errorf("model weight is nil")
	}
	r.modelWeight = content
	for k, v := range *r.modelWeight {
		if v.XGBoostWeight != "" {
			if err := v.XGBoostModel.LoadFromBuffer(v.XGBoostWeight); err != nil {
				return fmt.Errorf("failed to load %v xgboost model: %w", k, err)
			}
			// Set the type explicitly rather than relying on the zero value of RegressorType.
			v.RegressorType = types.XGBoostRegressor
		} else {
			v.RegressorType = types.LinearRegressor
		}
		(*r.modelWeight)[k] = v
	}
	return nil
}

// Start returns nil if model weight is obtainable
func (r *LinearRegressor) Start() error {
func (r *LocalRegressor) Start() error {
var err error
var weight *ComponentModelWeights
outputStr := r.OutputType.String()
Expand All @@ -186,8 +233,7 @@ func (r *LinearRegressor) Start() error {
}
if weight != nil {
r.enabled = true
r.modelWeight = weight
return nil
return r.initModelWeight(weight)
} else {
if err == nil {
err = fmt.Errorf("the model LR (%s): has no config", outputStr)
Expand All @@ -198,7 +244,7 @@ func (r *LinearRegressor) Start() error {
}

// getWeightFromServer tries getting weights for Kepler Model Server
func (r *LinearRegressor) getWeightFromServer() (*ComponentModelWeights, error) {
func (r *LocalRegressor) getWeightFromServer() (*ComponentModelWeights, error) {
modelRequest := ModelRequest{
MetricNames: append(r.FloatFeatureNames, r.SystemMetaDataFeatureNames...),
OutputType: r.OutputType.String(),
Expand Down Expand Up @@ -242,7 +288,7 @@ func (r *LinearRegressor) getWeightFromServer() (*ComponentModelWeights, error)

// loadWeightFromURLorLocal get weight from either local or URL
// if string start with '/', we take it as local file
func (r *LinearRegressor) loadWeightFromURLorLocal() (*ComponentModelWeights, error) {
func (r *LocalRegressor) loadWeightFromURLorLocal() (*ComponentModelWeights, error) {
var body []byte
var err error

Expand All @@ -262,7 +308,7 @@ func (r *LinearRegressor) loadWeightFromURLorLocal() (*ComponentModelWeights, er
}

// loadWeightFromLocal tries loading weights from local file given by r.ModelWeightsURL
func (r *LinearRegressor) loadWeightFromLocal() ([]byte, error) {
func (r *LocalRegressor) loadWeightFromLocal() ([]byte, error) {
data, err := os.ReadFile(r.ModelWeightsFilepath)
if err != nil {
return nil, err
Expand All @@ -271,7 +317,7 @@ func (r *LinearRegressor) loadWeightFromLocal() ([]byte, error) {
}

// loadWeightFromURL tries loading weights from initial model URL
func (r *LinearRegressor) loadWeightFromURL() ([]byte, error) {
func (r *LocalRegressor) loadWeightFromURL() ([]byte, error) {
if r.ModelWeightsURL == "" {
return nil, fmt.Errorf("ModelWeightsURL is empty")
}
Expand All @@ -295,7 +341,7 @@ func (r *LinearRegressor) loadWeightFromURL() ([]byte, error) {
}

// GetPlatformPower applies ModelWeight prediction and return a list of power associated to each process/container/pod
func (r *LinearRegressor) GetPlatformPower(isIdlePower bool) ([]float64, error) {
func (r *LocalRegressor) GetPlatformPower(isIdlePower bool) ([]float64, error) {
if !r.enabled {
return []float64{}, fmt.Errorf("disabled power model call: %s", r.OutputType.String())
}
Expand All @@ -316,7 +362,7 @@ func (r *LinearRegressor) GetPlatformPower(isIdlePower bool) ([]float64, error)
}

// GetComponentsPower applies each component's ModelWeight prediction and return a map of component power associated to each process/container/pod
func (r *LinearRegressor) GetComponentsPower(isIdlePower bool) ([]source.NodeComponentsEnergy, error) {
func (r *LocalRegressor) GetComponentsPower(isIdlePower bool) ([]source.NodeComponentsEnergy, error) {
if !r.enabled {
return []source.NodeComponentsEnergy{}, fmt.Errorf("disabled power model call: %s", r.OutputType.String())
}
Expand Down Expand Up @@ -349,11 +395,11 @@ func (r *LinearRegressor) GetComponentsPower(isIdlePower bool) ([]source.NodeCom
}

// GetGPUPower returns GPU Power in Watts associated to each process/container/pod
func (r *LinearRegressor) GetGPUPower(isIdlePower bool) ([]float64, error) {
func (r *LocalRegressor) GetGPUPower(isIdlePower bool) ([]float64, error) {
// Always fails: an empty slice and a fixed error are returned regardless of isIdlePower.
return []float64{}, fmt.Errorf("current power model does not support GPUs")
}

func (r *LinearRegressor) addFloatFeatureValues(x []float64) {
func (r *LocalRegressor) addFloatFeatureValues(x []float64) {
for i, feature := range x {
// floatFeatureValues is a cyclic list, where we only append a new value if it is necessary.
if r.xidx < len(r.floatFeatureValues) {
Expand All @@ -377,53 +423,53 @@ func (r *LinearRegressor) addFloatFeatureValues(x []float64) {
}

// AddContainerFeatureValues adds the x for prediction, which are the explanatory variables (or the independent variable) of regression.
// LinearRegressor is trained off-line then we cannot Add training samples. We might implement it in the future.
// The LinearRegressor does not differentiate node or container power estimation, the difference will only be the amount of resource utilization
func (r *LinearRegressor) AddContainerFeatureValues(x []float64) {
// LocalRegressor is trained off-line then we cannot Add training samples. We might implement it in the future.
// The LocalRegressor does not differentiate node or container power estimation, the difference will only be the amount of resource utilization
func (r *LocalRegressor) AddContainerFeatureValues(x []float64) {
// Container samples go through the same helper that AddNodeFeatureValues uses.
r.addFloatFeatureValues(x)
}

// AddNodeFeatureValues adds the x for prediction, which is the variable used to calculate the ratio.
// LinearRegressor is not trained, then we cannot Add training samples, only samples for prediction.
// The LinearRegressor does not differentiate node or container power estimation, the difference will only be the amount of resource utilization
func (r *LinearRegressor) AddNodeFeatureValues(x []float64) {
// LocalRegressor is not trained, then we cannot Add training samples, only samples for prediction.
// The LocalRegressor does not differentiate node or container power estimation, the difference will only be the amount of resource utilization
func (r *LocalRegressor) AddNodeFeatureValues(x []float64) {
// Node samples go through the same helper that AddContainerFeatureValues uses.
r.addFloatFeatureValues(x)
}

// AddDesiredOutValue adds the y, which is the response variable (or the dependent variable) of regression.
// LinearRegressor is trained off-line then we do not add Y for trainning. We might implement it in the future.
func (r *LinearRegressor) AddDesiredOutValue(y float64) {
// LocalRegressor is trained off-line then we do not add Y for training. We might implement it in the future.
func (r *LocalRegressor) AddDesiredOutValue(y float64) {
// Intentionally empty: y is ignored.
}

// ResetSampleIdx sets the sample vector index to 0 to overwrite the old samples with new ones for training or prediction.
func (r *LinearRegressor) ResetSampleIdx() {
func (r *LocalRegressor) ResetSampleIdx() {
// Only the index is rewound here; stored sample values are not touched by this method.
r.xidx = 0
}

// Train triggers the regression fit after adding data points to create a new power model.
// LinearRegressor is trained off-line then we cannot trigger the trainning. We might implement it in the future.
func (r *LinearRegressor) Train() error {
// LocalRegressor is trained off-line then we cannot trigger the training. We might implement it in the future.
func (r *LocalRegressor) Train() error {
// No-op: nothing is fitted and success is always reported.
return nil
}

// IsEnabled returns true if the power model was trained and is active
func (r *LinearRegressor) IsEnabled() bool {
func (r *LocalRegressor) IsEnabled() bool {
// Reports the enabled flag, which Start sets to true once a model weight has been obtained.
return r.enabled
}

// GetModelType returns the model type
func (r *LinearRegressor) GetModelType() types.ModelType {
return types.LinearRegressor
func (r *LocalRegressor) GetModelType() types.ModelType {
// Constant result: it does not depend on the RegressorType of the loaded weights.
return types.LocalRegressor
}

// GetContainerFeatureNamesList returns the list of float features that the model was configured to use
// The LinearRegressor does not differentiate node or container power estimation, the difference will only be the amount of resource utilization
func (r *LinearRegressor) GetContainerFeatureNamesList() []string {
// The LocalRegressor does not differentiate node or container power estimation, the difference will only be the amount of resource utilization
func (r *LocalRegressor) GetContainerFeatureNamesList() []string {
// Returns r.FloatFeatureNames itself, not a copy.
return r.FloatFeatureNames
}

// GetNodeFeatureNamesList returns the list of float features that the model was configured to use
// The LinearRegressor does not differentiate node or container power estimation, the difference will only be the amount of resource utilization
func (r *LinearRegressor) GetNodeFeatureNamesList() []string {
// The LocalRegressor does not differentiate node or container power estimation, the difference will only be the amount of resource utilization
func (r *LocalRegressor) GetNodeFeatureNamesList() []string {
// Returns r.FloatFeatureNames itself, not a copy.
return r.FloatFeatureNames
}
Loading

0 comments on commit c0d6a3f

Please sign in to comment.