diff --git a/pkg/observer/controller.go b/pkg/observer/controller.go index cf7f30a..c622da6 100644 --- a/pkg/observer/controller.go +++ b/pkg/observer/controller.go @@ -2,16 +2,11 @@ package observer import ( "context" - "errors" "net/http" "sort" - "strconv" "time" - "github.com/cenkalti/backoff/v4" - "github.com/prometheus/client_golang/api" "github.com/prometheus/client_golang/prometheus/promhttp" - "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -19,7 +14,6 @@ import ( v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1" "github.com/atlassian-labs/cyclops/pkg/generation" "github.com/atlassian-labs/cyclops/pkg/k8s" - promv1 "github.com/prometheus/client_golang/api/prometheus/v1" ) var apiVersion = "undefined" //nolint:golint,varcheck,deadcode,unused @@ -277,66 +271,6 @@ func (c *controller) dropInProgressNodeGroups(nodeGroups v1.NodeGroupList, cnrs return restingNodeGroups } -// get the cluster-autoscaler last scaleUp activity time -func stringToTime(s string) (time.Time, error) { - sec, err := strconv.ParseInt(s, 10, 64) - if err != nil { - return time.Time{}, err - } - return time.Unix(sec, 0), nil -} - -// query cluster-autoscaler metrics to figure out if it's safe to start a new CNR -func (c *controller) safeToStartCycle() bool { - client, err := api.NewClient(api.Config{ - Address: c.PrometheusAddress, - }) - if err != nil { - // Prometheus might not be installed in the cluster. return true if it can't connect - klog.Errorln("Error creating client:", err) - return true - } - - v1api := promv1.NewAPI(client) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - // scaleDown metric is updated every cycle cluster-autoscaler is checking if the cluster should scaleDown - // scaleDown does not get checked and therefore not updated when the cluster is scaling up since no check for scaleDown is needed - result, warnings, err := v1api.Query(ctx, "cluster_autoscaler_last_activity{activity='scaleDown'}", time.Now()) - if err != nil { - // cluster-autoscaler might not be installed in the cluster. return true if it can't find the metrics of run the query - klog.Errorln("Error querying Prometheus:", err) - return true - } - if len(warnings) > 0 { - klog.Errorln("Warnings:", warnings) - } - - v := result.(model.Vector) - // cluster-autoscaler should always gives a response if it's active - if v.Len() == 0 { - klog.Errorln("Empty response from prometheus") - return true - } - - scaleUpTime := v[v.Len()-1].Value.String() - t, err := stringToTime(scaleUpTime) - if err != nil { - klog.Errorln("Error converting the time:", err) - return false - } - - // cluster_autoscaler_last_activity values will update every PrometheusScrapeInterval in non-scaling scenario - lastScaleEvent := time.Since(t) - if lastScaleEvent > c.PrometheusScrapeInterval { - klog.Infoln("Scale up event recently happened") - return false - } - klog.V(3).Infoln("No scale up event") - - return true -} - // createCNRs generates and applies CNRs from the changedNodeGroups func (c *controller) createCNRs(changedNodeGroups []*ListedNodeGroups) { klog.V(3).Infoln("applying") @@ -371,26 +305,6 @@ func (c *controller) nextRunTime() time.Time { return time.Now().UTC().Add(c.CheckInterval) } -func (c *controller) checkIfSafeToStartCycle() bool { - b := backoff.NewExponentialBackOff() - b.MaxElapsedTime = 120 * time.Second - - err := backoff.Retry(func() error { - if !c.safeToStartCycle() { - klog.Error("Cluster autoscaler scaleUp event in progress. Retry...") - return errors.New("cluster-autoscaler event in progress") - } - return nil - }, b) - - if err != nil { - klog.Errorln("there are still cluster-autoscaler scaleUp events") - return false - } - - return true -} - // Run runs the controller loops once. detecting lock, changes, and applying CNRs // implements cron.Job interface func (c *controller) Run() { @@ -420,11 +334,6 @@ func (c *controller) Run() { } } - // query cluster-autoscaler to check if it's safe to start a new CNR - if !c.checkIfSafeToStartCycle() { - return - } - // wait for the desired amount to allow any in progress changes to batch up klog.V(3).Infof("waiting for %v to allow changes to settle", c.WaitInterval) select {