Commit

Add atlasScheduledAutoscaling (SRE-720)
r0zbot committed Nov 1, 2024
1 parent 14cf486 commit fe5239f
Showing 8 changed files with 324 additions and 18 deletions.
27 changes: 27 additions & 0 deletions api/v1alpha1/mongodbcluster_types.go
@@ -51,6 +51,33 @@ type MongoDBClusterSpec struct {

	// If this is set, along with useAtlasApi, all the Kubernetes nodes in the cluster will be added to the Atlas firewall. The only available value right now is "rancher-annotation", which uses the rke.cattle.io/external-ip annotation.
AtlasNodeIPAccessStrategy string `json:"atlasNodeIpAccessStrategy,omitempty"`

AtlasScheduledAutoscaling *AtlasScheduledAutoscaling `json:"atlasScheduledAutoscaling,omitempty"`
}

type AtlasScheduledAutoscaling struct {
	// If set to true, scheduled autoscaling is enabled for the cluster.
	// The cluster is scaled up to highTier at the time given by scaleUpExpression, and scaled down to lowTier at the time given by scaleDownExpression.
// +kubebuilder:default=false
Enabled bool `json:"enabled,omitempty"`

// The minimum tier the cluster can scale down to.
// +kubebuilder:validation:Enum=M0;M2;M5;M10;M20;M30;M40;M50;M60;M80;M140;M200;M300;M400;M500;M700;M900;M1000
// +kubebuilder:default="M20"
LowTier string `json:"lowTier,omitempty"`

// The maximum tier the cluster can scale up to.
// +kubebuilder:validation:Enum=M0;M2;M5;M10;M20;M30;M40;M50;M60;M80;M140;M200;M300;M400;M500;M700;M900;M1000
// +kubebuilder:default="M50"
HighTier string `json:"highTier,omitempty"`

// Cron expression for the time the cluster should be scaled down.
// +kubebuilder:default="0 20 * * 1-5"
ScaleDownExpression string `json:"scaleDownExpression,omitempty"`

// Cron expression for the time the cluster should be scaled up.
// +kubebuilder:default="0 6 * * 1-5"
ScaleUpExpression string `json:"scaleUpExpression,omitempty"`
}

// MongoDBClusterStatus defines the observed state of MongoDBCluster
22 changes: 21 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

66 changes: 66 additions & 0 deletions config/crd/bases/airlock.cloud.rocket.chat_mongodbclusters.yaml
@@ -47,6 +47,72 @@ spec:
available value right now is "rancher-annotation", which uses the
rke.cattle.io/external-ip annotation.
type: string
atlasScheduledAutoscaling:
properties:
enabled:
default: false
          description: If set to true, scheduled autoscaling is enabled for
            the cluster. The cluster is scaled up to highTier at the time
            given by scaleUpExpression, and scaled down to lowTier at the
            time given by scaleDownExpression.
type: boolean
highTier:
default: M50
description: The maximum tier the cluster can scale up to.
enum:
- M0
- M2
- M5
- M10
- M20
- M30
- M40
- M50
- M60
- M80
- M140
- M200
- M300
- M400
- M500
- M700
- M900
- M1000
type: string
lowTier:
default: M20
description: The minimum tier the cluster can scale down to.
enum:
- M0
- M2
- M5
- M10
- M20
- M30
- M40
- M50
- M60
- M80
- M140
- M200
- M300
- M400
- M500
- M700
- M900
- M1000
type: string
scaleDownExpression:
default: 0 20 * * 1-5
description: Cron expression for the time the cluster should be
scaled down.
type: string
scaleUpExpression:
default: 0 6 * * 1-5
description: Cron expression for the time the cluster should be
scaled up.
type: string
type: object
connectionSecret:
description: Secret in which Airlock will look for a ConnectionString
or Atlas credentials, that will be used to connect to the cluster.
20 changes: 18 additions & 2 deletions config/samples/airlock_v1alpha1_mongodbcluster.yaml
@@ -15,7 +15,7 @@ metadata:
name: teste-atlas1
spec:
# The host with port that clients will receive when requesting credentials.
hostTemplate: "cluster0.vpz0mct.mongodb.net"
hostTemplate: "cluster0.4h0sjun.mongodb.net"

# Secret in which Airlock will look for a ConnectionString or Atlas credentials, that will be used to connect to the cluster.
connectionSecret: airlock-atlas-connection
@@ -38,6 +38,22 @@ spec:
# Optional. If this is set, along with useAtlasApi, all the kubernetes nodes on the cluster will be added to the Atlas firewall. The only available value right now is "rancher-annotation", which uses the rke.cattle.io/external-ip annotation.
atlasNodeIpAccessStrategy: rancher-annotation

atlasScheduledAutoscaling:
# Whether the autoscaling is enabled or not. Defaults to false.
enabled: true

# The low tier the cluster will scale down to. Defaults to "M20".
lowTier: "M10"

# The high tier the cluster will scale up to. Defaults to "M50".
highTier: "M20"

# The cron expression that will be used to scale down the cluster. Defaults to "0 20 * * 1-5".
    scaleDownExpression: "0 22 * * 1-5"

# The cron expression that will be used to scale up the cluster. Defaults to "0 6 * * 1-5".
scaleUpExpression: "0 0 * * *"

---
apiVersion: v1
kind: Secret
@@ -47,7 +63,7 @@ metadata:
type: Opaque
stringData:
  # It should have enough privileges to manage users and access. It is not used by the created users.
connectionString: "mongodb://rcadmin:[email protected]/test?replicaSet=rs0"
connectionString: "mongodb://rcadmin:[email protected]/test?replicaSet=rs*"

---
apiVersion: v1
17 changes: 17 additions & 0 deletions controllers/common.go
@@ -1,6 +1,9 @@
package controllers

import (
"context"
"strings"

"github.com/mongodb-forks/digest"
"go.mongodb.org/atlas/mongodbatlas"
corev1 "k8s.io/api/core/v1"
@@ -44,3 +47,17 @@ func getAtlasClientFromSecret(secret *corev1.Secret) (*mongodbatlas.Client, stri

return client, atlasGroupID, nil
}

func getClusterNameFromHostTemplate(ctx context.Context, client *mongodbatlas.Client, groupID, hostTemplate string) (string, error) {
clusters, _, err := client.Clusters.List(ctx, groupID, &mongodbatlas.ListOptions{})
if err != nil {
return "", err
}
for _, cluster := range clusters {
if strings.Contains(cluster.SrvAddress, hostTemplate) {
return cluster.Name, nil
}
}

return "", errors.NewBadRequest("Cluster not found for when searching for it's connectionString in atlas")
}
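
For context, each Atlas cluster's SrvAddress field holds its SRV connection string (for example, mongodb+srv://cluster0.4h0sjun.mongodb.net), which is why a plain substring match against the host template is enough to resolve the cluster name. Below is a minimal, hypothetical usage sketch of this helper; the API keys, project ID, and host template are placeholders, and it assumes the helper is in scope:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/mongodb-forks/digest"
	"go.mongodb.org/atlas/mongodbatlas"
)

func main() {
	// Placeholder Atlas programmatic API keys; the controller builds this
	// client from the connection secret via getAtlasClientFromSecret.
	transport := digest.NewTransport("public-key", "private-key")
	httpClient, err := transport.Client()
	if err != nil {
		log.Fatal(err)
	}
	client := mongodbatlas.NewClient(httpClient)

	// "group-id" stands in for the Atlas project ID. The helper lists the
	// project's clusters and matches the host template against SrvAddress.
	name, err := getClusterNameFromHostTemplate(
		context.Background(), client, "group-id", "cluster0.4h0sjun.mongodb.net")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("resolved Atlas cluster:", name)
}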
146 changes: 146 additions & 0 deletions controllers/mongodbcluster_controller.go
@@ -23,6 +23,7 @@ import (
"strings"
"time"

"github.com/go-co-op/gocron/v2"

[GitHub Actions lint failure at line 26: could not import github.com/go-co-op/gocron/v2]
"github.com/go-logr/logr"
"go.mongodb.org/atlas/mongodbatlas"
"go.mongodb.org/mongo-driver/bson"
@@ -53,6 +54,7 @@ import (
type MongoDBClusterReconciler struct {
client.Client
Scheme *runtime.Scheme
gocron.Scheduler

[GitHub Actions lint failure at line 57: undeclared name: gocron (typecheck)]
}

//+kubebuilder:rbac:groups=airlock.cloud.rocket.chat,resources=mongodbclusters,verbs=get;list;watch;create;update;patch;delete
@@ -141,6 +143,22 @@ func (r *MongoDBClusterReconciler) Reconcile(ctx context.Context, req ctrl.Reque
return ctrl.Result{}, utilerrors.NewAggregate([]error{err, r.Status().Update(ctx, mongodbClusterCR)})
}
}

		// Reconcile scheduled autoscaling
err = r.reconcileAtlasScheduledAutoscaling(ctx, mongodbClusterCR, secret)
if err != nil {
meta.SetStatusCondition(&mongodbClusterCR.Status.Conditions,
metav1.Condition{
Type: "Ready",
Status: metav1.ConditionFalse,
Reason: "AtlasScheduledAutoscalingFailed",
LastTransitionTime: metav1.NewTime(time.Now()),
Message: fmt.Sprintf("Failed to reconcile scheduled autoscaling: %s", err.Error()),
})

return ctrl.Result{}, utilerrors.NewAggregate([]error{err, r.Status().Update(ctx, mongodbClusterCR)})
}

} else {
err = testMongoConnection(ctx, mongodbClusterCR, secret)
if err != nil {
@@ -210,6 +228,17 @@ func (r *MongoDBClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
return err
}

{
var err error
r.Scheduler, err = gocron.NewScheduler()

[GitHub Actions lint failure at line 233: undeclared name: gocron (typecheck)]
if err != nil {
ctrl.Log.WithName("controllers").WithName("MongoDBCluster").V(1).Error(err, "Error creating scheduler")
return err
}

r.Scheduler.Start()
}

return ctrl.NewControllerManagedBy(mgr).
For(&airlockv1alpha1.MongoDBCluster{}).
Watches(
@@ -472,3 +501,120 @@ func (r *MongoDBClusterReconciler) reconcileAtlasFirewall(ctx context.Context, m

return nil
}

func (r *MongoDBClusterReconciler) reconcileAtlasScheduledAutoscaling(ctx context.Context, mongodbClusterCR *airlockv1alpha1.MongoDBCluster, secret *corev1.Secret) error {
logger := log.FromContext(ctx)

scheduledAutoscaling := mongodbClusterCR.Spec.AtlasScheduledAutoscaling

if scheduledAutoscaling != nil && scheduledAutoscaling.Enabled {

var foundUp gocron.Job

[GitHub Actions lint failure at line 512: undeclared name: gocron (typecheck)]
var foundDown gocron.Job

		// A nil or empty job list is normal on the first reconcile, so just scan whatever is there.
		for _, job := range r.Scheduler.Jobs() {
			tags := job.Tags()
			if len(tags) < 3 || tags[0] != mongodbClusterCR.Name {
				continue
			}
			switch tags[2] {
			case "up":
				foundUp = job
			case "down":
				foundDown = job
			}
		}

		// NOTE: this client is captured by the scheduled tasks below. It is built from static
		// digest API-key credentials rather than a session token, so it is assumed not to
		// expire; that assumption has not been verified.
client, atlasGroupID, err := getAtlasClientFromSecret(secret)
if err != nil {
logger.Error(err, "Couldn't get a client for Atlas")
return err
}

clusterName, err := getClusterNameFromHostTemplate(ctx, client, atlasGroupID, mongodbClusterCR.Spec.HostTemplate)
if err != nil {
logger.Error(err, "Couldn't find cluster in Atlas")
return err
}

		clusterDetails, response, err := client.Clusters.Get(ctx, atlasGroupID, clusterName)
		if err != nil || response.StatusCode != http.StatusOK {
			if err == nil {
				err = fmt.Errorf("unexpected status %d getting cluster details from Atlas", response.StatusCode)
			}
			logger.Error(err, "Couldn't get cluster details from Atlas")
			return err
		}

if foundDown == nil || foundDown.Tags()[1] != scheduledAutoscaling.ScaleDownExpression+scheduledAutoscaling.LowTier {

			if foundDown != nil {
				logger.Info("Removing outdated downscaling job for " + mongodbClusterCR.Name)
				if err := r.Scheduler.RemoveJob(foundDown.ID()); err != nil {
					logger.Error(err, "Couldn't remove outdated downscaling job for "+mongodbClusterCR.Name)
				}
			}

logger.Info("Creating scheduled downscaling job for " + mongodbClusterCR.Name + " with expression " + scheduledAutoscaling.ScaleDownExpression + " to " + scheduledAutoscaling.LowTier)
			_, err = r.Scheduler.NewJob(
gocron.CronJob(scheduledAutoscaling.ScaleDownExpression, false),
gocron.NewTask(
func() error {
logger.Info("Scaling down " + mongodbClusterCR.Name + " to " + scheduledAutoscaling.LowTier)

_, response, err := client.Clusters.Update(ctx, atlasGroupID, clusterName, &mongodbatlas.Cluster{
ProviderSettings: &mongodbatlas.ProviderSettings{
ProviderName: "AWS",
InstanceSizeName: scheduledAutoscaling.LowTier,
RegionName: clusterDetails.ProviderSettings.RegionName,
},
})

					if err != nil || response.StatusCode != http.StatusOK {
						if err == nil {
							err = fmt.Errorf("unexpected status %d from Atlas", response.StatusCode)
						}
						logger.Error(err, "Couldn't scale down "+mongodbClusterCR.Name)
						// TODO: Flip a metric so we can alert on this. This one is a warning.
						return err
					}
return nil
},
),
gocron.WithTags(mongodbClusterCR.Name, scheduledAutoscaling.ScaleDownExpression+scheduledAutoscaling.LowTier, "down"),
			)
			if err != nil {
				logger.Error(err, "Couldn't create scheduled downscaling job for "+mongodbClusterCR.Name)
				return err
			}
		}

if foundUp == nil || foundUp.Tags()[1] != scheduledAutoscaling.ScaleUpExpression+scheduledAutoscaling.HighTier {

			if foundUp != nil {
				logger.Info("Removing outdated upscaling job for " + mongodbClusterCR.Name)
				if err := r.Scheduler.RemoveJob(foundUp.ID()); err != nil {
					logger.Error(err, "Couldn't remove outdated upscaling job for "+mongodbClusterCR.Name)
				}
			}

logger.Info("Creating scheduled upscaling job for " + mongodbClusterCR.Name + " with expression " + scheduledAutoscaling.ScaleUpExpression + " to " + scheduledAutoscaling.HighTier)

			_, err = r.Scheduler.NewJob(
gocron.CronJob(scheduledAutoscaling.ScaleUpExpression, false),
gocron.NewTask(
func() error {
logger.Info("Scaling up " + mongodbClusterCR.Name + " to " + scheduledAutoscaling.HighTier)

_, response, err := client.Clusters.Update(ctx, atlasGroupID, clusterName, &mongodbatlas.Cluster{
ProviderSettings: &mongodbatlas.ProviderSettings{
ProviderName: "AWS",
InstanceSizeName: scheduledAutoscaling.HighTier,
RegionName: clusterDetails.ProviderSettings.RegionName,
},
})

					if err != nil || response.StatusCode != http.StatusOK {
						if err == nil {
							err = fmt.Errorf("unexpected status %d from Atlas", response.StatusCode)
						}
						logger.Error(err, "Couldn't scale up "+mongodbClusterCR.Name)
						// TODO: Flip a metric so we can alert on this. If this fails, it's critical.
						return err
					}
return nil
},
),
gocron.WithTags(mongodbClusterCR.Name, scheduledAutoscaling.ScaleUpExpression+scheduledAutoscaling.HighTier, "up"),
			)
			if err != nil {
				logger.Error(err, "Couldn't create scheduled upscaling job for "+mongodbClusterCR.Name)
				return err
			}
		}

} else {
r.Scheduler.RemoveByTags(mongodbClusterCR.Name)
}
return nil
}
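
For reference, a minimal, self-contained sketch of the gocron v2 pattern the reconciler relies on: each job is tagged [cluster name, cron expression + tier, direction], later reconciles look jobs up by tag instead of holding references (a tag mismatch marks a stale job to replace), and RemoveByTags drops everything a cluster owns when autoscaling is disabled. All values below are illustrative, not the controller's:

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/go-co-op/gocron/v2"
)

func main() {
	scheduler, err := gocron.NewScheduler()
	if err != nil {
		log.Fatal(err)
	}
	scheduler.Start()
	defer func() { _ = scheduler.Shutdown() }()

	// Tag layout mirrors the reconciler: owner name, spec fingerprint, direction.
	_, err = scheduler.NewJob(
		gocron.CronJob("* * * * *", false), // standard 5-field cron, no seconds field
		gocron.NewTask(func() { fmt.Println("scale tick") }),
		gocron.WithTags("example-cluster", "0 20 * * 1-5M20", "down"),
	)
	if err != nil {
		log.Fatal(err)
	}

	// A later pass can recognize an existing job purely from its tags.
	for _, job := range scheduler.Jobs() {
		if tags := job.Tags(); len(tags) == 3 && tags[0] == "example-cluster" {
			fmt.Println("found", tags[2], "job", job.ID())
		}
	}

	// Disabling autoscaling removes every job owned by the cluster at once.
	scheduler.RemoveByTags("example-cluster")

	time.Sleep(2 * time.Second)
}

Fingerprinting the job with the cron expression plus tier means any spec change shows up as a tag mismatch, so stale jobs can be detected and replaced without persisting job IDs anywhere.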
