Merge pull request #2 from meezaan/requests
Requests
meezaan authored Mar 24, 2021
2 parents 7461847 + a640b83 commit 1e3b18a
Showing 8 changed files with 681 additions and 268 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
-FROM islamicnetwork/php74:cli
+FROM islamicnetwork/php:8.0-cli

COPY . /autoscaler/

@@ -12,6 +12,8 @@ ENV LINODE_LKE_CLUSTER_POOL_MINIMUM_NODES "3"
ENV AUTOSCALE_TRIGGER "memory"
ENV AUTOSCALE_UP_PERCENTAGE "60"
ENV AUTOSCALE_DOWN_PERCENTAGE "40"
+ENV AUTOSCALE_RESOURCE_REQUEST_UP_PERCENTAGE "80"
+ENV AUTOSCALE_RESOURCE_REQUEST_DOWN_PERCENTAGE "70"
ENV AUTOSCALE_QUERY_INTERVAL "10"
ENV AUTOSCALE_THRESHOLD_COUNT "3"
ENV AUTOSCALE_NUMBER_OF_NODES "1"
16 changes: 11 additions & 5 deletions README.md
@@ -46,26 +46,30 @@ The docker container takes all its configuration via environment variables. Here
| AUTOSCALE_TRIGGER | 'cpu' or 'memory'
| AUTOSCALE_UP_PERCENTAGE | At what percentage of 'cpu' or 'memory' to scale up the node pool. Example: 65
| AUTOSCALE_DOWN_PERCENTAGE | At what percentage of 'cpu' or 'memory' to scale down the node pool. Example: 40
+| AUTOSCALE_RESOURCE_REQUEST_UP_PERCENTAGE | At what percentage of requested vs. available 'cpu' or 'memory' to scale up the cluster. Default: 80
+| AUTOSCALE_RESOURCE_REQUEST_DOWN_PERCENTAGE | At what percentage of requested vs. available 'cpu' or 'memory' to scale down the cluster. Default: 70
| AUTOSCALE_QUERY_INTERVAL | How many seconds to wait before each call to the Kubernetes API to check CPU and Memory usage. Example: 10
| AUTOSCALE_THRESHOLD_COUNT | After how many consecutive matches of AUTOSCALE_UP_PERCENTAGE or AUTOSCALE_DOWN_PERCENTAGE to scale the cluster up or down.
-| AUTOSCALE_NUMBER_OF_NODES | How many nodes to add or remove at one time when scaling the cluster. Example: 1 or 2 or 3 or N
+| AUTOSCALE_NUMBER_OF_NODES | How many nodes to add at one time when scaling the cluster. Example: 1 or 2 or 3 or N
| AUTOSCALE_WAIT_TIME_AFTER_SCALING | How many seconds to wait after scaling up or down before checking CPU and Memory again. This should be set to give the cluster enough time to adjust itself to the updated number of nodes. Example: 150

To understand the above, assume we have set the following values:
* AUTOSCALE_TRIGGER=memory
* AUTOSCALE_UP_PERCENTAGE=65
* AUTOSCALE_DOWN_PERCENTAGE=30
+* AUTOSCALE_RESOURCE_REQUEST_UP_PERCENTAGE=80
+* AUTOSCALE_RESOURCE_REQUEST_DOWN_PERCENTAGE=80
* AUTOSCALE_QUERY_INTERVAL=10
* AUTOSCALE_THRESHOLD_COUNT=3
-* AUTOSCALE_NUMBER_OF_NODES=1
+* AUTOSCALE_NUMBER_OF_NODES=2
* AUTOSCALE_WAIT_TIME_AFTER_SCALING=180

With this setup, the autoscaler utility will query the Kubernetes API every 10 seconds. If with 3 consecutive calls
-to the API (effectively meaning over 30 seconds), the memory usage is higher than 65%, 1 more node will be added to the
+to the API (effectively meaning over 30 seconds), the memory usage is higher than 65% or the requested memory exceeds 80% of the total memory available on the cluster, 2 more nodes will be added to the
specified node pool. The utility will wait for 180 seconds and then start querying the API every 10 seconds again.

-If with 3 consecutive calls to the API (effectively meaning over 30 seconds), the memory usage is lower than 30%,
-1 node will be removed from the specified node pool. The utility will wait for 180 seconds and then start
+If with 3 consecutive calls to the API (effectively meaning over 30 seconds), the memory usage is lower than 30% or the requested memory is below 80% of the total memory available on the cluster,
+1 node will be removed (**nodes are always removed one at a time to ensure you don't run out of capacity all of a sudden**) from the specified node pool. The utility will wait for 180 seconds and then start
querying the API every 10 seconds again.

## Usage
@@ -82,6 +86,8 @@ docker run -v ~/.kube/config:/root/.kube/config \
-e AUTOSCALE_TRIGGER='cpu' \
-e AUTOSCALE_UP_PERCENTAGE='60' \
-e AUTOSCALE_DOWN_PERCENTAGE='30' \
+-e AUTOSCALE_RESOURCE_REQUEST_UP_PERCENTAGE='70' \
+-e AUTOSCALE_RESOURCE_REQUEST_DOWN_PERCENTAGE='70' \
-e AUTOSCALE_QUERY_INTERVAL='10' \
-e AUTOSCALE_THRESHOLD_COUNT='3' \
-e AUTOSCALE_NUMBER_OF_NODES='1' \
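The scaling rules described in the README can be sketched as a small polling loop. The sketch below is illustrative Python, not the tool itself (which is written in PHP): `sample_metrics` and `scale_node_pool` are hypothetical stand-ins for the Kubernetes metrics query and the Linode LKE API call, and the defaults mirror the example values above.

```python
import time

def scale_decision(used_pct, requested_pct,
                   up_pct=65, down_pct=30,
                   request_up_pct=80, request_down_pct=80):
    """Classify one metrics sample as 'up', 'down', or 'hold'.

    Scale up when live utilisation OR the requested/available ratio
    breaches its upper threshold; scale down when either falls below
    its lower threshold (matching the README's description)."""
    if used_pct > up_pct or requested_pct > request_up_pct:
        return "up"
    if used_pct < down_pct or requested_pct < request_down_pct:
        return "down"
    return "hold"

def autoscale_loop(sample_metrics, scale_node_pool, interval=10,
                   threshold_count=3, nodes_to_add=2, wait_after_scaling=180):
    """Poll every `interval` seconds; act only after `threshold_count`
    consecutive identical breach decisions, then back off to let the
    cluster settle with its new node count."""
    streak, last = 0, "hold"
    while True:
        decision = scale_decision(*sample_metrics())
        streak = streak + 1 if decision == last else 1
        last = decision
        if decision != "hold" and streak >= threshold_count:
            # Nodes are added in batches but removed one at a time.
            scale_node_pool(nodes_to_add if decision == "up" else -1)
            streak, last = 0, "hold"
            time.sleep(wait_after_scaling)
        else:
            time.sleep(interval)
```

With the example configuration, three consecutive "up" samples (about 30 seconds) trigger a batch add, while three "down" samples remove a single node.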
17 changes: 14 additions & 3 deletions bin/autoscale
@@ -20,6 +20,8 @@ pcntl_signal(SIGINT, function () {
$autoscaleTrigger = getenv('AUTOSCALE_TRIGGER'); // memory or cpu for memory percentage or CPU percentage
$autoscaleUpAtUtilisationPercent = getenv('AUTOSCALE_UP_PERCENTAGE');
$autoscaleDownAtUtilisationPercent = getenv('AUTOSCALE_DOWN_PERCENTAGE');
+$autoscaleUpRequestPercentage = getenv('AUTOSCALE_RESOURCE_REQUEST_UP_PERCENTAGE');
+$autoscaleDownRequestPercentage = getenv('AUTOSCALE_RESOURCE_REQUEST_DOWN_PERCENTAGE');
$autoscaleQueryInterval = getenv('AUTOSCALE_QUERY_INTERVAL'); // Seconds. We'll call K8S after every interval to fetch node utilisation metrics
$autoscaleThresholdCount = getenv('AUTOSCALE_THRESHOLD_COUNT'); // Number of consecutive times the utilisation percentage should be greater than $autoscaleAtCpuUtilisationPercent or $autoscaleAtMemoryUtilisationPercent to autoscale.
$autoscaleNodesToAddOrRemovePerBreach = getenv('AUTOSCALE_NUMBER_OF_NODES');
@@ -63,8 +65,16 @@ while (true) {
}

$usedPercentage = $autoscaleTrigger == 'cpu' ? $nodes->getUsedCpuPercent() : $nodes->getUsedMemoryPercent();
-$scale = new Scale($autoscaleUpAtUtilisationPercent, $autoscaleDownAtUtilisationPercent, $usedPercentage);
-$logger->info(strtoupper($autoscaleTrigger) . ' Scale calculated', ['usedPercentage' => $usedPercentage, 'scaleUpPercentage' => $autoscaleUpAtUtilisationPercent, 'scaleDownPercentage' => $autoscaleDownAtUtilisationPercent]);
+$requestedPercentage = $autoscaleTrigger == 'cpu' ? $nodes->getRequestedCpuPercent() : $nodes->getRequestedMemoryPercent();
+$scale = new Scale($autoscaleUpAtUtilisationPercent, $autoscaleDownAtUtilisationPercent, $usedPercentage, $requestedPercentage, $autoscaleUpRequestPercentage, $autoscaleDownRequestPercentage);
+$logger->info(strtoupper($autoscaleTrigger) . ' Scale calculated', [
+    'usedPercentage' => $usedPercentage,
+    'scaleUpPercentage' => $autoscaleUpAtUtilisationPercent,
+    'scaleDownPercentage' => $autoscaleDownAtUtilisationPercent,
+    'requestedPercentage' => $requestedPercentage,
+    'scaleUpRequestPercentage' => $autoscaleUpRequestPercentage,
+    'scaleDownRequestPercentage' => $autoscaleDownRequestPercentage
+]);
if ($scale->scaleUp()) {
$logger->info('Scale Count: Up');
$counter->up();
@@ -91,7 +101,8 @@ while (true) {
if ($currentNodesInPool > $linodeClusterPoolMinimumNodeCount && ($currentNodesInPool - $autoscaleNodesToAddOrRemovePerBreach) >= $linodeClusterPoolMinimumNodeCount) {
$logger->alert('Current Nodes in LKE Pool: ' . $currentNodesInPool);
$logger->alert("Removing $autoscaleNodesToAddOrRemovePerBreach node(s)...");
-$linode->updateNodeCount($currentNodesInPool - $autoscaleNodesToAddOrRemovePerBreach);
+// Scale down only 1 node at a time, this is much safer than scaling down multiple nodes
+$linode->updateNodeCount($currentNodesInPool - 1);
sleep($autoscaleWaitTimeBetweenScaling);
}
$logger->alert("Skip downsizing cluster because we are already at the minimum number ($linodeClusterPoolMinimumNodeCount) of nodes or scaling down by $autoscaleNodesToAddOrRemovePerBreach will put us at less than the minimum number.");
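The scale-down guard in the diff above can be isolated as a small pure function. A minimal sketch under the same inputs as the PHP code; `next_node_count` is a hypothetical helper name, not part of the repository.

```python
def next_node_count(current, minimum, batch):
    """Mirror of the scale-down guard: downsize only when removing a full
    `batch` of nodes would still leave at least `minimum` in the pool,
    but remove just one node, since removal is deliberately one at a time."""
    if current > minimum and current - batch >= minimum:
        return current - 1  # matches updateNodeCount($currentNodesInPool - 1)
    return None  # at or near the minimum: skip downsizing
```

For example, a 5-node pool with a minimum of 3 shrinks to 4, while a 4-node pool with a batch of 2 is left untouched because a full batch removal would breach the minimum.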