Skip to content

Commit

Permalink
scaling logic
Browse files Browse the repository at this point in the history
  • Loading branch information
antoinefalisse committed May 13, 2024
1 parent 15cec94 commit 4fdf1da
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 7 deletions.
12 changes: 6 additions & 6 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import logging
import glob
import numpy as np
from utilsAPI import getAPIURL, getWorkerType, getASInstance
from utilsAPI import getAPIURL, getWorkerType, getASInstance, unprotect_current_instance
from utilsAuth import getToken
from utils import (getDataDirectory, checkTime, checkResourceUsage,
sendStatusEmail, checkForTrialsWithStatus)
Expand All @@ -30,7 +30,7 @@
# for removing AWS machine scale-in protection
t_lastTrial = time.localtime()
justProcessed = True
minutesBeforeShutdown = 5
minutesBeforeRemoveScaleInProtection = 5

while True:
# Run test trial at a given frequency to check status of machine. Stop machine if fails.
Expand All @@ -57,11 +57,11 @@
# when using autoscaling, we will remove the instance scale-in protection if it hasn't
# pulled a trial recently and there are no actively recording trials
if (autoScalingInstance and not justProcessed and
checkTime(t_lastTrial,minutesElapsed=minutesBeforeShutdown)):
if checkForTrialsWithStatus('recording',hours=2/60) == 0:
checkTime(t_lastTrial, minutesElapsed=minutesBeforeRemoveScaleInProtection)):
if checkForTrialsWithStatus('recording', hours=2/60) == 0:
# AWS CLI to remove machine scale-in protection
print('TODO placeholder: removing AWS machine scale-in protection.')
break
unprotect_current_instance()
time.sleep(3600)
else:
t_lastTrial = time.localtime()
if autoScalingInstance and justProcessed:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ pingouin==0.5.2
openpyxl
ffmpeg-python
psutil
boto3
75 changes: 74 additions & 1 deletion utilsAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
@author: suhlr
"""
import os
import boto3
import requests

from decouple import config
from datetime import datetime, timedelta

def getAPIURL():
if 'API_URL' not in globals():
Expand Down Expand Up @@ -51,4 +55,73 @@ def getASInstance():
print(f"Error occurred while checking ECS_CONTAINER_METADATA_FILE: {e}")
asInstance = False

return asInstance
return asInstance

def get_metric_average(namespace, metric_name, start_time, end_time, period):
"""
Fetch the average value of a specific metric from AWS CloudWatch.
Parameters:
- namespace (str): The namespace for the metric data.
- metric_name (str): The name of the metric.
- start_time (datetime): Start time for the data retrieval.
- end_time (datetime): End time for the data retrieval.
- period (int): The granularity, in seconds, of the data points returned.
"""
client = boto3.client('cloudwatch')
response = client.get_metric_statistics(
Namespace=namespace,
MetricName=metric_name,
StartTime=start_time,
EndTime=end_time,
Period=period,
Statistics=['Average'] # Correctly specifying 'Average' here
)
return response

def get_number_of_pending_trials(period=60):
# Time range setup for the last 1 minute
end_time = datetime.utcnow()
start_time = end_time - timedelta(minutes=1)

# Fetch the metric data
namespace = 'Custom/opencap' # or 'Custom/opencap' for production
metric_name = 'opencap_trials_pending'
stats = get_metric_average(
namespace, metric_name, start_time, end_time, period)

if stats['Datapoints']:
average = stats['Datapoints'][0]['Average']
print(f"Average value of '{metric_name}' over the last minute: {average}")
else:
print("No data points found.")
# Maybe raise an exception or do nothing to have control-loop retry this call later
return None

return average

def get_instance_id():
"""Retrieve the instance ID from EC2 metadata."""
response = requests.get("http://169.254.169.254/latest/meta-data/instance-id")
return response.text

def get_auto_scaling_group_name(instance_id):
"""Retrieve the Auto Scaling Group name using the instance ID."""
client = boto3.client('autoscaling')
response = client.describe_auto_scaling_instances(InstanceIds=[instance_id])
asg_name = response['AutoScalingInstances'][0]['AutoScalingGroupName']
return asg_name

def set_instance_protection(instance_id, asg_name, protect):
"""Set or remove instance protection."""
client = boto3.client('autoscaling')
client.set_instance_protection(
InstanceIds=[instance_id],
AutoScalingGroupName=asg_name,
ProtectedFromScaleIn=protect
)

def unprotect_current_instance():
instance_id = get_instance_id()
asg_name = get_auto_scaling_group_name(instance_id)
set_instance_protection(instance_id, asg_name, protect=False)

0 comments on commit 4fdf1da

Please sign in to comment.