Skip to content

Commit

Permalink
testing
Browse files Browse the repository at this point in the history
Signed-off-by: Judy Ng <[email protected]>
  • Loading branch information
judysng committed Mar 21, 2024
1 parent 89f69b6 commit 772599b
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 21 deletions.
69 changes: 54 additions & 15 deletions tests/integration-tests/tests/performance_tests/test_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,30 +115,51 @@ def test_static_scaling_stress_test(
head_node_instance_type = scaling_test_config.get("HeadNodeInstanceType")
scaling_targets = scaling_test_config.get("ScalingTargets")

# Creating cluster with intended head node instance type and scaling parameters
cluster_config = pcluster_config_reader(
# Keep idle nodes from being scaled down before we start monitoring the scale-down metrics
scaledown_idletime=max_monitoring_time_in_mins,
head_node_instance_type=head_node_instance_type,
shared_headnode_storage_type=shared_headnode_storage_type,
scaling_strategy=scaling_strategy,
cluster_size=max(scaling_targets),
)
cluster = clusters_factory(cluster_config)
remote_command_executor = RemoteCommandExecutor(cluster)
scheduler_commands = scheduler_commands_factory(remote_command_executor)

for scaling_target in scaling_targets:
# Creating cluster with intended head node instance type and scaling parameters
cluster_config = pcluster_config_reader(
# Keep idle nodes from being scaled down before we start monitoring the scale-down metrics
upscale_cluster_config = pcluster_config_reader(
config_file="upscale-pcluster.config.yaml",
scaledown_idletime=max_monitoring_time_in_mins,
min_cluster_size=scaling_target,
max_cluster_size=scaling_target,
cluster_size=scaling_target,
head_node_instance_type=head_node_instance_type,
shared_headnode_storage_type=shared_headnode_storage_type,
scaling_strategy=scaling_strategy,
)

# Create cluster and get creation start/end time
cluster = clusters_factory(cluster_config)
cluster_start_time = _datetime_to_minute(cluster.create_start_time)
cluster_end_time = _datetime_to_minute(cluster.create_end_time)
cluster_create_time = int((cluster_end_time - cluster_start_time).total_seconds())
cluster.update(str(upscale_cluster_config), force_update="true", wait=False, raise_on_error=False)

# Run a job and get the time it takes for the job to start running
remote_command_executor = RemoteCommandExecutor(cluster)
scheduler_commands = scheduler_commands_factory(remote_command_executor)
# Set start time at minute granularity (to simplify calculation and visualising on CloudWatch)
start_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
# Monitor the cluster during scale up
ec2_capacity_time_series_up, compute_nodes_time_series_up, timestamps, end_time = get_scaling_metrics(
remote_command_executor,
max_monitoring_time=minutes(max_monitoring_time_in_mins),
region=region,
cluster_name=cluster.name,
publish_metrics=True,
target_cluster_size=scaling_target,
)
# Extract scale up duration and timestamp from the monitoring metrics collected above
_, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
scaling_target_time, scale_up_time_scheduler = _get_scaling_time(
compute_nodes_time_series_up, timestamps, scaling_target, start_time
)

# Check that a simple job succeeds and retrieve the time it takes for the job to start
scaling_job = {
"command": f"srun sleep 10",
"command": "srun sleep 10",
"nodes": scaling_target,
}
job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job)
Expand All @@ -148,14 +169,32 @@ def test_static_scaling_stress_test(
scheduler_commands.cancel_job(job_id)
job_start_time = int((end_time - start_time).total_seconds())

# Scale down the cluster
cluster.update(str(cluster_config), force_update="true", wait=False, raise_on_error=False)

# Monitor the cluster during scale down
scale_down_start_timestamp = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
ec2_capacity_time_series_down, compute_nodes_time_series_down, timestamps, end_time = get_scaling_metrics(
remote_command_executor,
max_monitoring_time=minutes(max_monitoring_time_in_mins),
region=region,
cluster_name=cluster.name,
publish_metrics=True,
target_cluster_size=0,
)
# Extract scale down duration and timestamp from the monitoring metrics collected above
_, scale_down_time = _get_scaling_time(ec2_capacity_time_series_down, timestamps, 0, scale_down_start_timestamp)

scaling_results = {
"Region": region,
"OS": os,
"ComputeNode": instance,
"HeadNode": head_node_instance_type,
"ScalingTarget": scaling_target,
"ScalingStrategy": scaling_strategy,
"ClusterCreateTime": cluster_create_time,
"ScaleUpTimeEC2": scale_up_time_ec2,
"ScaleUpTimeScheduler": scale_up_time_scheduler,
"ScaleDownTime": scale_down_time,
"JobStartTime": job_start_time
}
logging.info(f"Scaling Results: {scaling_results}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,8 @@ Scheduling:
- Name: compute-resource-0
Instances:
- InstanceType: {{ instance }}
MinCount: {{ min_cluster_size }}
MaxCount: {{ max_cluster_size }}
MinCount: 0
MaxCount: {{ cluster_size }}
Networking:
SubnetIds:
- {{ private_subnet_id }}
DevSettings:
Timeouts:
HeadNodeBootstrapTimeout: 3600
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
MaxMonitoringTimeInMins: 20
ScalingTargets: [1000, 2000, 3000, 4000]
ScalingTargets: [100, 100, 100]
SharedHeadNodeStorageType: 'Efs'
HeadNodeInstanceType: 'c5.24xlarge'
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Cluster configuration template added by this commit (presumably the
# "upscale-pcluster.config.yaml" that the scaling test passes to
# cluster.update -- confirm against the file tree).
# Placeholders in double curly braces are substituted when the test renders
# the template. Nesting below follows the ParallelCluster v3 schema; the
# flattened original had lost all indentation.
Image:
  Os: {{ os }}
HeadNode:
  # SharedStorageType is only emitted when the test supplies a value for it.
  {% if shared_headnode_storage_type %}
  SharedStorageType: {{ shared_headnode_storage_type }}
  {% endif %}
  InstanceType: {{ head_node_instance_type }}
  Networking:
    SubnetId: {{ public_subnet_id }}
  Ssh:
    KeyName: {{ key_name }}
Scheduling:
  Scheduler: {{ scheduler }}
  ScalingStrategy: {{ scaling_strategy }}
  SlurmSettings:
    # The test sets this to its maximum monitoring time.
    ScaledownIdletime: {{ scaledown_idletime }}
  SlurmQueues:
    - Name: queue-0
      ComputeResources:
        - Name: compute-resource-0
          Instances:
            - InstanceType: {{ instance }}
          # MinCount and MaxCount use the same value, pinning the compute
          # resource to exactly cluster_size nodes.
          # NOTE(review): per ParallelCluster semantics this should make every
          # node static -- confirm against the configuration reference.
          MinCount: {{ cluster_size }}
          MaxCount: {{ cluster_size }}
      Networking:
        SubnetIds:
          - {{ private_subnet_id }}

0 comments on commit 772599b

Please sign in to comment.