Skip to content

Commit

Permalink
Add static node stress test
Browse files Browse the repository at this point in the history
Signed-off-by: Judy Ng <[email protected]>
  • Loading branch information
judysng committed Mar 20, 2024
1 parent 4e80c57 commit 89f69b6
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 5 deletions.
2 changes: 2 additions & 0 deletions tests/integration-tests/clusters_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def __init__(self, name, ssh_key, config_file, region, custom_cli_credentials=No
self.__cfn_resources = None
self.__cfn_stack_arn = None
self.custom_cli_credentials = custom_cli_credentials
self.create_start_time = None
self.create_end_time = None

def __repr__(self):
attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()])
Expand Down
10 changes: 5 additions & 5 deletions tests/integration-tests/configs/scaling_stress_test.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
test-suites:
performance_tests:
test_scaling.py::test_scaling_stress_test:
test_scaling.py::test_static_scaling_stress_test:
dimensions:
- regions: ["us-east-1"]
instances: ["c5.large"]
oss: ["alinux2"]
schedulers: ["slurm"]
- regions: [ "eu-west-1" ]
instances: [ "c5.large" ]
oss: [ "alinux2" ]
schedulers: [ "slurm" ]
3 changes: 3 additions & 0 deletions tests/integration-tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# additional details.

import copy
import datetime
import json
import logging
import os
Expand Down Expand Up @@ -423,7 +424,9 @@ def _cluster_factory(cluster_config, upper_case_cluster_name=False, custom_cli_c
custom_cli_credentials=custom_cli_credentials,
)
if not request.config.getoption("cluster"):
cluster.create_start_time = datetime.datetime.now(tz=datetime.timezone.utc)
cluster.creation_response = factory.create_cluster(cluster, **kwargs)
cluster.create_end_time = datetime.datetime.now(tz=datetime.timezone.utc)
return cluster

yield _cluster_factory
Expand Down
77 changes: 77 additions & 0 deletions tests/integration-tests/tests/performance_tests/test_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,83 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ
raise e


@pytest.mark.usefixtures("scheduler")
@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing"])
def test_static_scaling_stress_test(
    test_datadir,
    instance,
    os,
    region,
    request,
    pcluster_config_reader,
    scheduler_commands_factory,
    clusters_factory,
    scaling_strategy,
):
    """
    Measure cluster creation time and job start time for clusters made only of static nodes.

    For every scaling target listed in the test config, a cluster is created with
    min_cluster_size == max_cluster_size == target, as opposed to scaling up and down with dynamic nodes.
    For each cluster the test records:
    - the time it takes to create the cluster with the target number of static nodes;
    - the time between a job spanning all the nodes being accepted and that job running.

    This test doesn't upload metrics like the dynamic ones because the nodes start up during cluster creation,
    so the number of nodes can't be monitored from here.

    NOTE: the measured times are currently only logged; no comparison against the baselines is performed here.
    TODO: assert the recorded times against baselines once baseline values for them are available.
    """
    # Get the scaling parameters
    scaling_test_config = _validate_and_get_scaling_test_config(test_datadir, request)
    max_monitoring_time_in_mins = scaling_test_config.get("MaxMonitoringTimeInMins")
    shared_headnode_storage_type = scaling_test_config.get("SharedHeadNodeStorageType")
    head_node_instance_type = scaling_test_config.get("HeadNodeInstanceType")
    scaling_targets = scaling_test_config.get("ScalingTargets")

    for scaling_target in scaling_targets:
        # Creating cluster with intended head node instance type and scaling parameters
        cluster_config = pcluster_config_reader(
            # The scale-down idle time is set to the max monitoring time;
            # this test does not monitor any scale down.
            scaledown_idletime=max_monitoring_time_in_mins,
            # min == max, so every compute node of the cluster is static
            min_cluster_size=scaling_target,
            max_cluster_size=scaling_target,
            head_node_instance_type=head_node_instance_type,
            shared_headnode_storage_type=shared_headnode_storage_type,
            scaling_strategy=scaling_strategy,
        )

        # Create cluster and get creation start/end time (set on the cluster object by the clusters factory)
        cluster = clusters_factory(cluster_config)
        # NOTE(review): _datetime_to_minute presumably truncates to the minute, which would make the
        # durations below multiples of 60 seconds - confirm this granularity is intended.
        cluster_start_time = _datetime_to_minute(cluster.create_start_time)
        cluster_end_time = _datetime_to_minute(cluster.create_end_time)
        cluster_create_time = int((cluster_end_time - cluster_start_time).total_seconds())

        # Run a job and get the time it takes for the job to start running
        remote_command_executor = RemoteCommandExecutor(cluster)
        scheduler_commands = scheduler_commands_factory(remote_command_executor)

        # The job requests as many nodes as the scaling target, i.e. the whole static fleet
        scaling_job = {
            "command": "srun sleep 10",
            "nodes": scaling_target,
        }
        job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job)
        # The clock starts only after the submission has been accepted
        start_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
        scheduler_commands.wait_job_running(job_id)
        end_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
        # Only the time to reach the running state matters, so the job is not left to complete
        scheduler_commands.cancel_job(job_id)
        job_start_time = int((end_time - start_time).total_seconds())

        scaling_results = {
            "Region": region,
            "OS": os,
            "ComputeNode": instance,
            "HeadNode": head_node_instance_type,
            "ScalingTarget": scaling_target,
            "ScalingStrategy": scaling_strategy,
            "ClusterCreateTime": cluster_create_time,
            "JobStartTime": job_start_time,
        }
        logging.info("Scaling Results: %s", scaling_results)

@pytest.mark.usefixtures("scheduler")
@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing", "best-effort"])
def test_scaling_stress_test(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Cluster configuration template with Jinja placeholders.
# NOTE(review): presumably rendered by pcluster_config_reader in test_static_scaling_stress_test,
# whose keyword arguments match the placeholder names used below - confirm against the test datadir.
Image:
Os: {{ os }}
HeadNode:
# SharedStorageType is emitted only when shared_headnode_storage_type is set
{% if shared_headnode_storage_type %}
SharedStorageType: {{ shared_headnode_storage_type }}
{% endif %}
InstanceType: {{ head_node_instance_type }}
Networking:
SubnetId: {{ public_subnet_id }}
Ssh:
KeyName: {{ key_name }}
Scheduling:
Scheduler: {{ scheduler }}
ScalingStrategy: {{ scaling_strategy }}
SlurmSettings:
ScaledownIdletime: {{ scaledown_idletime }}
SlurmQueues:
# Single queue with a single compute resource
- Name: queue-0
ComputeResources:
- Name: compute-resource-0
Instances:
- InstanceType: {{ instance }}
# The static scaling test passes the same value for min_cluster_size and max_cluster_size
MinCount: {{ min_cluster_size }}
MaxCount: {{ max_cluster_size }}
Networking:
SubnetIds:
- {{ private_subnet_id }}
DevSettings:
Timeouts:
# NOTE(review): presumably expressed in seconds and raised to allow for very large clusters - confirm
HeadNodeBootstrapTimeout: 3600
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"c5.large": {
"1000": {
"best-effort": {
"scale_up_time_ec2": 180,
"scale_up_time_scheduler": 540,
"scale_down_time": 120
},
"all-or-nothing": {
"scale_up_time_ec2": 180,
"scale_up_time_scheduler": 540,
"scale_down_time": 120
}
},
"2000": {
"best-effort": {
"scale_up_time_ec2": 300,
"scale_up_time_scheduler": 600,
"scale_down_time": 180
},
"all-or-nothing": {
"scale_up_time_ec2": 300,
"scale_up_time_scheduler": 600,
"scale_down_time": 180
}
},
"3000": {
"best-effort": {
"scale_up_time_ec2": 420,
"scale_up_time_scheduler": 1020,
"scale_down_time": 240
},
"all-or-nothing": {
"scale_up_time_ec2": 420,
"scale_up_time_scheduler": 1020,
"scale_down_time": 240
}
},
"4000": {
"best-effort": {
"scale_up_time_ec2": 540,
"scale_up_time_scheduler": 1200,
"scale_down_time": 300
},
"all-or-nothing": {
"scale_up_time_ec2": 540,
"scale_up_time_scheduler": 1200,
"scale_down_time": 300
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Scaling test parameters, read by the test through scaling_test_config.get(<key>).
# Passed to the cluster config as scaledown_idletime by the static scaling test
MaxMonitoringTimeInMins: 20
# One cluster is created per target; each target is used as both min and max cluster size
ScalingTargets: [1000, 2000, 3000, 4000]
# Forwarded to the cluster config as shared_headnode_storage_type
SharedHeadNodeStorageType: 'Efs'
# Forwarded to the cluster config as head_node_instance_type
HeadNodeInstanceType: 'c5.24xlarge'
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Schema for the scaling test config: a map with four required keys.
# NOTE(review): the type/mapping/sequence layout looks like a pykwalify schema - confirm the validator in use.
type: map
mapping:
MaxMonitoringTimeInMins:
type: int
required: true
# Sequence of integer scaling targets
ScalingTargets:
type: seq
required: true
sequence:
- type: int
SharedHeadNodeStorageType:
type: str
required: true
HeadNodeInstanceType:
type: str
required: true

0 comments on commit 89f69b6

Please sign in to comment.