diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index fe4e9a2a04..b05df5d5ff 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -115,30 +115,51 @@ def test_static_scaling_stress_test(
     head_node_instance_type = scaling_test_config.get("HeadNodeInstanceType")
     scaling_targets = scaling_test_config.get("ScalingTargets")
 
+    # Creating cluster with intended head node instance type and scaling parameters
+    cluster_config = pcluster_config_reader(
+        # Prevent nodes being scaled down before we start monitoring the scale down metrics
+        scaledown_idletime=max_monitoring_time_in_mins,
+        head_node_instance_type=head_node_instance_type,
+        shared_headnode_storage_type=shared_headnode_storage_type,
+        scaling_strategy=scaling_strategy,
+        cluster_size=max(scaling_targets),
+    )
+    cluster = clusters_factory(cluster_config)
+    remote_command_executor = RemoteCommandExecutor(cluster)
+    scheduler_commands = scheduler_commands_factory(remote_command_executor)
+
     for scaling_target in scaling_targets:
-        # Creating cluster with intended head node instance type and scaling parameters
-        cluster_config = pcluster_config_reader(
-            # Prevent nodes being set down before we start monitoring the scale down metrics
+        upscale_cluster_config = pcluster_config_reader(
+            config_file="upscale-pcluster.config.yaml",
             scaledown_idletime=max_monitoring_time_in_mins,
-            min_cluster_size=scaling_target,
-            max_cluster_size=scaling_target,
+            cluster_size=scaling_target,
             head_node_instance_type=head_node_instance_type,
             shared_headnode_storage_type=shared_headnode_storage_type,
             scaling_strategy=scaling_strategy,
         )
-        # Create cluster and get creation start/end time
-        cluster = clusters_factory(cluster_config)
-        cluster_start_time = _datetime_to_minute(cluster.create_start_time)
-        cluster_end_time = _datetime_to_minute(cluster.create_end_time)
-        cluster_create_time = int((cluster_end_time - cluster_start_time).total_seconds())
+        cluster.update(str(upscale_cluster_config), force_update="true", wait=False, raise_on_error=False)
 
-        # Run a job and get the time it takes for the job to start running
-        remote_command_executor = RemoteCommandExecutor(cluster)
-        scheduler_commands = scheduler_commands_factory(remote_command_executor)
+        # Set start time at minute granularity (to simplify calculation and visualising on CloudWatch)
+        start_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
 
+        # Monitor the cluster during scale up
+        ec2_capacity_time_series_up, compute_nodes_time_series_up, timestamps, end_time = get_scaling_metrics(
+            remote_command_executor,
+            max_monitoring_time=minutes(max_monitoring_time_in_mins),
+            region=region,
+            cluster_name=cluster.name,
+            publish_metrics=True,
+            target_cluster_size=scaling_target,
+        )
+        # Extract scale up duration and timestamp from the monitoring metrics collected above
+        _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
+        scaling_target_time, scale_up_time_scheduler = _get_scaling_time(
+            compute_nodes_time_series_up, timestamps, scaling_target, start_time
+        )
+
+        # Check that a simple job succeeds and retrieve the time it takes for the job to start
         scaling_job = {
-            "command": f"srun sleep 10",
+            "command": "srun sleep 10",
            "nodes": scaling_target,
         }
         job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job)
@@ -148,6 +169,22 @@ def test_static_scaling_stress_test(
         scheduler_commands.cancel_job(job_id)
         job_start_time = int((end_time - start_time).total_seconds())
 
+        # Scale down the cluster
+        cluster.update(str(cluster_config), force_update="true", wait=False, raise_on_error=False)
+
+        # Monitor the cluster during scale down
+        scale_down_start_timestamp = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
+        ec2_capacity_time_series_down, compute_nodes_time_series_down, timestamps, end_time = get_scaling_metrics(
+            remote_command_executor,
+            max_monitoring_time=minutes(max_monitoring_time_in_mins),
+            region=region,
+            cluster_name=cluster.name,
+            publish_metrics=True,
+            target_cluster_size=0,
+        )
+        # Extract scale down duration and timestamp from the monitoring metrics collected above
+        _, scale_down_time = _get_scaling_time(ec2_capacity_time_series_down, timestamps, 0, scale_down_start_timestamp)
+
         scaling_results = {
             "Region": region,
             "OS": os,
@@ -155,7 +192,9 @@ def test_static_scaling_stress_test(
             "HeadNode": head_node_instance_type,
             "ScalingTarget": scaling_target,
             "ScalingStrategy": scaling_strategy,
-            "ClusterCreateTime": cluster_create_time,
+            "ScaleUpTimeEC2": scale_up_time_ec2,
+            "ScaleUpTimeScheduler": scale_up_time_scheduler,
+            "ScaleDownTime": scale_down_time,
             "JobStartTime": job_start_time
         }
         logging.info(f"Scaling Results: {scaling_results}")
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
index 7ca50781a2..e2a3fd4a2e 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
@@ -20,11 +20,8 @@ Scheduling:
         - Name: compute-resource-0
           Instances:
             - InstanceType: {{ instance }}
-          MinCount: {{ min_cluster_size }}
-          MaxCount: {{ max_cluster_size }}
+          MinCount: 0
+          MaxCount: {{ cluster_size }}
       Networking:
         SubnetIds:
           - {{ private_subnet_id }}
-DevSettings:
-  Timeouts:
-    HeadNodeBootstrapTimeout: 3600
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
index c5a8872274..141803a666 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
@@ -1,4 +1,4 @@
 MaxMonitoringTimeInMins: 20
-ScalingTargets: [1000, 2000, 3000, 4000]
+ScalingTargets: [100, 100, 100]
 SharedHeadNodeStorageType: 'Efs'
 HeadNodeInstanceType: 'c5.24xlarge'
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/upscale-pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/upscale-pcluster.config.yaml
new file mode 100644
index 0000000000..4b2a96c13a
--- /dev/null
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/upscale-pcluster.config.yaml
@@ -0,0 +1,27 @@
+Image:
+  Os: {{ os }}
+HeadNode:
+  {% if shared_headnode_storage_type %}
+  SharedStorageType: {{ shared_headnode_storage_type }}
+  {% endif %}
+  InstanceType: {{ head_node_instance_type }}
+  Networking:
+    SubnetId: {{ public_subnet_id }}
+  Ssh:
+    KeyName: {{ key_name }}
+Scheduling:
+  Scheduler: {{ scheduler }}
+  ScalingStrategy: {{ scaling_strategy }}
+  SlurmSettings:
+    ScaledownIdletime: {{ scaledown_idletime }}
+  SlurmQueues:
+    - Name: queue-0
+      ComputeResources:
+        - Name: compute-resource-0
+          Instances:
+            - InstanceType: {{ instance }}
+          MinCount: {{ cluster_size }}
+          MaxCount: {{ cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}