
Commit

Add resource leak check into util, call checks in starccm test
Signed-off-by: Judy Ng <[email protected]>
judysng committed Jan 22, 2024
1 parent 15c454a commit 6b2c840
Showing 2 changed files with 44 additions and 1 deletion.
37 changes: 37 additions & 0 deletions tests/integration-tests/tests/common/utils.py
@@ -401,3 +401,40 @@ def wait_process_completion(remote_command_executor, pid):
        raise Exception("The process is still running")
    else:
        return result.stdout.strip()


def check_file_handler_leak(remote_command_executor, slurm_commands, cluster, region):
    """Return a mapping from each compute node's instance IP to its current number of open files in computemgtd."""
    logging.info("Checking the number of file descriptors...")

    # Submit a short job to each of the test nodes
    compute_node_names = slurm_commands.get_compute_nodes(all_nodes=True)
    for name in compute_node_names:
        slurm_commands.submit_command_and_assert_job_accepted(
            submit_command_args={"command": "srun sleep 1", "host": name}
        )
    # Wait for all jobs to be completed
    slurm_commands.wait_job_queue_empty()

    # Get the number of open files on all the nodes
    instance_ip_to_no_files = {}
    for node_name in compute_node_names:
        compute_node_instance_ip = slurm_commands.get_node_addr(node_name)
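        # SSH from the head node into the compute node and count the files held open by the computemgtd daemon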
        lsof_cmd = f"ssh -q {compute_node_instance_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'"
        no_files = remote_command_executor.run_remote_command(lsof_cmd).stdout
        instance_ip_to_no_files[compute_node_instance_ip] = no_files

    logging.info(f"Mapping from instance ip to number of open files in computemgtd: {instance_ip_to_no_files}")
    return instance_ip_to_no_files


def assert_no_file_handler_leak(init_compute_ip_to_no_files, remote_command_executor, slurm_commands, cluster, region):
    """Assert that each compute node's current number of open files matches the given initial mapping."""
    current_compute_ip_to_no_files = check_file_handler_leak(remote_command_executor, slurm_commands, cluster, region)
    logging.info(
        f"Asserting that the number of open files in computemgtd hasn't grown from "
        f"{init_compute_ip_to_no_files} to {current_compute_ip_to_no_files}."
    )
    for compute_ip in current_compute_ip_to_no_files:
        if compute_ip in init_compute_ip_to_no_files:
            assert_that(current_compute_ip_to_no_files[compute_ip]).is_equal_to(init_compute_ip_to_no_files[compute_ip])
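
Taken together, the intended usage pattern of this pair of helpers is: capture a per-node baseline before the workload, then re-check it afterwards. A minimal sketch, assuming the test already has the usual integration-test fixtures (remote_command_executor, scheduler_commands, cluster, region); run_workload is a hypothetical placeholder for the test's real work, and the actual wiring for the StarCCM+ test is in the second changed file below.

    # Record a per-node baseline of open computemgtd file descriptors.
    init_no_files = check_file_handler_leak(remote_command_executor, scheduler_commands, cluster, region)

    run_workload()  # hypothetical placeholder for the test's actual work

    # Re-measure and fail the test if any node's count differs from its baseline.
    assert_no_file_handler_leak(init_no_files, remote_command_executor, scheduler_commands, cluster, region)
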
7 changes: 7 additions & 1 deletion in the StarCCM+ performance test file
@@ -5,6 +5,8 @@
import pytest
from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor

from tests.common.utils import assert_no_file_handler_leak, check_file_handler_leak

# timeout in seconds
STARCCM_INSTALLATION_TIMEOUT = 1800
STARCCM_JOB_TIMEOUT = 600
@@ -70,13 +72,15 @@ def test_starccm(
    cluster = clusters_factory(cluster_config)
    logging.info("Cluster Created")
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = scheduler_commands_factory(remote_command_executor)
    init_no_files = check_file_handler_leak(remote_command_executor, scheduler_commands, cluster, region)

    if not starccm_installed(remote_command_executor):
        logging.info("Installing StarCCM+")
        remote_command_executor.run_remote_script(
            str(test_datadir / "starccm.install.sh"), timeout=STARCCM_INSTALLATION_TIMEOUT, hide=False
        )
        logging.info("StarCCM+ Installed")
    scheduler_commands = scheduler_commands_factory(remote_command_executor)
    podkey, licpath = get_starccm_secrets(region)
    performance_degradation = {}
    for node in number_of_nodes:
@@ -112,6 +116,8 @@
f"Percentage difference: {percentage_difference}%, Outcome: {outcome}"
)

assert_no_file_handler_leak(init_no_files, remote_command_executor, scheduler_commands, cluster, region)

if performance_degradation:
pytest.fail(f"Performance degradation detected: {performance_degradation}")
else:
