Skip to content

Commit

Permalink
test changing
Browse files Browse the repository at this point in the history
Signed-off-by: Judy Ng <[email protected]>
  • Loading branch information
judysng committed Jan 18, 2024
1 parent fa13db4 commit 18fa6b3
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 17 deletions.
30 changes: 29 additions & 1 deletion tests/integration-tests/tests/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from remote_command_executor import RemoteCommandExecutor
from retrying import retry
from time_utils import seconds
from utils import get_instance_info, run_command
from utils import get_instance_info, run_command, get_compute_nodes_instance_ips

from tests.common.osu_common import PRIVATE_OSES

Expand Down Expand Up @@ -401,3 +401,31 @@ def wait_process_completion(remote_command_executor, pid):
raise Exception("The process is still running")
else:
return result.stdout.strip()


def check_file_handler_leak(cluster, region):
    """
    Return a mapping of compute node instance ip to the number of files opened by computemgtd.

    For every ip returned by get_compute_nodes_instance_ips, a command is run through a
    RemoteCommandExecutor built for the cluster; the command sshes into the compute node and counts
    the lines printed by `lsof` for the computemgtd process.

    :param cluster: cluster under test; used to build the RemoteCommandExecutor and, via its name,
        to look up the compute node instance ips
    :param region: region passed to get_compute_nodes_instance_ips
    :return: dict mapping each compute node instance ip to the `wc -l` output as a stripped string
    """
    logging.info("Checking the number of file descriptors...")
    remote_command_executor = RemoteCommandExecutor(cluster)

    # NOTE: no dynamic node is woken up here; only the ips returned by
    # get_compute_nodes_instance_ips at the time of the call are inspected.
    compute_node_instance_ips = get_compute_nodes_instance_ips(cluster.name, region)

    instance_ip_to_no_files = {}
    for compute_node_instance_ip in compute_node_instance_ips:
        lsof_cmd = f"ssh -q {compute_node_instance_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'"
        # Strip surrounding whitespace so that a stray newline can neither pollute the log line
        # nor make two otherwise identical counts compare as different.
        no_files = remote_command_executor.run_remote_command(lsof_cmd).stdout.strip()
        logging.info("%s number of file descriptors: %s", compute_node_instance_ip, no_files)
        instance_ip_to_no_files[compute_node_instance_ip] = no_files

    return instance_ip_to_no_files


def assert_no_file_handler_leak(init_compute_ip_to_no_files, cluster, region):
    """
    Assert that the current number of open files for each compute node is the same as in the given map.

    :param init_compute_ip_to_no_files: mapping of compute node instance ip to number of open files,
        as returned by an earlier call to check_file_handler_leak
    :param cluster: cluster under test, forwarded to check_file_handler_leak
    :param region: region of the cluster, forwarded to check_file_handler_leak
    :raises Exception: if the current set of compute node ips is not the same as the initial one
    """
    current_compute_ip_to_no_files = check_file_handler_leak(cluster, region)

    # Compare the ip sets in both directions: looping only over the current nodes would let a node
    # that disappeared (or an empty current map) pass without a single count being compared.
    initial_ips = set(init_compute_ip_to_no_files)
    current_ips = set(current_compute_ip_to_no_files)
    if initial_ips != current_ips:
        raise Exception(
            "Compute node ips differed: "
            f"missing={sorted(initial_ips - current_ips)}, unexpected={sorted(current_ips - initial_ips)}"
        )

    for compute_ip, current_no_files in current_compute_ip_to_no_files.items():
        # The description makes a failure report which node changed its open file count.
        assert_that(current_no_files).described_as(
            f"number of open files on compute node {compute_ip}"
        ).is_equal_to(init_compute_ip_to_no_files[compute_ip])

38 changes: 22 additions & 16 deletions tests/integration-tests/tests/resource_leaks/test_resource_leaks.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging

import pytest
import time
from assertpy import assert_that
from remote_command_executor import RemoteCommandExecutor
from utils import get_compute_nodes_instance_ips

from tests.common.assertions import assert_head_node_is_running
from tests.common.utils import check_file_handler_leak, assert_no_file_handler_leak


@pytest.mark.usefixtures("instance", "os", "scheduler")
Expand All @@ -15,7 +17,7 @@ def test_resource_leaks(
s3_bucket_factory,
clusters_factory,
test_datadir,
scheduler_commands_factory,
scheduler_commands_factory
):
total_sleep_time = 1800 # 30 minutes
loop_sleep_time = 300 # 5 minutes
Expand All @@ -25,18 +27,22 @@ def test_resource_leaks(
assert_head_node_is_running(region, cluster)
remote_command_executor = RemoteCommandExecutor(cluster)

compute_node_instance_ip = get_compute_nodes_instance_ips(cluster.name, region)[0]
lsof_cmd = f"ssh -q {compute_node_instance_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'"
sleep_cmd = f"ssh -q {compute_node_instance_ip} 'sleep {loop_sleep_time}'"

logging.info("Checking the number of file descriptors...")
initial_no_file_descs = remote_command_executor.run_remote_command(lsof_cmd).stdout
logging.info("Initial number of file descriptors: %s", initial_no_file_descs)

curr_no_file_descs = initial_no_file_descs
for _ in range(total_sleep_time // loop_sleep_time):
remote_command_executor.run_remote_command(sleep_cmd)
curr_no_file_descs = remote_command_executor.run_remote_command(lsof_cmd).stdout
logging.info("Number of file descriptors after sleeping: %s", curr_no_file_descs)

assert_that(initial_no_file_descs).is_equal_to(curr_no_file_descs)
init_no_files = check_file_handler_leak(cluster, region)
# time.sleep(1200)
assert_no_file_handler_leak(init_no_files, cluster, region)

# compute_node_instance_ip = get_compute_nodes_instance_ips(cluster.name, region)[0]
# lsof_cmd = f"ssh -q {compute_node_instance_ip} 'sudo lsof -p $(pgrep computemgtd) | wc -l'"
# sleep_cmd = f"ssh -q {compute_node_instance_ip} 'sleep {loop_sleep_time}'"
#
# logging.info("Checking the number of file descriptors...")
# initial_no_file_descs = remote_command_executor.run_remote_command(lsof_cmd).stdout
# logging.info("Initial number of file descriptors: %s", initial_no_file_descs)
#
# curr_no_file_descs = initial_no_file_descs
# for _ in range(total_sleep_time // loop_sleep_time):
# remote_command_executor.run_remote_command(sleep_cmd)
# curr_no_file_descs = remote_command_executor.run_remote_command(lsof_cmd).stdout
# logging.info("Number of file descriptors after sleeping: %s", curr_no_file_descs)
#
# assert_that(initial_no_file_descs).is_equal_to(curr_no_file_descs)

0 comments on commit 18fa6b3

Please sign in to comment.