-
Notifications
You must be signed in to change notification settings - Fork 531
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Core] Avoid high concurrency issue with control master #4455
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,19 @@ | ||
"""Utils to check if the ssh control master should be disabled.""" | ||
|
||
import functools | ||
import subprocess | ||
|
||
from sky import sky_logging | ||
from sky.utils import subprocess_utils | ||
|
||
logger = sky_logging.init_logger(__name__) | ||
|
||
# The maximum number of concurrent ssh connections to the same node. This is a | ||
# heuristic value, based on the observation that when the number of concurrent | ||
# ssh connections to a node with control master is high, new connections through | ||
# control master will hang. | ||
_MAX_CONCURRENT_SSH_CONNECTIONS = 32 | ||
|
||
|
||
def is_tmp_9p_filesystem() -> bool: | ||
"""Check if the /tmp filesystem is 9p. | ||
|
@@ -34,16 +41,53 @@ def is_tmp_9p_filesystem() -> bool: | |
return filesystem_types[1].lower() == '9p' | ||
|
||
|
||
def should_disable_control_master(ip: str) -> bool:
    """Decide whether ssh control master should be disabled for a node.

    Control master is disabled when the filesystem is unsupported or when
    there are already many concurrent ssh connections to the node.

    The result is deliberately not cached: the number of concurrent ssh
    connections changes over time, so it is re-evaluated on every call.

    Args:
        ip: The ip address of the node.

    Returns:
        bool: True if the ssh control master should be disabled,
            False otherwise.
    """
    if is_unsupported_filesystem():
        return True
    if is_high_concurrency(ip):
        return True
    # There may be additional criteria to disable ssh control master
    # in the future. They should be checked here.
    return False
|
||
|
||
@functools.lru_cache(maxsize=1)
def is_unsupported_filesystem() -> bool:
    """Determine if the filesystem is unsupported for ssh control master.

    Currently the only unsupported case is the one reported by
    is_tmp_9p_filesystem(). The value returned by the first successful call
    is cached and reused by later calls.

    Returns:
        bool: True if the filesystem is unsupported, False otherwise.
    """
    return is_tmp_9p_filesystem()
|
||
|
||
def is_high_concurrency(ip: str) -> bool:
    """Determine if the node has high concurrent ssh connections.

    This is a best-effort heuristic: it lists the processes running on the
    local machine and counts those whose command line mentions both ``ssh``
    and ``ip``. Connections opened to the node from other machines are not
    visible to this check. Any failure to list processes is treated as
    "not high concurrency".

    Args:
        ip: The IP address to check.

    Returns:
        bool: True if the number of local ssh processes mentioning ``ip`` is
            at least _MAX_CONCURRENT_SSH_CONNECTIONS, False otherwise.
    """
    if not ip:
        # Every command line contains the empty string, so there is nothing
        # meaningful to match against.
        return False
    try:
        # Run ps directly (no shell) so that `ip` is never interpreted as
        # shell syntax or as a regular expression. `-ww` asks ps not to
        # truncate long command lines, and `args=` prints each command line
        # without a header row.
        proc = subprocess.run(['ps', '-A', '-ww', '-o', 'args='],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.DEVNULL,
                              text=True,
                              errors='replace',
                              check=False)
    except (OSError, subprocess.SubprocessError):
        # ps is missing or could not be executed: fall back to "not high".
        return False
    if proc.returncode != 0:
        return False
    count = sum(1 for command in proc.stdout.splitlines()
                if 'ssh' in command and ip in command)
    return count >= _MAX_CONCURRENT_SSH_CONNECTIONS
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have you tested whether this is related to the number of CPUs?