Skip to content

Commit

Permalink
fix: ssh health check failure status report (#349)
Browse files: browse the repository at this point in the history
* fix: ssh health check failure status report

* chore: update retry params

* ssh health check: raise an error on an unexpected result so that the check is retried

* change format string

* fixup tests

* retry longer wait

* retry longer wait

* health check no raise on error

* reset ssh conn retry

* Update src/openstack_cloud/openstack_manager.py

Co-authored-by: Christopher Bartz <[email protected]>

---------

Co-authored-by: Christopher Bartz <[email protected]>
  • Loading branch information
yanksyoon and cbartz authored Aug 9, 2024
1 parent bd1e262 commit e17b1a1
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 16 deletions.
26 changes: 19 additions & 7 deletions src/openstack_cloud/openstack_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,11 +497,16 @@ def _health_check(
elapsed_min = (created_at - current_time).total_seconds()
if server.status == _INSTANCE_STATUS_BUILDING:
return elapsed_min < CREATE_SERVER_TIMEOUT
return OpenstackRunnerManager._ssh_health_check(
conn=conn, server_name=server_name, startup=startup
)
try:
return OpenstackRunnerManager._ssh_health_check(
conn=conn, server_name=server_name, startup=startup
)
except _SSHError:
logger.warning("Health check failed, unable to SSH into server: %s", server_name)
return False

@staticmethod
@retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger)
def _ssh_health_check(conn: OpenstackConnection, server_name: str, startup: bool) -> bool:
"""Use SSH to check whether runner application is running.
Expand All @@ -515,6 +520,9 @@ def _ssh_health_check(conn: OpenstackConnection, server_name: str, startup: bool
server_name: The openstack server instance to check connections.
startup: Check only whether the startup is successful.
Raises:
_SSHError: if there was an error SSH-ing into the machine or with the SSH command.
Returns:
Whether the runner application is running.
"""
Expand All @@ -524,13 +532,17 @@ def _ssh_health_check(conn: OpenstackConnection, server_name: str, startup: bool
)
except _SSHError as exc:
logger.error("[ALERT]: Unable to SSH to server: %s, reason: %s", server_name, str(exc))
return True
raise

result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True)
logger.debug("Output of `ps aux` on %s stderr: %s", server_name, result.stderr)
if not result.ok or RUNNER_STARTUP_PROCESS not in result.stdout:
logger.warning("List all process command failed on %s ", server_name)
return False
if not result.ok:
logger.warning("List all process command failed on %s.", server_name)
raise _SSHError(f"List process command failed on {server_name}.")
if RUNNER_STARTUP_PROCESS not in result.stdout:
logger.warning("No startup process found on server %s.", server_name)
raise _SSHError(f"Runner not yet started on {server_name}.")

logger.info("Runner process found to be healthy on %s", server_name)
if startup:
return True
Expand Down
42 changes: 33 additions & 9 deletions tests/unit/test_openstack_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,21 +804,31 @@ def test__get_ssh_connection(
assert conn is not None


def test__ssh_health_check_success(
mock_server: MagicMock,
):
@pytest.mark.usefixtures("skip_retry")
def test__ssh_health_check_success(monkeypatch: pytest.MonkeyPatch, mock_server: MagicMock):
    """
    arrange: A server with SSH correctly set up, whose mocked command output mentions the
        runner script and worker process.
    act: Run the SSH health check on the server.
    assert: The health check passes.
    """
    # The OpenStack connection only needs to hand back the mocked server.
    openstack_conn = MagicMock(spec=openstack.connection.Connection)
    openstack_conn.get_server.return_value = mock_server

    # Canned command result returned by every `run` call on the fake SSH connection.
    ps_result = MagicMock()
    ps_result.stdout = "/home/ubuntu/actions-runner/run.sh\nRunner.Worker"
    fake_ssh = MagicMock()
    fake_ssh.run.return_value = ps_result

    # Bypass real SSH connection setup so the check talks to the fake connection.
    monkeypatch.setattr(
        openstack_manager.OpenstackRunnerManager,
        "_get_ssh_connection",
        MagicMock(return_value=fake_ssh),
    )

    is_healthy = openstack_manager.OpenstackRunnerManager._ssh_health_check(
        openstack_conn, mock_server.name, False
    )
    assert is_healthy


@pytest.mark.usefixtures("skip_retry")
def test__ssh_health_check_no_key(mock_server: MagicMock):
"""
arrange: A server with no key available.
Expand All @@ -831,23 +841,37 @@ def test__ssh_health_check_no_key(mock_server: MagicMock):
mock_connection = MagicMock(spec=openstack.connection.Connection)
mock_connection.get_server.return_value = mock_server

assert openstack_manager.OpenstackRunnerManager._ssh_health_check(
mock_connection, mock_server.name, False
)
with pytest.raises(openstack_manager._SSHError) as exc:
openstack_manager.OpenstackRunnerManager._ssh_health_check(
mock_connection, mock_server.name, False
)

assert "no valid keypair found" in str(exc)


def test__ssh_health_check_error(mock_server: MagicMock):
@pytest.mark.usefixtures("skip_retry")
def test__ssh_health_check_error(monkeypatch: pytest.MonkeyPatch, mock_server: MagicMock):
"""
arrange: A server with error on SSH run.
act: Run health check on the server.
assert: The health check fails.
"""
monkeypatch.setattr(openstack_manager.OpenstackRunnerManager, "_get_key_path", MagicMock())
mock_connection = MagicMock(spec=openstack.connection.Connection)
mock_connection.get_server.return_value = mock_server
openstack_manager.OpenstackRunnerManager._ssh_health_check(
mock_connection, mock_server.name, False
mock_ssh_connection = MagicMock()
mock_ssh_connection.run = MagicMock(side_effect=TimeoutError)
monkeypatch.setattr(
openstack_manager, "SshConnection", MagicMock(return_value=mock_ssh_connection)
)

with pytest.raises(openstack_manager._SSHError) as exc:
openstack_manager.OpenstackRunnerManager._ssh_health_check(
mock_connection, mock_server.name, False
)

assert "No connectable SSH addresses found" in str(exc)


def test__wait_until_runner_process_running_no_server():
"""
Expand Down

0 comments on commit e17b1a1

Please sign in to comment.