Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle task timeout to send SIGTERM #67

Merged
merged 2 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions rebootmgr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,15 @@ def run_tasks(tasktype, con, hostname, dryrun, task_timeout):
for task in sorted(os.listdir("/etc/rebootmgr/%s_tasks/" % tasktype)):
task = os.path.join("/etc/rebootmgr/%s_tasks" % tasktype, task)
LOG.info("Run task %s" % task)
p = subprocess.Popen(task, env=env)
try:
subprocess.run(task, check=True, env=env, timeout=(task_timeout * 60))
except subprocess.CalledProcessError as e:
LOG.error("Task %s failed with return code %s. Exit" % (task, e.returncode))
sys.exit(EXIT_TASK_FAILED)
ret = p.wait(timeout=(task_timeout * 60))
except subprocess.TimeoutExpired:
p.terminate()
try:
p.wait(timeout=10)
except subprocess.TimeoutExpired:
p.kill()
LOG.error("Could not finish task %s in %i minutes. Exit" % (task, task_timeout))
LOG.error("Disable rebootmgr in consul for this node")
data = get_config(con, hostname)
Expand All @@ -84,6 +87,9 @@ def run_tasks(tasktype, con, hostname, dryrun, task_timeout):
put_config(con, hostname, data)
con.kv.delete("service/rebootmgr/reboot_in_progress")
sys.exit(EXIT_TASK_FAILED)
if ret != 0:
LOG.error("Task %s failed with return code %s. Exit" % (task, ret))
sys.exit(EXIT_TASK_FAILED)
LOG.info("task %s finished" % task)


Expand Down
68 changes: 61 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess

from unittest.mock import DEFAULT
from unittest.mock import MagicMock

import pytest
import consul
Expand Down Expand Up @@ -40,6 +41,57 @@ def fake_gethostname():
c.agent.service.deregister(name)


@pytest.fixture
def mock_subprocess_popen(mocker):
"""
Fixture for testing with mocked `subprocess.Popen`.

Returns a configured `MagicMock` instance.

You can optionally pass a `side_effect` as a second argument
which will be used as a side_effect for Popen.wait.

`side_effect` can be an Exception and will then be raised;
see the `MagicMock.side_effect` documentation for more information.

Example:

mocked_popen = mock_subprocess_popen(["reboot"])

call_your_tested_code()

mocked_popen.assert_any_call(["reboot"])
mocked_popen.wait.assert_called()
"""
wait_results = {}

def get_wait_result(command):
if isinstance(command, str):
command = [command]
elif isinstance(command, list):
pass
else:
raise ValueError("command must be either string or list")

return wait_results[json.dumps(command)]

def get_mocked_popen(command, *args, **kwargs):
mock = MagicMock()
return_value, side_effect = get_wait_result(command)
mock.wait.return_value = return_value
mock.wait.side_effect = side_effect
return mock

mocked_popen = mocker.patch("subprocess.Popen")
mocked_popen.side_effect = get_mocked_popen

def add(command, wait_return_value=None, wait_side_effect=None):
wait_results[json.dumps(command)] = wait_return_value, wait_side_effect
return mocked_popen

return add


@pytest.fixture
def mock_subprocess_run(mocker):
"""
Expand Down Expand Up @@ -101,7 +153,7 @@ def run(*args, catch_exceptions=False, **kwargs):


@pytest.fixture
def reboot_task(mocker, mock_subprocess_run):
def reboot_task(mocker, mock_subprocess_popen):
tasks = {"pre_boot": [], "post_boot": []}

def listdir(directory):
Expand All @@ -120,15 +172,17 @@ def create_task(tasktype, filename, exit_code=0, raise_timeout_expired=False):

tasks[tasktype] += [filename]

side_effect = None
if exit_code != 0:
side_effect = subprocess.CalledProcessError(exit_code, filename)
elif raise_timeout_expired:
if raise_timeout_expired:
return_value = None
side_effect = subprocess.TimeoutExpired(filename, 1234)
else:
return_value = exit_code
side_effect = None

mock_subprocess_run(
return mock_subprocess_popen(
["/etc/rebootmgr/{}_tasks/{}".format(tasktype, filename)],
side_effect)
wait_return_value=return_value,
wait_side_effect=side_effect)

return create_task

Expand Down
14 changes: 14 additions & 0 deletions tests/test_post_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ def test_post_reboot_consul_checks_passing(
"""
mocker.patch("time.sleep")
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert result.exit_code == 0


Expand All @@ -50,10 +52,12 @@ def test_post_reboot_consul_checks_failing(

mocker.patch("time.sleep")
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert result.exit_code == EXIT_CONSUL_CHECKS_FAILED


Expand All @@ -66,10 +70,12 @@ def test_post_reboot_wait_until_healthy_and_are_healthy(
"""
mocker.patch("time.sleep")
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert result.exit_code == 0


Expand Down Expand Up @@ -103,10 +109,12 @@ def fake_sleep(seconds):

mocker.patch("time.sleep", new=fake_sleep)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert sleep_counter == 0
assert result.exit_code == 0

Expand Down Expand Up @@ -135,6 +143,7 @@ def test_post_reboot_phase_fails_with_uptime(
run_cli, forward_consul_port, default_config, reboot_in_progress,
reboot_task, mocker):
mocker.patch('rebootmgr.main.open', new=mock_open(read_data='99999999.9 99999999.9'))
mocker.patch("subprocess.run")
reboot_task("post_boot", "50_another_task.sh")

result = run_cli(rebootmgr, ["-v", "--check-uptime"])
Expand All @@ -146,6 +155,7 @@ def test_post_reboot_phase_fails_with_uptime(
def test_post_reboot_succeeds_with_current_node_in_maintenance(
run_cli, consul_cluster, reboot_in_progress, forward_consul_port,
default_config, reboot_task, mocker):
mocker.patch("subprocess.run")
consul_cluster[0].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[1].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[2].agent.service.register("A", tags=["rebootmgr"])
Expand All @@ -163,6 +173,7 @@ def test_post_reboot_succeeds_with_current_node_in_maintenance(
def test_post_reboot_fails_with_other_node_in_maintenance(
run_cli, consul_cluster, reboot_in_progress, forward_consul_port,
default_config, reboot_task, mocker):
mocker.patch("subprocess.run")
consul_cluster[0].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[1].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[2].agent.service.register("A", tags=["rebootmgr"])
Expand All @@ -181,6 +192,7 @@ def test_post_reboot_succeeds_with_other_node_in_maintenance_but_ignoring(
run_cli, consul_cluster, reboot_in_progress, forward_consul_port,
default_config, reboot_task, mocker):

mocker.patch("subprocess.run")
consul_cluster[0].agent.service.register("A", tags=["rebootmgr"])
consul_cluster[1].agent.service.register("A", tags=["rebootmgr", "ignore_maintenance"])
consul_cluster[2].agent.service.register("A", tags=["rebootmgr"])
Expand Down Expand Up @@ -228,10 +240,12 @@ def fake_sleep(seconds):

mocker.patch("time.sleep", new=fake_sleep)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = mocker.patch("subprocess.Popen")

result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"])

mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert sleep_counter == 0
assert 'There were failed consul checks' in result.output
assert '_node_maintenance on consul2' in result.output
Expand Down
17 changes: 14 additions & 3 deletions tests/test_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_dryrun_reboot_succeeds_with_tasks(run_cli, forward_consul_port,
reboot_task, mock_subprocess_run,
mocker):
mocked_sleep = mocker.patch("time.sleep")
reboot_task("pre_boot", "00_some_task.sh")
mocked_popen = reboot_task("pre_boot", "00_some_task.sh")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])

result = run_cli(rebootmgr, ["-vv", "--dryrun"])
Expand All @@ -58,8 +58,11 @@ def test_dryrun_reboot_succeeds_with_tasks(run_cli, forward_consul_port,
assert "in key service/rebootmgr/reboot_in_progress" in result.output
assert result.exit_code == 0

assert mocked_run.call_count == 1
args, kwargs = mocked_run.call_args
# shutdown must not be called
mocked_run.assert_not_called()
# task should be called
assert mocked_popen.call_count == 1
args, kwargs = mocked_popen.call_args
assert args[0] == "/etc/rebootmgr/pre_boot_tasks/00_some_task.sh"
assert 'env' in kwargs
assert 'REBOOTMGR_DRY_RUN' in kwargs['env']
Expand All @@ -80,6 +83,7 @@ def test_reboot_fail(
mock_subprocess_run, mocker):
mocked_sleep = mocker.patch("time.sleep")

mocked_popen = mocker.patch("subprocess.Popen")
mocked_run = mock_subprocess_run(
["shutdown", "-r", "+1"],
side_effect=Exception("Failed to run reboot command"))
Expand All @@ -88,6 +92,7 @@ def test_reboot_fail(

assert result.exit_code == 1

mocked_popen.assert_not_called()
mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True)

# We want rebootmgr to sleep for 2 minutes after running the pre boot tasks,
Expand All @@ -112,10 +117,12 @@ def test_reboot_succeeds_if_this_node_is_in_maintenance(
consul_cluster[0].agent.maintenance(True)

mocker.patch("time.sleep")
mocked_popen = mocker.patch("subprocess.Popen")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])

result = run_cli(rebootmgr, ["-v"])

mocked_popen.assert_not_called()
mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True)
assert result.exit_code == 0

Expand All @@ -129,10 +136,12 @@ def test_reboot_fails_if_another_node_is_in_maintenance(
consul_cluster[1].agent.maintenance(True)

mocker.patch("time.sleep")
mocked_popen = mocker.patch("subprocess.Popen")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])

result = run_cli(rebootmgr, ["-v"])

mocked_popen.assert_not_called()
mocked_run.assert_not_called()
assert 'There were failed consul checks' in result.output
assert '_node_maintenance on consul2' in result.output
Expand All @@ -149,9 +158,11 @@ def test_reboot_succeeds_if_another_node_is_in_maintenance_but_ignoring(
consul_cluster[1].agent.maintenance(True)

mocker.patch("time.sleep")
mocked_popen = mocker.patch("subprocess.Popen")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])

result = run_cli(rebootmgr, ["-v"])

mocked_popen.assert_not_called()
mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True)
assert result.exit_code == 0
4 changes: 4 additions & 0 deletions tests/test_stopflag.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ def test_set_global_stop_flag(
mock_subprocess_run, mocker):
mocked_sleep = mocker.patch("time.sleep")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])
mocked_popen = mocker.patch("subprocess.Popen")
datacenter = "test"

result = run_cli(rebootmgr, ["-v", "--set-global-stop-flag", datacenter])

mocked_sleep.assert_not_called()
mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert "Set "+datacenter+" global stop flag:" in result.output
idx, data = consul_cluster[0].kv.get("service/rebootmgr/stop", dc=datacenter)
assert idx is not None
Expand Down Expand Up @@ -96,12 +98,14 @@ def test_set_local_stop_flag(
mock_subprocess_run, mocker):
mocked_sleep = mocker.patch("time.sleep")
mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"])
mocked_popen = mocker.patch("subprocess.Popen")
hostname = socket.gethostname().split(".")[0]

result = run_cli(rebootmgr, ["-v", "--set-local-stop-flag"])

mocked_sleep.assert_not_called()
mocked_run.assert_not_called()
mocked_popen.assert_not_called()
assert "Set "+hostname+" local stop flag:" in result.output
idx, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format(
hostname))
Expand Down
17 changes: 12 additions & 5 deletions tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def test_reboot_task_timeout(run_cli, consul_cluster, forward_consul_port, defau

def test_reboot_preboot_task_fails(run_cli, consul_cluster, forward_consul_port, default_config, reboot_task, mocker):
mocker.patch("time.sleep")
reboot_task("pre_boot", "00_some_task.sh", exit_code=1)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = reboot_task("pre_boot", "00_some_task.sh", exit_code=1)

result = run_cli(rebootmgr)

Expand All @@ -29,13 +30,15 @@ def test_reboot_preboot_task_fails(run_cli, consul_cluster, forward_consul_port,
assert json.loads(data["Value"].decode()) == {
"enabled": True,
}
# TODO(oseibert): check that shutdown is NOT called.
assert mocked_popen.call_count == 1
mocked_run.assert_not_called()


def test_reboot_task_timeout_with_preexisting_config(run_cli, consul_cluster, forward_consul_port, reboot_task, mocker):
consul_cluster[0].kv.put("service/rebootmgr/nodes/{}/config".format(socket.gethostname()), '{"enabled": true, "test_preserved": true}')
mocker.patch("time.sleep")
reboot_task("pre_boot", "00_some_task.sh", raise_timeout_expired=True)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = reboot_task("pre_boot", "00_some_task.sh", raise_timeout_expired=True)

result = run_cli(rebootmgr)

Expand All @@ -48,11 +51,13 @@ def test_reboot_task_timeout_with_preexisting_config(run_cli, consul_cluster, fo
"enabled": False,
"message": "Could not finish task /etc/rebootmgr/pre_boot_tasks/00_some_task.sh in 120 minutes"
}
# TODO(oseibert): check that shutdown is NOT called.
assert mocked_popen.call_count == 1
mocked_run.assert_not_called()


def test_post_reboot_phase_task_timeout(run_cli, consul_cluster, forward_consul_port, default_config, reboot_task, mocker):
reboot_task("post_boot", "50_another_task.sh", raise_timeout_expired=True)
mocked_run = mocker.patch("subprocess.run")
mocked_popen = reboot_task("post_boot", "50_another_task.sh", raise_timeout_expired=True)

mocker.patch("time.sleep")
consul_cluster[0].kv.put("service/rebootmgr/reboot_in_progress", socket.gethostname())
Expand All @@ -67,3 +72,5 @@ def test_post_reboot_phase_task_timeout(run_cli, consul_cluster, forward_consul_
"enabled": False,
"message": "Could not finish task /etc/rebootmgr/post_boot_tasks/50_another_task.sh in 120 minutes"
}
assert mocked_popen.call_count == 1
mocked_run.assert_not_called()
Loading