From f9ca0243660f01567ac33a654ef5c54aff3683c0 Mon Sep 17 00:00:00 2001 From: Joshua Watt Date: Thu, 7 Sep 2023 08:36:05 -0600 Subject: [PATCH 1/2] ssh: Prevent timeout from deadlock Using Popen.wait() on a process that has output sent to a pipe can potentially deadlock if the process produces enough output to fill the pipe, since it will stall and never terminate waiting for the pipe to have more space. Instead, use Popen.communicate() with the timeout parameter. This will consume all output until EOF (preventing the process from stalling due to a full pipe), and then check the return code. In the event of a timeout error, Popen.communicate() doesn't loose any data, so it's safe to call it again after the Popen.kill() in the exception handler. This likely was done this way because the timeout parameter was new in Python 3.3, but this shouldn't be a concern anymore Signed-off-by: Joshua Watt --- labgrid/util/ssh.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/labgrid/util/ssh.py b/labgrid/util/ssh.py index 0f51bbce8..8df205510 100644 --- a/labgrid/util/ssh.py +++ b/labgrid/util/ssh.py @@ -453,8 +453,8 @@ def _start_own_master(self): ) try: - if self._master.wait(timeout=connect_timeout) != 0: - stdout, stderr = self._master.communicate() + stdout, stderr = self._master.communicate(timeout=connect_timeout) + if self._master.returncode != 0: raise ExecutionError( f"failed to connect to {self.host} with args {args}, returncode={self._master.returncode} {stdout},{stderr}" # pylint: disable=line-too-long ) From b8f059fb4277e3ca01684b7a71ac89974ff1cc36 Mon Sep 17 00:00:00 2001 From: Joshua Watt Date: Thu, 7 Sep 2023 08:36:05 -0600 Subject: [PATCH 2/2] sshdriver: Prevent timeout from deadlock Using Popen.wait() on a process that has output sent to a pipe can potentially deadlock if the process produces enough output to fill the pipe, since it will stall and never terminate waiting for the pipe to have more space. Instead, use Popen.communicate() with the timeout parameter. This will consume all output until EOF (preventing the process from stalling due to a full pipe), and then check the return code. In the event of a timeout error, Popen.communicate() doesn't loose any data, so it's safe to call it again after the Popen.kill() in the exception handler. This likely was done this way because the timeout parameter was new in Python 3.3, but this shouldn't be a concern anymore Signed-off-by: Joshua Watt --- labgrid/driver/sshdriver.py | 11 ++++++----- tests/test_sshdriver.py | 4 ++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/labgrid/driver/sshdriver.py b/labgrid/driver/sshdriver.py index 3ad6fafa5..fb63988d4 100644 --- a/labgrid/driver/sshdriver.py +++ b/labgrid/driver/sshdriver.py @@ -136,9 +136,8 @@ def _start_own_master_once(self, timeout): try: subprocess_timeout = timeout + 5 - return_value = self.process.wait(timeout=subprocess_timeout) - if return_value != 0: - stdout, _ = self.process.communicate(timeout=subprocess_timeout) + stdout, _ = self.process.communicate(timeout=subprocess_timeout) + if self.process.returncode != 0: stdout = stdout.split(b"\n") for line in stdout: self.logger.warning("ssh: %s", line.rstrip().decode(encoding="utf-8", errors="replace")) @@ -155,12 +154,14 @@ def _start_own_master_once(self, timeout): pass raise ExecutionError( - f"Failed to connect to {self.networkservice.address} with {' '.join(args)}: return code {return_value}", # pylint: disable=line-too-long + f"Failed to connect to {self.networkservice.address} with {' '.join(args)}: return code {self.process.returncode}", # pylint: disable=line-too-long stdout=stdout, ) except subprocess.TimeoutExpired: + self.process.kill() + stdout, _ = self.process.communicate() raise ExecutionError( - f"Subprocess timed out [{subprocess_timeout}s] while executing {args}", + f"Subprocess timed out [{subprocess_timeout}s] while executing {args}: {stdout}", ) finally: if self.networkservice.password and os.path.exists(pass_file): diff --git a/tests/test_sshdriver.py b/tests/test_sshdriver.py index 875570822..0f766dfc7 100644 --- a/tests/test_sshdriver.py +++ b/tests/test_sshdriver.py @@ -17,6 +17,8 @@ def ssh_driver_mocked_and_activated(target, mocker): instance_mock = mocker.MagicMock() popen.return_value = instance_mock instance_mock.wait = mocker.MagicMock(return_value=0) + instance_mock.communicate = mocker.MagicMock(return_value=(b"", b"")) + instance_mock.returncode = 0 SSHDriver(target, "ssh") s = target.get_driver("SSHDriver") return s @@ -35,6 +37,8 @@ def test_create(target, mocker): instance_mock = mocker.MagicMock() popen.return_value = instance_mock instance_mock.wait = mocker.MagicMock(return_value=0) + instance_mock.communicate = mocker.MagicMock(return_value=(b"", b"")) + instance_mock.returncode = 0 s = SSHDriver(target, "ssh") assert isinstance(s, SSHDriver)