From f9ca0243660f01567ac33a654ef5c54aff3683c0 Mon Sep 17 00:00:00 2001
From: Joshua Watt <Joshua.Watt@garmin.com>
Date: Thu, 7 Sep 2023 08:36:05 -0600
Subject: [PATCH 1/2] ssh: Prevent timeout from deadlock

Using Popen.wait() on a process that has output sent to a pipe can
potentially deadlock if the process produces enough output to fill the
pipe, since it will stall and never terminate waiting for the pipe to
have more space.

Instead, use Popen.communicate() with the timeout parameter. This will
consume all output until EOF (preventing the process from stalling due
to a full pipe), and then check the return code. In the event of a
timeout error, Popen.communicate() doesn't loose any data, so it's safe
to call it again after the Popen.kill() in the exception handler.

This likely was done this way because the timeout parameter was new in
Python 3.3, but this shouldn't be a concern anymore

Signed-off-by: Joshua Watt <Joshua.Watt@garmin.com>
---
 labgrid/util/ssh.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/labgrid/util/ssh.py b/labgrid/util/ssh.py
index 0f51bbce8..8df205510 100644
--- a/labgrid/util/ssh.py
+++ b/labgrid/util/ssh.py
@@ -453,8 +453,8 @@ def _start_own_master(self):
         )
 
         try:
-            if self._master.wait(timeout=connect_timeout) != 0:
-                stdout, stderr = self._master.communicate()
+            stdout, stderr = self._master.communicate(timeout=connect_timeout)
+            if self._master.returncode != 0:
                 raise ExecutionError(
                     f"failed to connect to {self.host} with args {args}, returncode={self._master.returncode} {stdout},{stderr}"  # pylint: disable=line-too-long
                 )

From b8f059fb4277e3ca01684b7a71ac89974ff1cc36 Mon Sep 17 00:00:00 2001
From: Joshua Watt <Joshua.Watt@garmin.com>
Date: Thu, 7 Sep 2023 08:36:05 -0600
Subject: [PATCH 2/2] sshdriver: Prevent timeout from deadlock

Using Popen.wait() on a process that has output sent to a pipe can
potentially deadlock if the process produces enough output to fill the
pipe, since it will stall and never terminate waiting for the pipe to
have more space.

Instead, use Popen.communicate() with the timeout parameter. This will
consume all output until EOF (preventing the process from stalling due
to a full pipe), and then check the return code. In the event of a
timeout error, Popen.communicate() doesn't loose any data, so it's safe
to call it again after the Popen.kill() in the exception handler.

This likely was done this way because the timeout parameter was new in
Python 3.3, but this shouldn't be a concern anymore

Signed-off-by: Joshua Watt <Joshua.Watt@garmin.com>
---
 labgrid/driver/sshdriver.py | 11 ++++++-----
 tests/test_sshdriver.py     |  4 ++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/labgrid/driver/sshdriver.py b/labgrid/driver/sshdriver.py
index 3ad6fafa5..fb63988d4 100644
--- a/labgrid/driver/sshdriver.py
+++ b/labgrid/driver/sshdriver.py
@@ -136,9 +136,8 @@ def _start_own_master_once(self, timeout):
 
         try:
             subprocess_timeout = timeout + 5
-            return_value = self.process.wait(timeout=subprocess_timeout)
-            if return_value != 0:
-                stdout, _ = self.process.communicate(timeout=subprocess_timeout)
+            stdout, _ = self.process.communicate(timeout=subprocess_timeout)
+            if self.process.returncode != 0:
                 stdout = stdout.split(b"\n")
                 for line in stdout:
                     self.logger.warning("ssh: %s", line.rstrip().decode(encoding="utf-8", errors="replace"))
@@ -155,12 +154,14 @@ def _start_own_master_once(self, timeout):
                     pass
 
                 raise ExecutionError(
-                    f"Failed to connect to {self.networkservice.address} with {' '.join(args)}: return code {return_value}",  # pylint: disable=line-too-long
+                    f"Failed to connect to {self.networkservice.address} with {' '.join(args)}: return code {self.process.returncode}",  # pylint: disable=line-too-long
                     stdout=stdout,
                 )
         except subprocess.TimeoutExpired:
+            self.process.kill()
+            stdout, _ = self.process.communicate()
             raise ExecutionError(
-                f"Subprocess timed out [{subprocess_timeout}s] while executing {args}",
+                f"Subprocess timed out [{subprocess_timeout}s] while executing {args}: {stdout}",
             )
         finally:
             if self.networkservice.password and os.path.exists(pass_file):
diff --git a/tests/test_sshdriver.py b/tests/test_sshdriver.py
index 875570822..0f766dfc7 100644
--- a/tests/test_sshdriver.py
+++ b/tests/test_sshdriver.py
@@ -17,6 +17,8 @@ def ssh_driver_mocked_and_activated(target, mocker):
     instance_mock = mocker.MagicMock()
     popen.return_value = instance_mock
     instance_mock.wait = mocker.MagicMock(return_value=0)
+    instance_mock.communicate = mocker.MagicMock(return_value=(b"", b""))
+    instance_mock.returncode = 0
     SSHDriver(target, "ssh")
     s = target.get_driver("SSHDriver")
     return s
@@ -35,6 +37,8 @@ def test_create(target, mocker):
     instance_mock = mocker.MagicMock()
     popen.return_value = instance_mock
     instance_mock.wait = mocker.MagicMock(return_value=0)
+    instance_mock.communicate = mocker.MagicMock(return_value=(b"", b""))
+    instance_mock.returncode = 0
     s = SSHDriver(target, "ssh")
     assert isinstance(s, SSHDriver)