From 91b475852e0ecb1c6819a8c26a319b6d04620a5e Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Tue, 8 Aug 2023 22:38:44 -0400 Subject: [PATCH] code: fixes for cleanup of qemu/container nodes and session creation --- munet/base.py | 26 ++++++++++++++++++-------- munet/native.py | 45 +++++++++++++++++++++++++++------------------ pyproject.toml | 2 +- 3 files changed, 46 insertions(+), 27 deletions(-) diff --git a/munet/base.py b/munet/base.py index 15e8158..6bcd547 100644 --- a/munet/base.py +++ b/munet/base.py @@ -865,14 +865,18 @@ async def async_cleanup_proc(self, p, pid=None): else: o, e = await p.communicate() self.logger.debug( - "%s: cmd_p already exited status: %s", self, proc_error(p, o, e) + "%s: [cleanup_proc] proc already exited status: %s", + self, + proc_error(p, o, e), ) return None if pid is None: pid = p.pid - self.logger.debug("%s: terminate process: %s (pid %s)", self, proc_str(p), pid) + self.logger.debug( + "%s: [cleanup_proc] terminate process: %s (pid %s)", self, proc_str(p), pid + ) try: # This will SIGHUP and wait a while then SIGKILL and return immediately await self.cleanup_pid(p.pid, pid) @@ -885,14 +889,19 @@ async def async_cleanup_proc(self, p, pid=None): else: o, e = await asyncio.wait_for(p.communicate(), timeout=wait_secs) self.logger.debug( - "%s: cmd_p exited after kill, status: %s", self, proc_error(p, o, e) + "%s: [cleanup_proc] exited after kill, status: %s", + self, + proc_error(p, o, e), ) except (asyncio.TimeoutError, subprocess.TimeoutExpired): - self.logger.warning("%s: SIGKILL timeout", self) + self.logger.warning("%s: [cleanup_proc] SIGKILL timeout", self) return p except Exception as error: self.logger.warning( - "%s: kill unexpected exception: %s", self, error, exc_info=True + "%s: [cleanup_proc] kill unexpected exception: %s", + self, + error, + exc_info=True, ) return p return None @@ -2005,8 +2014,9 @@ def __init__( stdout=stdout, stderr=stderr, text=True, - start_new_session=not unet, + # start_new_session=not unet shell=False, + preexec_fn=os.setsid if not unet else None, ) # The pid number returned is in the global pid namespace. For unshare_inline @@ -2345,14 +2355,14 @@ async def _async_delete(self): and self.pid != our_pid ): self.logger.debug( - "cleanup pid on separate pid %s from proc pid %s", + "cleanup separate pid %s from namespace proc pid %s", self.pid, self.p.pid if self.p else None, ) await self.cleanup_pid(self.pid) if self.p is not None: - self.logger.debug("cleanup proc pid %s", self.p.pid) + self.logger.debug("cleanup namespace proc pid %s", self.p.pid) await self.async_cleanup_proc(self.p) # return to the previous namespace, need to do this in case anothe munet diff --git a/munet/native.py b/munet/native.py index 03408c4..f3cb522 100644 --- a/munet/native.py +++ b/munet/native.py @@ -1025,7 +1025,7 @@ async def add_phy_intf(self, devaddr, lname): ) self.phy_odrivers[devaddr] = driver self.unet.rootcmd.cmd_raises( - f"echo {devaddr} > /sys/bus/pci/drivers/{driver}/unbind" + f"echo {devaddr} | timeout 10 tee /sys/bus/pci/drivers/{driver}/unbind" ) # Add the device vendor and device id to vfio-pci in case it's the first time @@ -1067,7 +1067,7 @@ async def rem_phy_intf(self, devaddr): "Unbinding physical PCI device %s from driver vfio-pci", devaddr ) self.unet.rootcmd.cmd_status( - f"echo {devaddr} > /sys/bus/pci/drivers/vfio-pci/unbind" + f"echo {devaddr} | timeout 10 tee /sys/bus/pci/drivers/vfio-pci/unbind" ) self.logger.info("Binding physical PCI device %s to driver %s", devaddr, driver) @@ -1086,13 +1086,13 @@ async def _async_delete(self): for hname in list(self.host_intfs): await self.rem_host_intf(hname) - # remove any hostintf interfaces - for devaddr in list(self.phy_intfs): - await self.rem_phy_intf(devaddr) - # delete the LinuxNamespace/InterfaceMixin await super()._async_delete() + # remove any hostintf interfaces, needs to come after normal exits + for devaddr in list(self.phy_intfs): + await self.rem_phy_intf(devaddr) + class L3NamespaceNode(L3NodeMixin, LinuxNamespace): """A namespace L3 node.""" @@ -1124,6 +1124,7 @@ def __init__(self, name, config, **kwargs): assert self.container_image self.cmd_p = None + self.cmd_pid = None self.__base_cmd = [] self.__base_cmd_pty = [] @@ -1394,7 +1395,13 @@ async def run_cmd(self): start_new_session=True, # keeps main tty signals away from podman ) - self.logger.debug("%s: async_popen => %s", self, self.cmd_p.pid) + # If our process is actually the child of an nsenter fetch its pid. + if self.nsenter_fork: + self.cmd_pid = await self.get_proc_child_pid(self.cmd_p) + + self.logger.debug( + "%s: async_popen => %s (%s)", self, self.cmd_p.pid, self.cmd_pid + ) self.pytest_hook_run_cmd(stdout, stderr) @@ -1543,6 +1550,7 @@ def __init__(self, name, config, **kwargs): """Create a Container Node.""" self.cont_exec_paths = {} self.launch_p = None + self.launch_pid = None self.qemu_config = config["qemu"] self.extra_mounts = [] assert self.qemu_config @@ -2262,7 +2270,7 @@ async def launch(self): stdout = open(os.path.join(self.rundir, "qemu.out"), "wb") stderr = open(os.path.join(self.rundir, "qemu.err"), "wb") - self.launch_p = await self.async_popen( + self.launch_p = await self.async_popen_nsonly( args, stdin=subprocess.DEVNULL, stdout=stdout, @@ -2271,16 +2279,22 @@ async def launch(self): # We don't need this here b/c we are only ever running qemu and that's all # we need to kill for cleanup # XXX reconcile this - start_new_session=True, # allows us to signal all children to exit + # start_new_session=True, # allows us to signal all children to exit + preexec_fn=os.setsid, ) + if self.nsenter_fork: + self.launch_pid = await self.get_proc_child_pid(self.launch_p) + self.pytest_hook_run_cmd(stdout, stderr) # We've passed these on, so don't need these open here anymore. for fd in pass_fds: os.close(fd) - self.logger.debug("%s: async_popen => %s", self, self.launch_p.pid) + self.logger.debug( + "%s: popen => %s (%s)", self, self.launch_p.pid, self.launch_pid + ) confiles = ["_console"] if use_cmdcon: @@ -2349,11 +2363,6 @@ def launch_completed(self, future): "%s: node launch (qemu) cmd wait() canceled: %s", future, error ) - async def cleanup_qemu(self): - """Launch qemu.""" - if self.launch_p: - await self.async_cleanup_proc(self.launch_p) - async def async_cleanup_cmd(self): """Run the configured cleanup commands for this node.""" self.cleanup_called = True @@ -2373,7 +2382,7 @@ async def _async_delete(self): # Need to cleanup early b/c it is running on the VM if self.cmd_p: - await self.async_cleanup_proc(self.cmd_p) + await self.async_cleanup_proc(self.cmd_p, self.cmd_pid) self.cmd_p = None try: @@ -2389,9 +2398,9 @@ async def _async_delete(self): if not self.launch_p: self.logger.warning("async_delete: qemu is not running") else: - await self.cleanup_qemu() + await self.async_cleanup_proc(self.launch_p, self.launch_pid) except Exception as error: - self.logger.warning("%s: failued to cleanup qemu process: %s", self, error) + self.logger.warning("%s: failed to cleanup qemu process: %s", self, error) await super()._async_delete() diff --git a/pyproject.toml b/pyproject.toml index 444f911..ef9092a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "munet" -version = "0.13.7" +version = "0.13.8" description = "A package to facilitate network simulations" authors = ["Christian Hopps "] license = "GPL-2.0-or-later"