Skip to content

Commit

Permalink
code: fixes for cleanup of qemu/container nodes and session creation
Browse files Browse the repository at this point in the history
  • Loading branch information
choppsv1 committed Aug 9, 2023
1 parent dd77e3e commit 91b4758
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 27 deletions.
26 changes: 18 additions & 8 deletions munet/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,14 +865,18 @@ async def async_cleanup_proc(self, p, pid=None):
else:
o, e = await p.communicate()
self.logger.debug(
"%s: cmd_p already exited status: %s", self, proc_error(p, o, e)
"%s: [cleanup_proc] proc already exited status: %s",
self,
proc_error(p, o, e),
)
return None

if pid is None:
pid = p.pid

self.logger.debug("%s: terminate process: %s (pid %s)", self, proc_str(p), pid)
self.logger.debug(
"%s: [cleanup_proc] terminate process: %s (pid %s)", self, proc_str(p), pid
)
try:
# This will SIGHUP and wait a while then SIGKILL and return immediately
await self.cleanup_pid(p.pid, pid)
Expand All @@ -885,14 +889,19 @@ async def async_cleanup_proc(self, p, pid=None):
else:
o, e = await asyncio.wait_for(p.communicate(), timeout=wait_secs)
self.logger.debug(
"%s: cmd_p exited after kill, status: %s", self, proc_error(p, o, e)
"%s: [cleanup_proc] exited after kill, status: %s",
self,
proc_error(p, o, e),
)
except (asyncio.TimeoutError, subprocess.TimeoutExpired):
self.logger.warning("%s: SIGKILL timeout", self)
self.logger.warning("%s: [cleanup_proc] SIGKILL timeout", self)
return p
except Exception as error:
self.logger.warning(
"%s: kill unexpected exception: %s", self, error, exc_info=True
"%s: [cleanup_proc] kill unexpected exception: %s",
self,
error,
exc_info=True,
)
return p
return None
Expand Down Expand Up @@ -2005,8 +2014,9 @@ def __init__(
stdout=stdout,
stderr=stderr,
text=True,
start_new_session=not unet,
# start_new_session=not unet
shell=False,
preexec_fn=os.setsid if not unet else None,
)

# The pid number returned is in the global pid namespace. For unshare_inline
Expand Down Expand Up @@ -2345,14 +2355,14 @@ async def _async_delete(self):
and self.pid != our_pid
):
self.logger.debug(
"cleanup pid on separate pid %s from proc pid %s",
"cleanup separate pid %s from namespace proc pid %s",
self.pid,
self.p.pid if self.p else None,
)
await self.cleanup_pid(self.pid)

if self.p is not None:
self.logger.debug("cleanup proc pid %s", self.p.pid)
self.logger.debug("cleanup namespace proc pid %s", self.p.pid)
await self.async_cleanup_proc(self.p)

# return to the previous namespace, need to do this in case anothe munet
Expand Down
45 changes: 27 additions & 18 deletions munet/native.py
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,7 @@ async def add_phy_intf(self, devaddr, lname):
)
self.phy_odrivers[devaddr] = driver
self.unet.rootcmd.cmd_raises(
f"echo {devaddr} > /sys/bus/pci/drivers/{driver}/unbind"
f"echo {devaddr} | timeout 10 tee /sys/bus/pci/drivers/{driver}/unbind"
)

# Add the device vendor and device id to vfio-pci in case it's the first time
Expand Down Expand Up @@ -1067,7 +1067,7 @@ async def rem_phy_intf(self, devaddr):
"Unbinding physical PCI device %s from driver vfio-pci", devaddr
)
self.unet.rootcmd.cmd_status(
f"echo {devaddr} > /sys/bus/pci/drivers/vfio-pci/unbind"
f"echo {devaddr} | timeout 10 tee /sys/bus/pci/drivers/vfio-pci/unbind"
)

self.logger.info("Binding physical PCI device %s to driver %s", devaddr, driver)
Expand All @@ -1086,13 +1086,13 @@ async def _async_delete(self):
for hname in list(self.host_intfs):
await self.rem_host_intf(hname)

# remove any hostintf interfaces
for devaddr in list(self.phy_intfs):
await self.rem_phy_intf(devaddr)

# delete the LinuxNamespace/InterfaceMixin
await super()._async_delete()

# remove any hostintf interfaces, needs to come after normal exits
for devaddr in list(self.phy_intfs):
await self.rem_phy_intf(devaddr)


class L3NamespaceNode(L3NodeMixin, LinuxNamespace):
"""A namespace L3 node."""
Expand Down Expand Up @@ -1124,6 +1124,7 @@ def __init__(self, name, config, **kwargs):
assert self.container_image

self.cmd_p = None
self.cmd_pid = None
self.__base_cmd = []
self.__base_cmd_pty = []

Expand Down Expand Up @@ -1394,7 +1395,13 @@ async def run_cmd(self):
start_new_session=True, # keeps main tty signals away from podman
)

self.logger.debug("%s: async_popen => %s", self, self.cmd_p.pid)
# If our process is actually the child of an nsenter fetch its pid.
if self.nsenter_fork:
self.cmd_pid = await self.get_proc_child_pid(self.cmd_p)

self.logger.debug(
"%s: async_popen => %s (%s)", self, self.cmd_p.pid, self.cmd_pid
)

self.pytest_hook_run_cmd(stdout, stderr)

Expand Down Expand Up @@ -1543,6 +1550,7 @@ def __init__(self, name, config, **kwargs):
"""Create a Container Node."""
self.cont_exec_paths = {}
self.launch_p = None
self.launch_pid = None
self.qemu_config = config["qemu"]
self.extra_mounts = []
assert self.qemu_config
Expand Down Expand Up @@ -2262,7 +2270,7 @@ async def launch(self):

stdout = open(os.path.join(self.rundir, "qemu.out"), "wb")
stderr = open(os.path.join(self.rundir, "qemu.err"), "wb")
self.launch_p = await self.async_popen(
self.launch_p = await self.async_popen_nsonly(
args,
stdin=subprocess.DEVNULL,
stdout=stdout,
Expand All @@ -2271,16 +2279,22 @@ async def launch(self):
# We don't need this here b/c we are only ever running qemu and that's all
# we need to kill for cleanup
# XXX reconcile this
start_new_session=True, # allows us to signal all children to exit
# start_new_session=True, # allows us to signal all children to exit
preexec_fn=os.setsid,
)

if self.nsenter_fork:
self.launch_pid = await self.get_proc_child_pid(self.launch_p)

self.pytest_hook_run_cmd(stdout, stderr)

# We've passed these on, so don't need these open here anymore.
for fd in pass_fds:
os.close(fd)

self.logger.debug("%s: async_popen => %s", self, self.launch_p.pid)
self.logger.debug(
"%s: popen => %s (%s)", self, self.launch_p.pid, self.launch_pid
)

confiles = ["_console"]
if use_cmdcon:
Expand Down Expand Up @@ -2349,11 +2363,6 @@ def launch_completed(self, future):
"%s: node launch (qemu) cmd wait() canceled: %s", future, error
)

async def cleanup_qemu(self):
"""Launch qemu."""
if self.launch_p:
await self.async_cleanup_proc(self.launch_p)

async def async_cleanup_cmd(self):
"""Run the configured cleanup commands for this node."""
self.cleanup_called = True
Expand All @@ -2373,7 +2382,7 @@ async def _async_delete(self):

# Need to cleanup early b/c it is running on the VM
if self.cmd_p:
await self.async_cleanup_proc(self.cmd_p)
await self.async_cleanup_proc(self.cmd_p, self.cmd_pid)
self.cmd_p = None

try:
Expand All @@ -2389,9 +2398,9 @@ async def _async_delete(self):
if not self.launch_p:
self.logger.warning("async_delete: qemu is not running")
else:
await self.cleanup_qemu()
await self.async_cleanup_proc(self.launch_p, self.launch_pid)
except Exception as error:
self.logger.warning("%s: failued to cleanup qemu process: %s", self, error)
self.logger.warning("%s: failed to cleanup qemu process: %s", self, error)

await super()._async_delete()

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "munet"
version = "0.13.7"
version = "0.13.8"
description = "A package to facilitate network simulations"
authors = ["Christian Hopps <[email protected]>"]
license = "GPL-2.0-or-later"
Expand Down

0 comments on commit 91b4758

Please sign in to comment.