Skip to content

Commit

Permalink
[202205] Improve the cleanup of PTF container (#10523)
Browse files Browse the repository at this point in the history
Cherry-pick #10069 and #10286 to 202205 branch.

* Improve the cleanup of processes and interfaces before stopping PTF container (#10244 

What is the motivation for this PR?
We still observe issues with "testbed-cli.sh remove-topo" and "testbed-cli.sh restart-ptf":

Server may crash and run into CPU softlock issue.
Some exabgp process cannot be fully stopped and "restart-ptf" may fail.
The expectation is that remove-topo and restart-ptf can always be successful. And of course, no server crash.
Possible reason of server crash:

Some exabgp processes are still running in PTF container while we remove the container. This could cause server crash.
Some network interfaces are in the PTF container's network namespace while we remove the container.

How did you do it?
Added a customized module "ptf_control" to stop&kill processes running in PTF container in a more aggressive and reliable way.
Improve the vm_topology module to remove network interfaces from the PTF container in the "unbind" procedure.
Added a vm_topology "unbind" step in the "testbed-cli.sh restart-ptf" procedure.
Updated some "ip link" commands to fully compliant with the syntax in "ip link help".

How did you verify/test it?
Tested the add-topo/remove-topo on both physical and KVM testbed.
Tested restart-ptf on phsycial testbed.

* Avoid running command in exited ptf docker container (#10286)

While stopping PTF container, "ptf_control" module is executed to
kill all processes in the PTF container.
The original code checks if the PTF container's Pid exists before
running command in the PTF container. Unfortunately, this check
is not enough. PTF docker container in exited status still has Pid.

This change improved the code for getting PTF container's Pid.
When PTF container is not in "running" status, always return None
for PTF container's Pid.

Signed-off-by: Xin Wang <[email protected]>
  • Loading branch information
wangxin authored Oct 30, 2023
1 parent cda5cac commit 13c6b12
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 130 deletions.
141 changes: 141 additions & 0 deletions ansible/roles/vm_set/library/ptf_control.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/python

import json
import logging
import traceback

import docker

from ansible.module_utils.debug_utils import config_module_logging
from ansible.module_utils.basic import AnsibleModule

DOCUMENTATION = '''
---
module: ptf_control
version_added: "0.1"
author: Xin Wang ([email protected])
short_description: Control PTF container
description: For controlling PTF container, for example killing processes running in PTF container before stopping it.
Parameters:
- ctn_name: Name of the PTF container
- command: Command to run, currently only support "kill"
'''

EXAMPLES = '''
- name: Kill exabgp and ptf_nn_agent processes in PTF container
ptf_control:
ctn_name: "ptf_vms6-1"
command: kill
'''


class PtfControl(object):
"""This class is for controlling PTF container
"""

def __init__(self, module, ctn_name):
self.module = module
self.ctn_name = ctn_name

self.pid = PtfControl.get_pid(self.ctn_name)

def cmd(self, cmdline, use_unsafe_shell=False, ignore_failure=False, verbose=True):
rc, out, err = self.module.run_command(cmdline, use_unsafe_shell=use_unsafe_shell)
if verbose:
msg = {
'cmd': cmdline,
'rc': rc,
'stdout_lines': out.splitlines(),
'stderr_lines': err.splitlines()
}
logging.debug('***** RUN CMD:\n%s' % json.dumps(msg, indent=2))

if rc != 0 and not ignore_failure:
raise Exception("Failed to run command: %s, rc=%d, out=%s, err=%s" % (cmdline, rc, out, err))
return rc, out, err

@staticmethod
def get_pid(ctn_name):
cli = docker.from_env()
try:
ctn = cli.containers.get(ctn_name)
if ctn.status == 'running':
return ctn.attrs['State']['Pid']
except Exception as e:
logging.debug("Failed to get pid for container %s: %s" % (ctn_name, str(e)))

return None

def get_process_pids(self, process):
cmd = 'docker exec -t {} bash -c "pgrep -f \'{}\'"'.format(self.ctn_name, process)
_, out, _ = self.cmd(cmd, ignore_failure=True)
return [int(pid.strip()) for pid in out.splitlines()]

def get_supervisord_processes(self):
_, out, _ = self.cmd(
'docker exec -t {} bash -c "supervisorctl status"'.format(self.ctn_name), ignore_failure=True
)
processes = [line.strip().split()[0] for line in out.splitlines() if "sshd" not in line]
return processes

def kill_process(self, pid):
self.cmd('docker exec -t {} bash -c "kill -9 {}"'.format(self.ctn_name, pid), ignore_failure=True)

def kill_processes(self):
supervisord_processes = self.get_supervisord_processes()
self.cmd('docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name))
for i in range(3):
logging.info("=== Attempt %d ===" % (i + 1))
logging.info("=== Use supervisorctl to stop processes ===")
for process in supervisord_processes:
self.cmd(
'docker exec -t {} bash -c "supervisorctl stop {}"'.format(self.ctn_name, process),
ignore_failure=True
)
self.cmd(
'docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name)
)

for pattern in [
"/usr/share/exabgp/http_api.py",
"/usr/local/bin/exabgp",
"ptf_nn_agent.py"
]:
logging.info("=== Kill process %s ===" % pattern)
for pid in self.get_process_pids(pattern):
self.kill_process(pid)

self.cmd('docker exec -t {} bash -c "ps -ef"'.format(self.ctn_name))


def main():
module = AnsibleModule(
argument_spec=dict(
ctn_name=dict(required=True, type='str'),
command=dict(required=True, type='str')
),
supports_check_mode=False)

ctn_name = module.params['ctn_name']
command = module.params['command']
if command not in ['kill']:
module.fail_json(msg="command %s is not supported" % command)

config_module_logging('ptf_control_' + ctn_name)

try:
ptf = PtfControl(module, ctn_name)
if command == "kill":
if ptf.pid is not None:
ptf.kill_processes()
except Exception as error:
logging.error(traceback.format_exc())
module.fail_json(msg=str(error))

module.exit_json(changed=True)


if __name__ == "__main__":
main()
88 changes: 70 additions & 18 deletions ansible/roles/vm_set/library/vm_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def add_br_if_to_docker(self, bridge, ext_if, int_if):
VMTopology.iface_up(ext_if)

if VMTopology.intf_exists(tmp_int_if) and VMTopology.intf_not_exists(tmp_int_if, pid=self.pid):
VMTopology.cmd("ip link set netns %s dev %s" % (self.pid, tmp_int_if))
VMTopology.cmd("ip link set dev %s netns %s " % (tmp_int_if, self.pid))
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, tmp_int_if, int_if))

VMTopology.iface_up(int_if, pid=self.pid)
Expand All @@ -494,7 +494,7 @@ def add_br_if_to_netns(self, bridge, ext_if, int_if):
VMTopology.iface_up(ext_if)

if VMTopology.intf_exists(tmp_int_if) and VMTopology.intf_not_exists(tmp_int_if, netns=self.netns):
VMTopology.cmd("ip link set netns %s dev %s" % (self.netns, tmp_int_if))
VMTopology.cmd("ip link set dev %s netns %s" % (tmp_int_if, self.netns))
VMTopology.cmd("ip netns exec %s ip link set dev %s name %s" % (self.netns, tmp_int_if, int_if))

VMTopology.iface_up(int_if, netns=self.netns)
Expand Down Expand Up @@ -535,9 +535,9 @@ def add_ip_to_netns_if(self, int_if, ip_addr, ipv6_addr=None, default_gw=None, d
def add_dut_if_to_docker(self, iface_name, dut_iface):
logging.info("=== Add DUT interface %s to PTF docker as %s ===" % (dut_iface, iface_name))
if VMTopology.intf_exists(dut_iface) \
and VMTopology.intf_not_exists(dut_iface, pid=self.pid) \
and VMTopology.intf_not_exists(iface_name, pid=self.pid):
VMTopology.cmd("ip link set netns %s dev %s" % (self.pid, dut_iface))
and VMTopology.intf_not_exists(dut_iface, pid=self.pid) \
and VMTopology.intf_not_exists(iface_name, pid=self.pid):
VMTopology.cmd("ip link set dev %s netns %s" % (dut_iface, self.pid))

if VMTopology.intf_exists(dut_iface, pid=self.pid) and VMTopology.intf_not_exists(iface_name, pid=self.pid):
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, dut_iface, iface_name))
Expand All @@ -553,7 +553,7 @@ def add_dut_vlan_subif_to_docker(self, iface_name, vlan_separator, vlan_id):
VMTopology.cmd("nsenter -t %s -n ip link set %s up" % (self.pid, vlan_sub_iface_name))

def remove_dut_if_from_docker(self, iface_name, dut_iface):

logging.info("=== Restore docker interface %s as dut interface %s ===" % (iface_name, dut_iface))
if self.pid is None:
return

Expand All @@ -564,7 +564,8 @@ def remove_dut_if_from_docker(self, iface_name, dut_iface):
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, iface_name, dut_iface))

if VMTopology.intf_not_exists(dut_iface) and VMTopology.intf_exists(dut_iface, pid=self.pid):
VMTopology.cmd("nsenter -t %s -n ip link set netns 1 dev %s" % (self.pid, dut_iface))
VMTopology.cmd(
"nsenter -t %s -n ip link set dev %s netns 1" % (self.pid, dut_iface))

def remove_dut_vlan_subif_from_docker(self, iface_name, vlan_separator, vlan_id):
"""Remove the vlan sub interface created for the ptf interface."""
Expand All @@ -573,7 +574,9 @@ def remove_dut_vlan_subif_from_docker(self, iface_name, vlan_separator, vlan_id)

vlan_sub_iface_name = iface_name + vlan_separator + vlan_id
if VMTopology.intf_exists(vlan_sub_iface_name, pid=self.pid):
VMTopology.cmd("nsenter -t %s -n ip link del %s" % (self.pid, vlan_sub_iface_name))
VMTopology.iface_down(vlan_sub_iface_name, pid=self.pid)
VMTopology.cmd("nsenter -t %s -n ip link del %s" %
(self.pid, vlan_sub_iface_name))

def add_veth_if_to_docker(self, ext_if, int_if, create_vlan_subintf=False, **kwargs):
"""Create vethernet devices (ext_if, int_if) and put int_if into the ptf docker."""
Expand Down Expand Up @@ -618,14 +621,16 @@ def add_veth_if_to_docker(self, ext_if, int_if, create_vlan_subintf=False, **kwa
VMTopology.iface_up(ext_if)

if VMTopology.intf_exists(t_int_if) \
and VMTopology.intf_not_exists(t_int_if, pid=self.pid) \
and VMTopology.intf_not_exists(int_if, pid=self.pid):
VMTopology.cmd("ip link set netns %s dev %s" % (self.pid, t_int_if))
and VMTopology.intf_not_exists(t_int_if, pid=self.pid) \
and VMTopology.intf_not_exists(int_if, pid=self.pid):
VMTopology.cmd("ip link set dev %s netns %s" %
(t_int_if, self.pid))
if create_vlan_subintf \
and VMTopology.intf_exists(t_int_sub_if) \
and VMTopology.intf_not_exists(t_int_sub_if, pid=self.pid) \
and VMTopology.intf_not_exists(int_sub_if, pid=self.pid):
VMTopology.cmd("ip link set netns %s dev %s" % (self.pid, t_int_sub_if))
and VMTopology.intf_exists(t_int_sub_if) \
and VMTopology.intf_not_exists(t_int_sub_if, pid=self.pid) \
and VMTopology.intf_not_exists(int_sub_if, pid=self.pid):
VMTopology.cmd("ip link set dev %s netns %s" %
(t_int_sub_if, self.pid))

if VMTopology.intf_exists(t_int_if, pid=self.pid) and VMTopology.intf_not_exists(int_if, pid=self.pid):
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, t_int_if, int_if))
Expand Down Expand Up @@ -662,9 +667,10 @@ def add_veth_if_to_netns(self, ext_if, int_if):
VMTopology.iface_up(ext_if)

if VMTopology.intf_exists(t_int_if) \
and VMTopology.intf_not_exists(t_int_if, netns=self.netns) \
and VMTopology.intf_not_exists(int_if, netns=self.netns):
VMTopology.cmd("ip link set netns %s dev %s" % (self.netns, t_int_if))
and VMTopology.intf_not_exists(t_int_if, netns=self.netns) \
and VMTopology.intf_not_exists(int_if, netns=self.netns):
VMTopology.cmd("ip link set dev %s netns %s" %
(t_int_if, self.netns))

if VMTopology.intf_exists(t_int_if, netns=self.netns) and VMTopology.intf_not_exists(int_if, netns=self.netns):
VMTopology.cmd("ip netns exec %s ip link set dev %s name %s" % (self.netns, t_int_if, int_if))
Expand Down Expand Up @@ -749,6 +755,7 @@ def bind_fp_ports(self, disconnect_vm=False):
self.bind_vs_dut_ports(VS_CHASSIS_MIDPLANE_BRIDGE_NAME, self.topo['DUT']['vs_chassis']['midplane_port'])

def unbind_fp_ports(self):
logging.info("=== unbind front panel ports ===")
for attr in self.VMs.values():
for vlan_num, vlan in enumerate(attr['vlans']):
br_name = adaptive_name(OVS_FP_BRIDGE_TEMPLATE, self.vm_names[self.vm_base_index + attr['vm_offset']], vlan_num)
Expand Down Expand Up @@ -1091,6 +1098,7 @@ def remove_host_ports(self):
"""
remove dut port from the ptf docker
"""
logging.info("=== Remove host ports ===")
for i, intf in enumerate(self.host_interfaces):
if self._is_multi_duts:
if isinstance(intf, list):
Expand All @@ -1111,6 +1119,45 @@ def remove_host_ports(self):
vlan_id = self.vlan_ids[str(intf)]
self.remove_dut_vlan_subif_from_docker(ptf_if, vlan_separator, vlan_id)

def remove_veth_if_from_docker(self, ext_if, int_if, tmp_name):
"""
Remove veth interface from docker
"""
logging.info("=== Cleanup port, int_if: %s, ext_if: %s, tmp_name: %s ===" % (ext_if, int_if, tmp_name))
if VMTopology.intf_exists(int_if, pid=self.pid):
# Name it back to temp name in PTF container to avoid potential conflicts
VMTopology.iface_down(int_if, pid=self.pid)
VMTopology.cmd("nsenter -t %s -n ip link set dev %s name %s" % (self.pid, int_if, tmp_name))
# Set it to default namespace
VMTopology.cmd("nsenter -t %s -n ip link set dev %s netns 1" % (self.pid, tmp_name))

# Delete its peer in default namespace
if VMTopology.intf_exists(ext_if):
VMTopology.cmd("ip link delete dev %s" % ext_if)

def remove_ptf_mgmt_port(self):
ext_if = PTF_MGMT_IF_TEMPLATE % self.vm_set_name
tmp_name = MGMT_PORT_NAME + VMTopology._generate_fingerprint(ext_if, MAX_INTF_LEN-len(MGMT_PORT_NAME))
self.remove_veth_if_from_docker(ext_if, MGMT_PORT_NAME, tmp_name)

def remove_ptf_backplane_port(self):
ext_if = PTF_BP_IF_TEMPLATE % self.vm_set_name
tmp_name = BP_PORT_NAME + VMTopology._generate_fingerprint(ext_if, MAX_INTF_LEN-len(BP_PORT_NAME))
self.remove_veth_if_from_docker(ext_if, BP_PORT_NAME, tmp_name)

def remove_injected_fp_ports_from_docker(self):
for vm, vlans in self.injected_fp_ports.items():
for vlan in vlans:
(_, _, ptf_index) = VMTopology.parse_vm_vlan_port(vlan)
ext_if = adaptive_name(INJECTED_INTERFACES_TEMPLATE, self.vm_set_name, ptf_index)
int_if = PTF_FP_IFACE_TEMPLATE % ptf_index
properties = self.vm_properties.get(vm, {})
create_vlan_subintf = properties.get('device_type') in (
BACKEND_TOR_TYPE, BACKEND_LEAF_TYPE)
if not create_vlan_subintf:
tmp_name = int_if + VMTopology._generate_fingerprint(ext_if, MAX_INTF_LEN-len(int_if))
self.remove_veth_if_from_docker(ext_if, int_if, tmp_name)

@staticmethod
def _generate_fingerprint(name, digit=6):
"""
Expand Down Expand Up @@ -1667,10 +1714,14 @@ def main():
if vms_exists:
net.unbind_vm_backplane()
net.unbind_fp_ports()
net.remove_injected_fp_ports_from_docker()

if hostif_exists:
net.remove_host_ports()

net.remove_ptf_mgmt_port()
net.remove_ptf_backplane_port()

if net.netns:
net.unbind_mgmt_port(NETNS_MGMT_IF_TEMPLATE % net.vm_set_name)
net.delete_network_namespace()
Expand Down Expand Up @@ -1730,6 +1781,7 @@ def main():
net.unbind_fp_ports()
net.add_injected_fp_ports_to_docker()
net.bind_fp_ports()
net.bind_vm_backplane()
net.add_bp_port_to_docker(ptf_bp_ip_addr, ptf_bp_ipv6_addr)

if net.netns:
Expand Down
Loading

0 comments on commit 13c6b12

Please sign in to comment.