diff --git a/scripts/fast-reboot b/scripts/fast-reboot index 16ba9e4f23..501e360d89 100755 --- a/scripts/fast-reboot +++ b/scripts/fast-reboot @@ -15,6 +15,7 @@ STRICT=no REBOOT_METHOD="/sbin/kexec -e" ASSISTANT_IP_LIST="" ASSISTANT_SCRIPT="/usr/local/bin/neighbor_advertiser" +LAG_KEEPALIVE_SCRIPT="/usr/local/bin/lag_keepalive.py" WATCHDOG_UTIL="/usr/local/bin/watchdogutil" DEVPATH="/usr/share/sonic/device" PLATFORM=$(sonic-cfggen -H -v DEVICE_METADATA.localhost.platform) @@ -594,6 +595,13 @@ set +e # disable trap-handlers which were set before trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM +# start sending LACPDUs to keep the LAGs refreshed +# this is a non-blocking call, and the process will die in 300s +debug "Starting lag_keepalive to send LACPDUs ..." +timeout 300 python3 ${LAG_KEEPALIVE_SCRIPT} & +# give the lag_keepalive script a chance to get ready (30s) and collect one lacpdu before going down (30s) +sleep 60 + if [ -x ${LOG_SSD_HEALTH} ]; then debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..." 
"""Keep LAGs refreshed across a reboot by replaying captured LACPDUs.

For every member port of an operationally-up LAG, one outgoing LACPDU
(ether proto 0x8809 with the device MAC as source) is sniffed, and the
captured frames are then re-sent once per second.

scripts/fast-reboot starts this script in the background with
``timeout 300 python3 ${LAG_KEEPALIVE_SCRIPT} &``; the send loop has no exit
condition of its own and relies on being killed from outside.
"""
import syslog
import threading
import time
import traceback

# NOTE(review): ipv6_enabled is cleared before scapy.all is imported, as in
# the original file; presumably this must happen first - keep the order.
from scapy.config import conf
conf.ipv6_enabled = False
from scapy.all import sendp, sniff
from swsssdk import ConfigDBConnector

SYSLOG_ID = 'lag_keepalive'

# Seconds to wait for one LACPDU on a member port, and for a sniffer thread.
SNIFF_TIMEOUT_SEC = 30
# Seconds between two rounds of LACPDU transmission.
SEND_INTERVAL_SEC = 1
# Seconds to back off before retrying after a failed capture attempt.
RETRY_INTERVAL_SEC = 1


def log_info(msg):
    """Write ``msg`` to syslog at LOG_INFO under the SYSLOG_ID identity."""
    syslog.openlog(SYSLOG_ID)
    syslog.syslog(syslog.LOG_INFO, msg)
    syslog.closelog()


def log_error(msg):
    """Write ``msg`` to syslog at LOG_ERR under the SYSLOG_ID identity."""
    syslog.openlog(SYSLOG_ID)
    syslog.syslog(syslog.LOG_ERR, msg)
    syslog.closelog()


def sniff_lacpdu(device_mac, lag_member, lag_member_to_packet):
    """Capture one LACPDU sent from ``device_mac`` on ``lag_member``.

    Runs as a thread target. On success the captured packet list is stored
    in ``lag_member_to_packet[lag_member]``. When nothing is captured within
    the timeout, or sniffing fails, no entry is written, so the caller can
    tell which members are missing by comparing against its member list.
    """
    try:
        sniffed_packet = sniff(iface=lag_member,
                               filter="ether proto 0x8809 and ether src {}".format(device_mac),
                               count=1, timeout=SNIFF_TIMEOUT_SEC)
    except Exception:
        # An uncaught exception in a thread never reaches syslog; report it
        # here so a failed capture is visible to the operator.
        log_error("Failed to sniff LACPDU packet on interface {} with error: {}".format(
            lag_member, traceback.format_exc()))
        return

    # A timed-out sniff yields an empty packet list. Recording it would hide
    # the failure and make the sender transmit nothing for this member.
    if len(sniffed_packet) > 0:
        lag_member_to_packet[lag_member] = sniffed_packet


def get_lacpdu_per_lag_member():
    """Sniff one LACPDU for every member of every operationally-up LAG.

    Reads LAG_MEMBER_TABLE keys and each LAG's ``oper_status`` from APPL_DB
    and the device MAC from CONFIG_DB, then sniffs all active members in
    parallel threads.

    Returns:
        A tuple ``(active_lag_members, lag_member_to_packet)``: the list of
        member port names belonging to LAGs whose oper_status is "up", and a
        dict mapping member port name to its captured packet list. Members
        without a capture are absent from the dict.
    """
    app_db = ConfigDBConnector()
    app_db.db_connect('APPL_DB')
    lag_member_keys = app_db.get_keys('LAG_MEMBER_TABLE')

    config_db = ConfigDBConnector()
    config_db.db_connect('CONFIG_DB')
    device_mac = config_db.get(config_db.CONFIG_DB, "DEVICE_METADATA|localhost", "mac")

    active_lag_members = []
    lag_member_to_packet = {}
    sniffer_threads = []
    for lag_entry in lag_member_keys:
        # Each key is split on ":" into LAG name and member port name.
        lag_data = lag_entry.split(":")
        lag_name = str(lag_data[0])
        oper_status = app_db.get(app_db.APPL_DB, "LAG_TABLE:{}".format(lag_name), "oper_status")
        if oper_status != "up":
            # only apply the workaround for active lags
            continue

        lag_member = str(lag_data[1])
        active_lag_members.append(lag_member)
        # use threading to capture lacpdus from several lag members simultaneously
        sniffer_thread = threading.Thread(target=sniff_lacpdu,
                                          args=(device_mac, lag_member, lag_member_to_packet))
        sniffer_thread.start()
        sniffer_threads.append(sniffer_thread)

    # Sniffing should finish within the sniff timeout, so wait that long for
    # each thread.
    for sniffer in sniffer_threads:
        sniffer.join(timeout=SNIFF_TIMEOUT_SEC)

    # join() may time out while a sniffer is still running. Hand back a
    # snapshot so a late insert cannot change the dict while the sender is
    # iterating over it.
    return active_lag_members, dict(lag_member_to_packet)


def lag_keepalive(lag_member_to_packet):
    """Re-send the captured LACPDUs on their member ports forever.

    Every round sends each member's packet list on that member, logs the
    round, and sleeps. A send failure on one member is logged and does not
    stop the other members or later rounds. This function never returns.
    """
    while True:
        for lag_member, packet in lag_member_to_packet.items():
            try:
                sendp(packet, iface=lag_member, verbose=False)
            except Exception:
                # log failure and continue to send lacpdu
                log_error("Failed to send LACPDU packet from interface {} with error: {}".format(
                    lag_member, traceback.format_exc()))
        # list(...) so the log shows plain port names, not "dict_keys([...])".
        log_info("sent LACPDU packets via {}".format(list(lag_member_to_packet)))
        time.sleep(SEND_INTERVAL_SEC)


def main():
    """Capture LACPDUs for the active LAG members, then replay them.

    The capture step is retried until it completes without raising. If any
    packets were captured, the endless send loop is entered.
    """
    while True:
        try:
            active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member()
        except Exception:
            log_error("Failed to get LAG members and LACPDUs with error: {}".format(
                traceback.format_exc()))
            # keep attempting until sniffed packets are ready, but back off
            # so a persistent failure does not busy-loop and flood syslog
            time.sleep(RETRY_INTERVAL_SEC)
            continue
        # no exception was thrown: the capture step is done
        break

    captured_members = list(lag_member_to_packet)
    if len(active_lag_members) != len(captured_members):
        log_error(("Failed to capture LACPDU packets for some lag members. "
                   "Active lag members: {}. LACPDUs captured for: {}").format(
                       active_lag_members, captured_members))

    log_info("ready to send LACPDU packets via {}".format(captured_members))

    if lag_member_to_packet:
        # start an infinite loop to keep sending lacpdus from lag member ports
        lag_keepalive(lag_member_to_packet)
    else:
        log_info("no LACPDU packets were captured, nothing to send")


if __name__ == "__main__":
    main()