diff --git a/common/OPexpect.py b/common/OPexpect.py
index da21f5f64..987c8ef05 100644
--- a/common/OPexpect.py
+++ b/common/OPexpect.py
@@ -164,3 +164,20 @@ def expect(self, pattern, timeout=-1, searchwindowsize=-1, async=False):
                 raise PlatformError(state, log)
 
         return r - len(op_patterns)
+
+    def expect_no_fail(self, pattern, timeout=-1, searchwindowsize=-1, async=False):
+        '''
+        Wait for pattern without the crash handling done by expect().
+
+        This calls the parent class expect() directly with the caller's
+        pattern, so nothing here raises PlatformError. This is useful for
+        tests that do not consider a panic or OPAL TI a test failure: it
+        allows such testcases to take control and look for a specific
+        pattern in a system crash scenario.
+
+        The async argument is accepted for signature parity with expect()
+        and is not used.
+        '''
+        return super(spawn, self).expect(pattern,
+                                         timeout=timeout,
+                                         searchwindowsize=searchwindowsize)
diff --git a/common/OpTestConstants.py b/common/OpTestConstants.py
index f3ebf4da5..ba907cc2a 100644
--- a/common/OpTestConstants.py
+++ b/common/OpTestConstants.py
@@ -266,6 +266,8 @@ class OpTestConstants():
     TFMR_DEC_PARITY_ERROR = "0006080000000000"
     TFMR_PURR_PARITY_ERROR = "0004080000000000"
     TFMR_SPURR_PARITY_ERROR = "0005080000000000"
+    HMI_TOD_TOPOLOGY_FAILOVER = 7
+    OPAL_TI = 8
 
     # CPU sleep states constants
     GET_CPU_SLEEP_STATE2 = "cat /sys/devices/system/cpu/cpu*/cpuidle/state2/disable"
diff --git a/testcases/OpTestHMIHandling.py b/testcases/OpTestHMIHandling.py
index 846499d09..df341cca3 100644
--- a/testcases/OpTestHMIHandling.py
+++ b/testcases/OpTestHMIHandling.py
@@ -106,6 +106,41 @@ def clear_stop(self):
         else:
             self.assertTrue(False, "OpTestHMIHandling failed to recover from previous OpSystemState.UNKNOWN_BAD")
 
+    def _wait_for_reboot(self, fail_msg):
+        '''
+        Wait for the system to start IPLing, then bring it back to the OS.
+
+        Watches the console for "ISTEP" for up to 120 seconds through
+        expect_no_fail(). On a match the system state is set to IPLing and
+        driven to OS; on timeout or EOF the test fails with fail_msg.
+        '''
+        rc = self.cv_SYSTEM.console.pty.expect_no_fail(["ISTEP", pexpect.TIMEOUT, pexpect.EOF], timeout=120)
+        if rc != 0:
+            self.fail(fail_msg)
+        self.cv_SYSTEM.set_state(OpSystemState.IPLing)
+        self.cv_SYSTEM.goto_state(OpSystemState.OS)
+
+    def handle_panic(self):
+        '''
+        Expect the unrecoverable HMI kernel panic followed by a reboot.
+
+        Fails the test if the panic message is not seen within 120 seconds
+        or if the system does not start IPLing afterwards.
+        '''
+        rc = self.cv_SYSTEM.console.pty.expect_no_fail(["Kernel panic - not syncing: Unrecoverable HMI exception", pexpect.TIMEOUT, pexpect.EOF], timeout=120)
+        if rc != 0:
+            self.fail("OpTestHMIHandling: No panic after topology recovery failure")
+        self._wait_for_reboot("OpTestHMIHandling: System failing to reboot after topology recovery failure")
+
+    def handle_OpalTI(self):
+        '''
+        Expect the system to reboot after an OPAL TI.
+
+        Fails the test if "ISTEP" is not seen on the console within 120
+        seconds.
+        '''
+        self._wait_for_reboot("System failed to reboot after OPAL TI")
+
     def handle_ipl(self):
         rc = self.cv_SYSTEM.console.pty.expect(["ISTEP", "istep", pexpect.TIMEOUT, pexpect.EOF], timeout=180)
         log.debug("before={}".format(self.cv_SYSTEM.console.pty.before))
@@ -218,6 +253,65 @@ def form_scom_addr(self, addr, core):
         log.debug(val)
         return val
 
+    def is_node_present(self, node):
+        '''
+        Check whether the specified device tree node is present.
+
+        Runs "ls <node>" on the host console. Returns 1 when the
+        command succeeds and 0 when it raises CommandFailed.
+        '''
+        self.cv_SYSTEM.goto_state(OpSystemState.OS)
+        l_cmd = "ls %s" % node
+        try:
+            self.cv_HOST.host_run_command(l_cmd, console=1)
+        except CommandFailed:
+            # Node is not present.
+            return 0
+        return 1
+
+    def get_OpalSwXstop(self):
+        '''
+        Query the opal-sw-xstop option in the ibm,skiboot nvram partition.
+
+        Returns the output of the nvram command. On a fresh system the
+        option isn't set and the command exits with exitcode = 255; then:
+        On power8 we treat this as enabled and return "enable".
+        On power9 we treat this as disabled and return "disable".
+        Any other command failure, or an unset option on another processor
+        generation, fails the test.
+        '''
+        # Reach the OS before running any command on the host.
+        self.cv_SYSTEM.goto_state(OpSystemState.OS)
+        self.proc_gen = self.cv_HOST.host_get_proc_gen(console=1)
+        try:
+            return self.cv_HOST.host_run_command("nvram -p ibm,skiboot --print-config=opal-sw-xstop", console=1)
+        except CommandFailed as cf:
+            if cf.exitcode != 255:
+                self.fail("get_OpalSwXstop() failed to query nvram.")
+            if self.proc_gen in ["POWER8", "POWER8E"]:
+                return "enable"
+            if self.proc_gen in ["POWER9"]:
+                return "disable"
+            # No default is known for this processor generation; fail here
+            # instead of falling through with nothing to return.
+            self.fail("get_OpalSwXstop(): opal-sw-xstop is not set and no default is known for %s" % self.proc_gen)
+
+    def set_OpalSwXstop(self, val):
+        '''
+        Set the opal-sw-xstop nvram option to val unless already set.
+
+        The option is read back after the update; the test fails if the
+        value read back does not contain val.
+        '''
+        self.cv_SYSTEM.goto_state(OpSystemState.OS)
+        if val in self.get_OpalSwXstop():
+            return
+
+        l_cmd = "nvram -p ibm,skiboot --update-config opal-sw-xstop=%s" % val
+        self.cv_HOST.host_run_command(l_cmd, console=1)
+        if val not in self.get_OpalSwXstop():
+            self.fail("Failed to set opal-sw-xstop config to %s" % val)
+
     def clearGardEntries(self):
         self.cv_SYSTEM.goto_state(OpSystemState.OS)
         expect_prompt = self.cv_SYSTEM.util.build_prompt()
@@ -374,6 +468,10 @@ def _testHMIHandling(self, i_test):
             self._testTFMR_Errors(BMC_CONST.TFMR_DEC_PARITY_ERROR)
             self._testTFMR_Errors(BMC_CONST.TFMR_PURR_PARITY_ERROR)
             self._testTFMR_Errors(BMC_CONST.TFMR_SPURR_PARITY_ERROR)
+        elif l_test == BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER:
+            self._test_tod_topology_failover()
+        elif l_test == BMC_CONST.OPAL_TI:
+            self._test_opal_ti()
         else:
             raise Exception("Please provide valid test case")
         l_con.run_command("dmesg -C")
@@ -494,6 +592,71 @@ def _test_malfunction_alert(self):
         console.pty.sendline(l_cmd)
         self.handle_ipl()
 
+    def _test_tod_topology_failover(self):
+        '''
+        This function is used to test error path for hmi TOD topology failover.
+        On HMI recovery failure TOD/TB goes in invalid state and stops running.
+        In this case kernel should either
+        a) panic followed by clean reboot. (For opal-sw-xstop=disable)
+        OR
+        b) cause OPAL TI by triggering sw checkstop to OCC. (For
+           opal-sw-xstop=enable)
+
+        In both cases we should not see any hangs at Linux OS level.
+        To simulate error condition inject TOD topology failover on all the
+        chips until we see HMI failure.
+        '''
+        scom_addr = "0x40000"
+        l_error = "0x4000000000000000"
+
+        # The current opal-sw-xstop setting decides which outcome to expect.
+        l_test_mode = "TI"
+        if "disable" in self.get_OpalSwXstop():
+            l_test_mode = "panic"
+
+        console = self.cv_SYSTEM.console
+        # Inject on every chip from a single command line.
+        l_cmd = "".join(
+            "PATH=/usr/local/sbin:$PATH putscom -c %s %s %s; " % (l_pair[0], scom_addr, l_error)
+            for l_pair in self.l_dic)
+
+        console.pty.sendline(l_cmd)
+        if l_test_mode == "panic":
+            self.handle_panic()
+        else:
+            self.handle_OpalTI()
+
+    def ppc_bit(self, bit):
+        ''' Return the 64-bit mask for bit, where bit 0 is the most significant bit. '''
+        return 0x8000000000000000 >> bit
+
+    def _test_opal_ti(self):
+        '''
+        This function is used to test OPAL TI functionality.
+
+        The last line of "lsprop /proc/device-tree/ibm,sw-checkstop-fir" is
+        split into a scom address and a bit number (parsed as hex). The
+        mask for that bit is written to the scom address with putscom on
+        each chip in self.l_dic, and a reboot is expected after each write.
+        '''
+        lsprop_output = self.cv_HOST.host_run_command("lsprop /proc/device-tree/ibm,sw-checkstop-fir | tail -n 1")
+        saddr, bit = str(lsprop_output[0]).split()
+        scom_addr = "0x%s" % saddr
+        bit = int(bit, 16)
+
+        l_error = "0x%016x" % self.ppc_bit(bit)
+
+        log.debug("lsprop = %s = %d" % (scom_addr, bit))
+        console = self.cv_SYSTEM.console
+
+        # NOTE(review): loop nesting was reconstructed; the reboot wait is
+        # assumed to follow every injection - confirm against the author's tree.
+        for l_pair in self.l_dic:
+            l_chip = l_pair[0]
+            l_cmd = "PATH=/usr/local/sbin:$PATH putscom -c %s %s %s" % (l_chip, scom_addr, l_error)
+            console.pty.sendline(l_cmd)
+            self.handle_OpalTI()
+
     def _test_hyp_resource_err(self):
         '''
         This function is used to test HMI: Hypervisor resource error
@@ -650,6 +813,27 @@ def runTest(self):
         self._testHMIHandling(BMC_CONST.HMI_MALFUNCTION_ALERT)
         self.clearGardEntries()
 
+class TodTopologyFailoverPanic(OpTestHMIHandling):
+    ''' TOD topology failover with opal-sw-xstop=disable (panic path). '''
+    def runTest(self):
+        self.set_OpalSwXstop("disable")
+        self._testHMIHandling(BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER)
+
+class TodTopologyFailoverOpalTI(OpTestHMIHandling):
+    ''' TOD topology failover with opal-sw-xstop=enable (OPAL TI path). '''
+    def runTest(self):
+        if not self.is_node_present("/proc/device-tree/ibm,sw-checkstop-fir"):
+            self.skipTest("OPAL TI not supported on this system.")
+        self.set_OpalSwXstop("enable")
+        self._testHMIHandling(BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER)
+
+class OpalTI(OpTestHMIHandling):
+    ''' OPAL TI triggered through the ibm,sw-checkstop-fir scom register. '''
+    def runTest(self):
+        if not self.is_node_present("/proc/device-tree/ibm,sw-checkstop-fir"):
+            self.skipTest("OPAL TI not supported on this system.")
+        self._testHMIHandling(BMC_CONST.OPAL_TI)
+
 class HypervisorResourceError(OpTestHMIHandling):
     def runTest(self):
         self._testHMIHandling(BMC_CONST.HMI_HYPERVISOR_RESOURCE_ERROR)
@@ -663,6 +847,9 @@ def unrecoverable_suite():
     s = unittest.TestSuite()
     s.addTest(MalfunctionAlert())
     s.addTest(HypervisorResourceError())
+    s.addTest(TodTopologyFailoverPanic())
+    s.addTest(OpalTI())
+    s.addTest(TodTopologyFailoverOpalTI())
     s.addTest(ClearGard())
     return s
 