Fix panic with 'stuck spinlock' during crash recovery tests (#901)

Problem: Isolation crash recovery tests (for example, uao_crash_compaction_column, crash_recovery_dtm, etc.) failed with 'stuck spinlock' panic with very low reproduction rate. Cause: Problem happened in the following case: 1. During the test, a backend process was forced into a panic (according to test scenario). 2. At the very same moment, the background writer process called 'SIMPLE_FAULT_INJECTOR("fault_in_background_writer_main")', he did it regularly from the loop in 'BackgroundWriterMain'. The bg writer acquired the spinlock inside the fault injector, but before it released it... 3. Postmaster started to send the SIGQUIT signal to all child processes. 4. Background writer process received SIGQUIT and halted its execution without releasing the spinlock. 5. In the background writer process, a handler for the SIGQUIT signal was invoked - 'bg_quickdie'. And 'bg_quickdie' called 'SIMPLE_FAULT_INJECTOR("fault_in_background_writer_quickdie");'. As the spinlock was not released, 'bg_quickdie' hanged on the spinlock. 6. Postmaster failed to wait child processes completion. And after timeout it tried to check 'SIMPLE_FAULT_INJECTOR("postmaster_server_loop_no_sigkill")' and hanged on the same spinlock as well. 7. Finally, both postmaster and 'bg_quickdie' failed with 'stuck spinlock' panic. Fix: The general rule would be that it is NOT allowed to access the fault injector from the postmaster process or from any SIGQUIT handler of child processes when the system is resetting after the crash of some backend. It is so because during reset postmaster terminates child processes, and it might terminate some process when the process has acquired the spinlock of the fault injector, but hasn't released it yet. So, subsequent calls to the fault injector api from postmaster or any SIGQUIT handler will lead to deadlock. Only 'doomed' processes can still call the fault injector api, as they will soon be terminated anyway. According to written above, following changes are made: 1. Access to the fault injector during a reset is removed from 'bg_quickdie' and postmaster's ServerLoop. 2. The fts_segment_reset test and related code are redesigned. Now, instead of sleeping for a delay defined by the test in 'bg_quickdie', postmaster sends SIGSTOP to the bg writer, and starts a timer for the delay. Once the delay elapses, the timer handler sends SIGCONT and SIGQUIT to the bg writer. This logic is triggered if the postmaster detects the new fault "postmaster_delay_termination_bg_writer" (which is a replacement for "fault_in_background_writer_quickdie" and "postmaster_server_loop_no_sigkill"). This fault is checked only when the postmaster is in the PM_RUN state. So it should be safe to check for it. New tests specific to this issue are not added because of the unstable nature of the problem. Changes from original commit: 1. Add a NULL check to timeout.c (borrowed from GPDB 6) 2. Changed comments. (cherry picked from commit dae2f08)
arenadata · Oct 24, 2024 · d15eade · d15eade
1 parent 41ad71f
commit d15eade
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 35 deletions.
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
@@ -391,8 +391,6 @@ BackgroundWriterMain(void)
 static void
 bg_quickdie(SIGNAL_ARGS)
 {
-	SIMPLE_FAULT_INJECTOR("fault_in_background_writer_quickdie");
-
 	/*
 	 * We DO NOT want to run proc_exit() or atexit() callbacks -- we're here
 	 * because shared memory may be corrupted, so we don't want to try to

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
@@ -164,6 +164,9 @@ bool am_mirror = false;
 /* GPDB specific flag to handle deadlocks during parallel segment start. */
 volatile bool *pm_launch_walreceiver = NULL;
 
+/* This is set by a test to simulate long RESET state. */
+static int	delay_bg_writer_termination_sec = 0;
+
 /*
  * Possible types of a backend. Beyond being the possible bkend_type values in
  * struct bkend, these are OR-able request flag bits for SignalSomeChildren()
@@ -498,6 +501,7 @@ static void sigusr1_handler(SIGNAL_ARGS);
 static void process_startup_packet_die(SIGNAL_ARGS);
 static void process_startup_packet_quickdie(SIGNAL_ARGS);
 static void dummy_handler(SIGNAL_ARGS);
+static void TerminateBgWriterHandler(void);
 static void StartupPacketTimeoutHandler(void);
 static void CleanupBackend(int pid, int exitstatus);
 static bool CleanupBackgroundWorker(int pid, int exitstatus);
@@ -803,6 +807,12 @@ PostmasterMain(int argc, char *argv[])
 	 */
 	InitializeGUCOptions();
 
+	/*
+	 * Note: MyLatch is still not initialized for postmaster, 
+	 * so it will not be set during timeout signal handler
+	 */
+	InitializeTimeouts();
+
 	opterr = 1;
 
 	/*
@@ -1916,6 +1926,7 @@ ServerLoop(void)
 	int			nSockets;
 	time_t		last_lockfile_recheck_time,
 				last_touch_time;
+	bool		postmaster_no_sigkill = false;
 
 	last_lockfile_recheck_time = last_touch_time = time(NULL);
 
@@ -2026,6 +2037,44 @@ ServerLoop(void)
 			}
 		}
 
+#ifdef FAULT_INJECTOR
+		if (pmState == PM_RUN)
+		{
+			/*
+			 * Code below is for test purposes only. If a fault is set, it
+			 * will delay the termination of the background writer process for
+			 * the amount of seconds desired by the test.
+			 *
+			 * Important note: it is NOT allowed to access the fault injector
+			 * from Postmaster process or from any SIGQUIT handler of child
+			 * processes when the system is resetting after the crash of some
+			 * backend. Reason: during reset Postmaster kills child processes,
+			 * and it might kill some process when the process has acquired
+			 * the spin lock of the fault injector, but hasn't released it
+			 * yet. So, subsequent call of the fault injector api from
+			 * Postmaster or any SIGQUIT handler will lead to deadlock. Only
+			 * 'doomed' processes can still call the fault injector api, as
+			 * they will soon be terminated anyway. Thus, the fault injector
+			 * is accessed only when the Postmaster state is PM_RUN.
+			 */
+			if (SIMPLE_FAULT_INJECTOR("postmaster_delay_termination_bg_writer") ==
+				FaultInjectorTypeSkip)
+			{
+				/*
+				 * Let the background writer sleep a little bit larger than
+				 * the FTS retry window (which is gp_fts_probe_retries * 1 sec).
+				 */
+				delay_bg_writer_termination_sec = gp_fts_probe_retries + 2;
+				postmaster_no_sigkill = true;
+			}
+			else
+			{
+				delay_bg_writer_termination_sec = 0;
+				postmaster_no_sigkill = false;
+			}
+		}
+#endif
+
 		/* If we have lost the log collector, try to start a new one */
 		if (SysLoggerPID == 0 && Logging_collector)
 			SysLoggerPID = SysLogger_Start();
@@ -2123,8 +2172,7 @@ ServerLoop(void)
 			AbortStartTime != 0 &&
 			(now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS)
 		{
-#ifdef FAULT_INJECTOR
-			if (SIMPLE_FAULT_INJECTOR("postmaster_server_loop_no_sigkill") == FaultInjectorTypeSkip)
+			if (postmaster_no_sigkill)
 			{
 				/* 
 				 * This prevents sending SIGKILL to child processes for testing purpose.
@@ -2135,7 +2183,7 @@ ServerLoop(void)
 				pg_usleep(100000L); 
 				continue;
 			}
-#endif
+
 			/* We were gentle with them before. Not anymore */
 			TerminateChildren(SIGKILL);
 			/* reset flag so we don't SIGKILL again */
@@ -4029,11 +4077,37 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		BgWriterPID = 0;
 	else if (BgWriterPID != 0 && take_action)
 	{
-		ereport(DEBUG2,
-				(errmsg_internal("sending %s to process %d",
-								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
-								 (int) BgWriterPID)));
-		signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+		if (delay_bg_writer_termination_sec == 0)
+		{
+			ereport(DEBUG2,
+					(errmsg_internal("sending %s to process %d",
+									 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+									 (int) BgWriterPID)));
+			signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+		}
+		else
+		{
+			/*
+			 * Code below is only for test purposes and should not be invoked
+			 * under normal conditions. It will delay the termination of the
+			 * background writer process.
+			 *
+			 * In order to delay the termination of the background writer
+			 * process - send SIGSTOP to the bg writer process and start a
+			 * timer that will wake up and then immediately terminate it.
+			 */
+			static TimeoutId id = 0;
+
+			ereport(DEBUG2,
+					(errmsg_internal("sending SIGSTOP to process %d and starting timer",
+									 (int) BgWriterPID)));
+			signal_child(BgWriterPID, SIGSTOP);
+
+			if (id == 0)
+				id = RegisterTimeout(USER_TIMEOUT, TerminateBgWriterHandler);
+
+			enable_timeout_after(id, delay_bg_writer_termination_sec * 1000);
+		}
 	}
 
 	/* Take care of the checkpointer too */
@@ -5881,6 +5955,21 @@ dummy_handler(SIGNAL_ARGS)
 {
 }
 
+/*
+ * Terminate stopped background writer process (should be invoked only
+ * during test).
+ */
+static void
+TerminateBgWriterHandler()
+{
+	ereport(DEBUG2,
+			(errmsg_internal("sending SIGCONT and SIGQUIT to process %d",
+							 (int) BgWriterPID)));
+
+	signal_child(BgWriterPID, SIGCONT);
+	signal_child(BgWriterPID, SIGQUIT);
+}
+
 /*
  * Timeout while processing startup packet.
  * As for process_startup_packet_die(), we clean up and exit(1).

diff --git a/src/backend/utils/misc/timeout.c b/src/backend/utils/misc/timeout.c
@@ -275,9 +275,11 @@ handle_sig_alarm(SIGNAL_ARGS)
 
 	/*
 	 * SIGALRM is always cause for waking anything waiting on the process
-	 * latch.
+	 * latch.  Cope with MyLatch not being there, as the startup process also
+	 * uses this signal handler.
 	 */
-	SetLatch(MyLatch);
+	if (MyLatch)
+		SetLatch(MyLatch);
 
 	/*
 	 * Fire any pending timeouts, but only if we're enabled to do so.

diff --git a/src/test/isolation2/expected/fts_segment_reset.out b/src/test/isolation2/expected/fts_segment_reset.out
@@ -13,7 +13,7 @@
 -- to restart, and potentially makes FTS think it's in "recovery not
 -- in progress" stage and promote the mirror, we would need the FTS
 -- to make that decision a bit less frequently.
-!\retcode gpconfig -c gp_fts_probe_retries -v 15 --coordinatoronly;
+!\retcode gpconfig -c gp_fts_probe_retries -v 15;
 (exited with code 0)
 !\retcode gpstop -u;
 (exited with code 0)
@@ -22,14 +22,8 @@
 -- This number is selected to be larger than the 15-second retry window
 -- which makes a meaningful test, meanwhile reduce the chance that FTS sees
 -- a "recovery not in progress" primary as much as possible.
-select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid) from gp_segment_configuration where role = 'p' and content = 0;
- gp_inject_fault 
------------------
- Success:        
-(1 row)
-
--- Do not let the postmaster send SIGKILL to the bgwriter
-select gp_inject_fault_infinite('postmaster_server_loop_no_sigkill', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = 0;
+-- It also will not let the postmaster send SIGKILL to the bgwriter.
+select gp_inject_fault_infinite('postmaster_delay_termination_bg_writer', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = 0;
  gp_inject_fault_infinite 
 --------------------------
  Success:                 
@@ -74,12 +68,7 @@ select dbid, role, preferred_role, status from gp_segment_configuration where co
  5    | m    | m              | u      
 (2 rows)
 
-select gp_inject_fault('postmaster_server_loop_no_sigkill', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
- gp_inject_fault 
------------------
- Success:        
-(1 row)
-select gp_inject_fault('fault_in_background_writer_quickdie', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
+select gp_inject_fault('postmaster_delay_termination_bg_writer', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
  gp_inject_fault 
 -----------------
  Success:        

diff --git a/src/test/isolation2/sql/fts_segment_reset.sql b/src/test/isolation2/sql/fts_segment_reset.sql
@@ -12,18 +12,15 @@
 -- to restart, and potentially makes FTS think it's in "recovery not
 -- in progress" stage and promote the mirror, we would need the FTS
 -- to make that decision a bit less frequently.
-!\retcode gpconfig -c gp_fts_probe_retries -v 15 --coordinatoronly;
+!\retcode gpconfig -c gp_fts_probe_retries -v 15;
 !\retcode gpstop -u;
 
 -- Let the background writer sleep 17 seconds to delay the resetting.
 -- This number is selected to be larger than the 15-second retry window
 -- which makes a meaningful test, meanwhile reduce the chance that FTS sees
 -- a "recovery not in progress" primary as much as possible.
-select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid) 
-from gp_segment_configuration where role = 'p' and content = 0;
-
--- Do not let the postmaster send SIGKILL to the bgwriter
-select gp_inject_fault_infinite('postmaster_server_loop_no_sigkill', 'skip', dbid) 
+-- It also will not let the postmaster send SIGKILL to the bgwriter.
+select gp_inject_fault_infinite('postmaster_delay_termination_bg_writer', 'skip', dbid)
 from gp_segment_configuration where role = 'p' and content = 0;
 
 -- Now bring down primary of seg0. There're a lot of ways to do that, in order
@@ -48,8 +45,7 @@ from gp_segment_configuration where role = 'p' AND content = 0;
 select gp_request_fts_probe_scan();
 select dbid, role, preferred_role, status from gp_segment_configuration where content = 0;
 
-select gp_inject_fault('postmaster_server_loop_no_sigkill', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
-select gp_inject_fault('fault_in_background_writer_quickdie', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
+select gp_inject_fault('postmaster_delay_termination_bg_writer', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
 
 -- The only table that should have been created successfully
 drop table fts_reset_t3;