From d15eadea86602b41882928d46241d1e7f3db0e93 Mon Sep 17 00:00:00 2001
From: Roman Eskin <r.eskin@arenadata.io>
Date: Thu, 4 Apr 2024 21:47:32 +1000
Subject: [PATCH] Fix panic with 'stuck spinlock' during crash recovery tests
 (#901)

Problem:
Isolation crash recovery tests (for example, uao_crash_compaction_column,
crash_recovery_dtm, etc.) failed with 'stuck spinlock' panic with very low
reproduction rate.

Cause:
Problem happened in the following case:
1. During the test, a backend process was forced into a panic (according to test
scenario).
2. At the very same moment, the background writer process called
'SIMPLE_FAULT_INJECTOR("fault_in_background_writer_main")', he did it regularly
from the loop in 'BackgroundWriterMain'. The bg writer acquired the spinlock
inside the fault injector, but before it released it...
3. Postmaster started to send the SIGQUIT signal to all child processes.
4. Background writer process received SIGQUIT and halted its execution without
releasing the spinlock.
5. In the background writer process, a handler for the SIGQUIT signal was
invoked - 'bg_quickdie'. And 'bg_quickdie' called
'SIMPLE_FAULT_INJECTOR("fault_in_background_writer_quickdie");'. As the spinlock
was not released, 'bg_quickdie' hanged on the spinlock.
6. Postmaster failed to wait child processes completion. And after timeout it
tried to check 'SIMPLE_FAULT_INJECTOR("postmaster_server_loop_no_sigkill")' and
hanged on the same spinlock as well.
7. Finally, both postmaster and 'bg_quickdie' failed with 'stuck spinlock'
panic.

Fix:
The general rule would be that it is NOT allowed to access the fault injector
from the postmaster process or from any SIGQUIT handler of child processes when
the system is resetting after the crash of some backend. It is so because during
reset postmaster terminates child processes, and it might terminate some process
when the process has acquired the spinlock of the fault injector, but hasn't
released it yet. So, subsequent calls to the fault injector api from postmaster
or any SIGQUIT handler will lead to deadlock. Only 'doomed' processes can still
call the fault injector api, as they will soon be terminated anyway. According
to written above, following changes are made:
1. Access to the fault injector during a reset is removed from 'bg_quickdie' and
postmaster's ServerLoop.
2. The fts_segment_reset test and related code are redesigned. Now, instead of
sleeping for a delay defined by the test in 'bg_quickdie', postmaster sends
SIGSTOP to the bg writer, and starts a timer for the delay. Once the delay
elapses, the timer handler sends SIGCONT and SIGQUIT to the bg writer. This
logic is triggered if the postmaster detects the new fault
"postmaster_delay_termination_bg_writer" (which is a replacement for
"fault_in_background_writer_quickdie" and "postmaster_server_loop_no_sigkill").
This fault is checked only when the postmaster is in the PM_RUN state. So it
should be safe to check for it.

New tests specific to this issue are not added because of the unstable nature of
the problem.

Changes from original commit:
1. Add a NULL check to timeout.c (borrowed from GPDB 6)
2. Changed comments.

(cherry picked from commit dae2f0851da4a18cccdfb1f4c2b5d1dd552b806c)
---
 src/backend/postmaster/bgwriter.c             |   2 -
 src/backend/postmaster/postmaster.c           | 105 ++++++++++++++++--
 src/backend/utils/misc/timeout.c              |   6 +-
 .../isolation2/expected/fts_segment_reset.out |  19 +---
 src/test/isolation2/sql/fts_segment_reset.sql |  12 +-
 5 files changed, 109 insertions(+), 35 deletions(-)

diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 8391660ed3a7..855ec9db26af 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -391,8 +391,6 @@ BackgroundWriterMain(void)
 static void
 bg_quickdie(SIGNAL_ARGS)
 {
-	SIMPLE_FAULT_INJECTOR("fault_in_background_writer_quickdie");
-
 	/*
 	 * We DO NOT want to run proc_exit() or atexit() callbacks -- we're here
 	 * because shared memory may be corrupted, so we don't want to try to
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index f29b5c9d5bc0..4af867f5f9b2 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -164,6 +164,9 @@ bool am_mirror = false;
 /* GPDB specific flag to handle deadlocks during parallel segment start. */
 volatile bool *pm_launch_walreceiver = NULL;
 
+/* This is set by a test to simulate long RESET state. */
+static int	delay_bg_writer_termination_sec = 0;
+
 /*
  * Possible types of a backend. Beyond being the possible bkend_type values in
  * struct bkend, these are OR-able request flag bits for SignalSomeChildren()
@@ -498,6 +501,7 @@ static void sigusr1_handler(SIGNAL_ARGS);
 static void process_startup_packet_die(SIGNAL_ARGS);
 static void process_startup_packet_quickdie(SIGNAL_ARGS);
 static void dummy_handler(SIGNAL_ARGS);
+static void TerminateBgWriterHandler(void);
 static void StartupPacketTimeoutHandler(void);
 static void CleanupBackend(int pid, int exitstatus);
 static bool CleanupBackgroundWorker(int pid, int exitstatus);
@@ -803,6 +807,12 @@ PostmasterMain(int argc, char *argv[])
 	 */
 	InitializeGUCOptions();
 
+	/*
+	 * Note: MyLatch is still not initialized for postmaster, 
+	 * so it will not be set during timeout signal handler
+	 */
+	InitializeTimeouts();
+
 	opterr = 1;
 
 	/*
@@ -1916,6 +1926,7 @@ ServerLoop(void)
 	int			nSockets;
 	time_t		last_lockfile_recheck_time,
 				last_touch_time;
+	bool		postmaster_no_sigkill = false;
 
 	last_lockfile_recheck_time = last_touch_time = time(NULL);
 
@@ -2026,6 +2037,44 @@ ServerLoop(void)
 			}
 		}
 
+#ifdef FAULT_INJECTOR
+		if (pmState == PM_RUN)
+		{
+			/*
+			 * Code below is for test purposes only. If a fault is set, it
+			 * will delay the termination of the background writer process for
+			 * the amount of seconds desired by the test.
+			 *
+			 * Important note: it is NOT allowed to access the fault injector
+			 * from Postmaster process or from any SIGQUIT handler of child
+			 * processes when the system is resetting after the crash of some
+			 * backend. Reason: during reset Postmaster kills child processes,
+			 * and it might kill some process when the process has acquired
+			 * the spin lock of the fault injector, but hasn't released it
+			 * yet. So, subsequent call of the fault injector api from
+			 * Postmaster or any SIGQUIT handler will lead to deadlock. Only
+			 * 'doomed' processes can still call the fault injector api, as
+			 * they will soon be terminated anyway. Thus, the fault injector
+			 * is accessed only when the Postmaster state is PM_RUN.
+			 */
+			if (SIMPLE_FAULT_INJECTOR("postmaster_delay_termination_bg_writer") ==
+				FaultInjectorTypeSkip)
+			{
+				/*
+				 * Let the background writer sleep a little bit larger than
+				 * the FTS retry window (which is gp_fts_probe_retries * 1 sec).
+				 */
+				delay_bg_writer_termination_sec = gp_fts_probe_retries + 2;
+				postmaster_no_sigkill = true;
+			}
+			else
+			{
+				delay_bg_writer_termination_sec = 0;
+				postmaster_no_sigkill = false;
+			}
+		}
+#endif
+
 		/* If we have lost the log collector, try to start a new one */
 		if (SysLoggerPID == 0 && Logging_collector)
 			SysLoggerPID = SysLogger_Start();
@@ -2123,8 +2172,7 @@ ServerLoop(void)
 			AbortStartTime != 0 &&
 			(now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS)
 		{
-#ifdef FAULT_INJECTOR
-			if (SIMPLE_FAULT_INJECTOR("postmaster_server_loop_no_sigkill") == FaultInjectorTypeSkip)
+			if (postmaster_no_sigkill)
 			{
 				/* 
 				 * This prevents sending SIGKILL to child processes for testing purpose.
@@ -2135,7 +2183,7 @@ ServerLoop(void)
 				pg_usleep(100000L); 
 				continue;
 			}
-#endif
+
 			/* We were gentle with them before. Not anymore */
 			TerminateChildren(SIGKILL);
 			/* reset flag so we don't SIGKILL again */
@@ -4029,11 +4077,37 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		BgWriterPID = 0;
 	else if (BgWriterPID != 0 && take_action)
 	{
-		ereport(DEBUG2,
-				(errmsg_internal("sending %s to process %d",
-								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
-								 (int) BgWriterPID)));
-		signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+		if (delay_bg_writer_termination_sec == 0)
+		{
+			ereport(DEBUG2,
+					(errmsg_internal("sending %s to process %d",
+									 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+									 (int) BgWriterPID)));
+			signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+		}
+		else
+		{
+			/*
+			 * Code below is only for test purposes and should not be invoked
+			 * under normal conditions. It will delay the termination of the
+			 * background writer process.
+			 *
+			 * In order to delay the termination of the background writer
+			 * process - send SIGSTOP to the bg writer process and start a
+			 * timer that will wake up and then immediately terminate it.
+			 */
+			static TimeoutId id = 0;
+
+			ereport(DEBUG2,
+					(errmsg_internal("sending SIGSTOP to process %d and starting timer",
+									 (int) BgWriterPID)));
+			signal_child(BgWriterPID, SIGSTOP);
+
+			if (id == 0)
+				id = RegisterTimeout(USER_TIMEOUT, TerminateBgWriterHandler);
+
+			enable_timeout_after(id, delay_bg_writer_termination_sec * 1000);
+		}
 	}
 
 	/* Take care of the checkpointer too */
@@ -5881,6 +5955,21 @@ dummy_handler(SIGNAL_ARGS)
 {
 }
 
+/*
+ * Terminate stopped background writer process (should be invoked only
+ * during test).
+ */
+static void
+TerminateBgWriterHandler()
+{
+	ereport(DEBUG2,
+			(errmsg_internal("sending SIGCONT and SIGQUIT to process %d",
+							 (int) BgWriterPID)));
+
+	signal_child(BgWriterPID, SIGCONT);
+	signal_child(BgWriterPID, SIGQUIT);
+}
+
 /*
  * Timeout while processing startup packet.
  * As for process_startup_packet_die(), we clean up and exit(1).
diff --git a/src/backend/utils/misc/timeout.c b/src/backend/utils/misc/timeout.c
index c76f23acfcb5..10a722496b9e 100644
--- a/src/backend/utils/misc/timeout.c
+++ b/src/backend/utils/misc/timeout.c
@@ -275,9 +275,11 @@ handle_sig_alarm(SIGNAL_ARGS)
 
 	/*
 	 * SIGALRM is always cause for waking anything waiting on the process
-	 * latch.
+	 * latch.  Cope with MyLatch not being there, as the startup process also
+	 * uses this signal handler.
 	 */
-	SetLatch(MyLatch);
+	if (MyLatch)
+		SetLatch(MyLatch);
 
 	/*
 	 * Fire any pending timeouts, but only if we're enabled to do so.
diff --git a/src/test/isolation2/expected/fts_segment_reset.out b/src/test/isolation2/expected/fts_segment_reset.out
index d8a49af69582..c9f19ebac626 100644
--- a/src/test/isolation2/expected/fts_segment_reset.out
+++ b/src/test/isolation2/expected/fts_segment_reset.out
@@ -13,7 +13,7 @@
 -- to restart, and potentially makes FTS think it's in "recovery not
 -- in progress" stage and promote the mirror, we would need the FTS
 -- to make that decision a bit less frequently.
-!\retcode gpconfig -c gp_fts_probe_retries -v 15 --coordinatoronly;
+!\retcode gpconfig -c gp_fts_probe_retries -v 15;
 (exited with code 0)
 !\retcode gpstop -u;
 (exited with code 0)
@@ -22,14 +22,8 @@
 -- This number is selected to be larger than the 15-second retry window
 -- which makes a meaningful test, meanwhile reduce the chance that FTS sees
 -- a "recovery not in progress" primary as much as possible.
-select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid) from gp_segment_configuration where role = 'p' and content = 0;
- gp_inject_fault 
------------------
- Success:        
-(1 row)
-
--- Do not let the postmaster send SIGKILL to the bgwriter
-select gp_inject_fault_infinite('postmaster_server_loop_no_sigkill', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = 0;
+-- It also will not let the postmaster send SIGKILL to the bgwriter.
+select gp_inject_fault_infinite('postmaster_delay_termination_bg_writer', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = 0;
  gp_inject_fault_infinite 
 --------------------------
  Success:                 
@@ -74,12 +68,7 @@ select dbid, role, preferred_role, status from gp_segment_configuration where co
  5    | m    | m              | u      
 (2 rows)
 
-select gp_inject_fault('postmaster_server_loop_no_sigkill', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
- gp_inject_fault 
------------------
- Success:        
-(1 row)
-select gp_inject_fault('fault_in_background_writer_quickdie', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
+select gp_inject_fault('postmaster_delay_termination_bg_writer', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
  gp_inject_fault 
 -----------------
  Success:        
diff --git a/src/test/isolation2/sql/fts_segment_reset.sql b/src/test/isolation2/sql/fts_segment_reset.sql
index d784990dbc9f..632367678640 100644
--- a/src/test/isolation2/sql/fts_segment_reset.sql
+++ b/src/test/isolation2/sql/fts_segment_reset.sql
@@ -12,18 +12,15 @@
 -- to restart, and potentially makes FTS think it's in "recovery not
 -- in progress" stage and promote the mirror, we would need the FTS
 -- to make that decision a bit less frequently.
-!\retcode gpconfig -c gp_fts_probe_retries -v 15 --coordinatoronly;
+!\retcode gpconfig -c gp_fts_probe_retries -v 15;
 !\retcode gpstop -u;
 
 -- Let the background writer sleep 17 seconds to delay the resetting.
 -- This number is selected to be larger than the 15-second retry window
 -- which makes a meaningful test, meanwhile reduce the chance that FTS sees
 -- a "recovery not in progress" primary as much as possible.
-select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid) 
-from gp_segment_configuration where role = 'p' and content = 0;
-
--- Do not let the postmaster send SIGKILL to the bgwriter
-select gp_inject_fault_infinite('postmaster_server_loop_no_sigkill', 'skip', dbid) 
+-- It also will not let the postmaster send SIGKILL to the bgwriter.
+select gp_inject_fault_infinite('postmaster_delay_termination_bg_writer', 'skip', dbid)
 from gp_segment_configuration where role = 'p' and content = 0;
 
 -- Now bring down primary of seg0. There're a lot of ways to do that, in order
@@ -48,8 +45,7 @@ from gp_segment_configuration where role = 'p' AND content = 0;
 select gp_request_fts_probe_scan();
 select dbid, role, preferred_role, status from gp_segment_configuration where content = 0;
 
-select gp_inject_fault('postmaster_server_loop_no_sigkill', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
-select gp_inject_fault('fault_in_background_writer_quickdie', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
+select gp_inject_fault('postmaster_delay_termination_bg_writer', 'reset', dbid) from gp_segment_configuration where role = 'p' and content = 0;
 
 -- The only table that should have been created successfully
 drop table fts_reset_t3;