Improve comments and log messages in the logical replication monitor (#9974)

Improved comments will help others when they read the code, and the log
messages will help others understand why the logical replication monitor
works the way it does.

Signed-off-by: Tristan Partin <[email protected]>
tristan957 authored Dec 13, 2024
1 parent eeabecd commit 07d1db5
Showing 1 changed file with 30 additions and 23 deletions.
53 changes: 30 additions & 23 deletions pgxn/neon/logical_replication_monitor.c
@@ -131,8 +131,8 @@ get_snapshots_cutoff_lsn(void)
{
cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn;
elog(LOG,
-      "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d",
-      LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files);
+      "ls_monitor: number of snapshot files, %zu, is larger than limit of %d",
+      snapshot_index, logical_replication_max_snap_files);
}
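For context, the branch above implements the count-based limit: with the snapshot descriptors ordered newest-first by LSN, the cutoff becomes the LSN of the N-th newest .snap file, so dropping every slot whose restart_lsn falls below it caps the retained snapshots at N. A minimal standalone sketch of that selection, with illustrative types and data rather than the extension's real ones:

/* Standalone sketch (illustrative types/data, not the extension's own):
 * pick the cutoff as the LSN of the N-th newest snapshot descriptor. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;
typedef struct { XLogRecPtr lsn; } SnapDesc;

/* Sort newest-first, i.e. by descending LSN. */
static int
newest_first(const void *a, const void *b)
{
	XLogRecPtr la = ((const SnapDesc *) a)->lsn;
	XLogRecPtr lb = ((const SnapDesc *) b)->lsn;

	return (la == lb) ? 0 : (la > lb ? -1 : 1);
}

int
main(void)
{
	SnapDesc	descs[] = {{400}, {100}, {500}, {200}, {300}};
	size_t		nfiles = sizeof(descs) / sizeof(descs[0]);
	size_t		max_snap_files = 3;	/* stand-in for the GUC limit */
	XLogRecPtr	cutoff = 0;			/* 0 means "drop nothing" */

	qsort(descs, nfiles, sizeof(SnapDesc), newest_first);
	if (nfiles > max_snap_files)
		cutoff = descs[max_snap_files - 1].lsn;

	/* Slots with restart_lsn below the cutoff (here 300) are dropped,
	 * which keeps at most max_snap_files .snap files alive. */
	printf("cutoff LSN = %llu\n", (unsigned long long) cutoff);
	return 0;
}

The size-based check in the next hunk appears to produce its cutoff the same way, just keyed on total directory size rather than file count.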

/* Is the size of the logical snapshots directory larger than specified?
@@ -162,8 +162,8 @@ get_snapshots_cutoff_lsn(void)
}

if (cutoff != original)
- elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB",
-      LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size);
+ elog(LOG, "ls_monitor: " SNAPDIR " is larger than %d KB",
+      logical_replication_max_logicalsnapdir_size);
}

pfree(snapshot_descriptors);
@@ -214,9 +214,13 @@ InitLogicalReplicationMonitor(void)
}
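For readers unfamiliar with the file: InitLogicalReplicationMonitor, whose tail is shown above, wires the monitor up as a PostgreSQL background worker. A hedged sketch of what such registration typically looks like with the stock bgworker API; the flag, timing, and naming choices below are assumptions, not the extension's actual values:

#include "postgres.h"
#include "postmaster/bgworker.h"

/* Hedged sketch: registering the monitor from _PG_init() with the
 * standard bgworker API. Flags, restart time, and names here are
 * illustrative assumptions. */
static void
register_monitor_sketch(void)
{
	BackgroundWorker bgw;

	memset(&bgw, 0, sizeof(bgw));
	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
	bgw.bgw_restart_time = 5;	/* seconds to wait before restart on crash */
	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
	snprintf(bgw.bgw_name, BGW_MAXLEN, "logical replication monitor");
	RegisterBackgroundWorker(&bgw);
}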

/*
- * Unused logical replication slots pins WAL and prevents deletion of snapshots.
+ * Unused logical replication slots pin WAL and prevent deletion of snapshots.
  * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
- * need too many .snap files.
+ * need too many .snap files. These files are stored as AUX files, which are a
+ * pageserver mechanism for storing non-relation data. AUX files are shipped in
+ * the basebackup, which is requested by compute_ctl before Postgres starts.
+ * The longer it takes to retrieve the basebackup, the more likely it is the
+ * compute will be killed by the control plane due to a timeout.
*/
void
LogicalSlotsMonitorMain(Datum main_arg)
@@ -239,10 +239,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
ProcessConfigFile(PGC_SIGHUP);
}

- /*
-  * If there are too many .snap files, just drop all logical slots to
-  * prevent aux files bloat.
-  */
+ /* Get the cutoff LSN */
cutoff_lsn = get_snapshots_cutoff_lsn();
if (cutoff_lsn > 0)
{
@@ -252,31 +253,37 @@ LogicalSlotsMonitorMain(Datum main_arg)
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
XLogRecPtr restart_lsn;

- /* find the name */
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
- /* Consider only logical repliction slots */
+
+ /* Consider only active logical replication slots */
if (!s->in_use || !SlotIsLogical(s))
{
LWLockRelease(ReplicationSlotControlLock);
continue;
}

- /* do we need to drop it? */
+ /*
+  * Retrieve the restart LSN to determine if we need to drop the
+  * slot.
+  */
SpinLockAcquire(&s->mutex);
restart_lsn = s->data.restart_lsn;
SpinLockRelease(&s->mutex);

+ strlcpy(slot_name, s->data.name.data, sizeof(slot_name));
+ LWLockRelease(ReplicationSlotControlLock);

if (restart_lsn >= cutoff_lsn)
{
- LWLockRelease(ReplicationSlotControlLock);
+ elog(LOG, "ls_monitor: not dropping replication slot %s because restart LSN %X/%X is greater than cutoff LSN %X/%X",
+      slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
continue;
}

- strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
- elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
+ elog(LOG, "ls_monitor: dropping replication slot %s because restart LSN %X/%X is lower than cutoff LSN %X/%X",
  slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
- LWLockRelease(ReplicationSlotControlLock);

- /* now try to drop it, killing owner before if any */
+ /* now try to drop it, killing the owner first, if any */
for (;;)
{
pid_t active_pid;
@@ -288,9 +295,9 @@
if (active_pid == 0)
{
/*
- * Slot is releasted, try to drop it. Though of course
+ * Slot is released, try to drop it. Though of course,
  * it could have been reacquired, so drop can ERROR
- * out. Similarly it could have been dropped in the
+ * out. Similarly, it could have been dropped in the
* meanwhile.
*
* In principle we could remove pg_try/pg_catch, that
@@ -300,22 +307,22 @@
PG_TRY();
{
ReplicationSlotDrop(slot_name, true);
- elog(LOG, "ls_monitor: slot %s dropped", slot_name);
+ elog(LOG, "ls_monitor: replication slot %s dropped", slot_name);
}
PG_CATCH();
{
/* log ERROR and reset elog stack */
EmitErrorReport();
FlushErrorState();
- elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
+ elog(LOG, "ls_monitor: failed to drop replication slot %s", slot_name);
}
PG_END_TRY();
break;
}
else
{
/* kill the owner and wait for release */
- elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
+ elog(LOG, "ls_monitor: killing replication slot %s owner %d", slot_name, active_pid);
(void) kill(active_pid, SIGTERM);
/* We shouldn't get stuck, but to be safe add timeout. */
ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
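One substantive change in the loop above is easy to miss among the message rewording: the slot name and restart_lsn are now copied out while ReplicationSlotControlLock is held, and the lock is released before either log line runs, so no elog() happens under the lock. Sketched as a hypothetical standalone helper (not part of the commit):

/* Hypothetical helper, not in the commit; assumes replication/slot.h,
 * storage/lwlock.h, and storage/spin.h are included. Copy everything the
 * caller needs while ReplicationSlotControlLock is held, then release it,
 * so logging and dropping happen without the lock. */
static bool
slot_is_past_cutoff(ReplicationSlot *s, XLogRecPtr cutoff_lsn,
					XLogRecPtr *restart_lsn, char *slot_name /* NAMEDATALEN */)
{
	LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);

	if (!s->in_use || !SlotIsLogical(s))
	{
		LWLockRelease(ReplicationSlotControlLock);
		return false;
	}

	SpinLockAcquire(&s->mutex);
	*restart_lsn = s->data.restart_lsn;
	SpinLockRelease(&s->mutex);

	strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
	LWLockRelease(ReplicationSlotControlLock);

	/* No lock held here: the caller can elog() and drop safely. */
	return *restart_lsn < cutoff_lsn;
}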

1 comment on commit 07d1db5

@github-actions

7095 tests run: 6798 passed, 0 failed, 297 skipped (full report)


Flaky tests (5)

Postgres 17

Postgres 16

  • test_physical_replication_config_mismatch_too_many_known_xids: release-arm64

Postgres 15

Code coverage* (full report)

  • functions: 31.4% (8402 of 26778 functions)
  • lines: 48.1% (66637 of 138650 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
07d1db5 at 2024-12-13T19:10:30.921Z :recycle:
