From db9095ebf36e3a78d9497692411fd81142c9b7d1 Mon Sep 17 00:00:00 2001 From: Craig Comstock Date: Wed, 27 Sep 2023 07:51:58 -0500 Subject: [PATCH] Added better handling of postgresql server state during hub package install The pg_ctl stop/start commands don't ensure that the server state is as requested at the moment that the command returns. Added up to 5 seconds of wait time to ensure the state is as we want. Also added some tails of expected logs when failures occur. Either /var/log/postgresql.log or the specific pg_upgrade logs which may be created and mentioned in the output of that command failing. Ticket: ENT-10647 Changelog: title (cherry picked from commit a7e6fa707772a9a8e9ca4f4ca307fb4a9a7a2ef2) --- packaging/common/cfengine-hub/postinstall.sh | 28 ++++++++++++++++--- .../common/script-templates/script-common.sh | 18 ++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/packaging/common/cfengine-hub/postinstall.sh b/packaging/common/cfengine-hub/postinstall.sh index 7a81b3244..179528f0a 100644 --- a/packaging/common/cfengine-hub/postinstall.sh +++ b/packaging/common/cfengine-hub/postinstall.sh @@ -415,11 +415,24 @@ init_postgres_dir() if [ -f "$BACKUP_DIR/data/postgresql.conf.modified" ]; then # User-modified file from the previous old version of CFEngine exists, try to use it. cp -a "$BACKUP_DIR/data/postgresql.conf.modified" "$PREFIX/state/pg/data/postgresql.conf" - (cd /tmp && su cfpostgres -c "$PREFIX/bin/pg_ctl -w -D $PREFIX/state/pg/data -l /var/log/postgresql.log start") - if [ $? = 0 ]; then + failure=0 + (cd /tmp && su cfpostgres -c "$PREFIX/bin/pg_ctl -w -D $PREFIX/state/pg/data -l /var/log/postgresql.log start") || failure=1 + if [ $failure = 0 ]; then + wait_for_cf_postgres || failure=1 + fi + if [ $failure = 0 ]; then # Started successfully, stop it again, the migration requires it to be not running. 
- (cd /tmp && su cfpostgres -c "$PREFIX/bin/pg_ctl -w -D $PREFIX/state/pg/data -l /var/log/postgresql.log stop") - + (cd /tmp && su cfpostgres -c "$PREFIX/bin/pg_ctl -w -D $PREFIX/state/pg/data -l /var/log/postgresql.log stop") || failure=1 + if [ $failure = 0 ]; then + wait_for_cf_postgres_down || failure=1 + fi + if [ $failure != 0 ]; then + cf_console echo "Error: unable to shutdown postgresql server. Showing last of /var/log/postgresql.log for clues." + cf_console tail /var/log/postgresql.log + # this is a fatal error and so we exit instead of return + # steps after this init_postgres_dir() function should not continue if we can't start/stop the server + exit 1 + fi # Copy over the new config as well, user should take at look at it. cf_console echo "Installing the $pgconfig_type postgresql.conf file as $PREFIX/state/pg/data/postgresql.conf.new." cf_console echo "Please review it and update $PREFIX/state/pg/data/postgresql.conf accordingly." @@ -431,6 +444,8 @@ init_postgres_dir() cf_console echo "Warning: failed to use the old postgresql.conf file, using the $pgconfig_type one." cf_console echo "Please review the $PREFIX/state/pg/data/postgresql.conf file and update it accordingly." cf_console echo "The original file was saved as $PREFIX/state/pg/data/postgresql.conf.old" + cf_console echo "last 10 lines of /var/log/postgresql.log for determining cause of failure" + cf_console tail /var/log/postgresql.log cp -a "$new_pgconfig_file" "$PREFIX/state/pg/data/postgresql.conf" chown cfpostgres "$PREFIX/state/pg/data/postgresql.conf" fi @@ -670,6 +685,11 @@ do_migration() { exit 0 # exits only from (...) fi cf_console echo "Migration using pg_upgrade failed." + # here pg_upgrade probably said something like + # Consult the last few lines of "/var/cfengine/state/pg/data/pg_upgrade_output.d/20230913T150025.959/log/pg_upgrade_server.log" for the probable cause of the failure. 
+ cf_console echo "Showing last lines of any related log files:" + _daysearch=$(date +%Y%m%d) + find "$PREFIX"/state/pg/data/pg_upgrade_output.d -name '*.log' | grep "$_daysearch" | xargs tail cf_console echo check_disk_space # will abort if low on disk space init_postgres_dir "$new_pgconfig_file" "$pgconfig_type" diff --git a/packaging/common/script-templates/script-common.sh b/packaging/common/script-templates/script-common.sh index 67cd02775..a87cc280e 100644 --- a/packaging/common/script-templates/script-common.sh +++ b/packaging/common/script-templates/script-common.sh @@ -99,6 +99,24 @@ wait_for_cf_postgres() { $PREFIX/bin/psql cfsettings -c "SELECT 1;" >/dev/null 2>&1 } +wait_for_cf_postgres_down() { + # wait for CFEngine Postgresql service to be shutdown, up to 5 sec. + # Returns 0 if postgresql service is not running + # Returns non-0 otherwise (1 if exited by timeout) + for i in $(seq 1 5); do + true "checking if Postgresql is shutdown..." + if ! "$PREFIX"/bin/pg_isready >/dev/null 2>&1; then + true "Postgresql is shutdown, moving on" + return 0 + fi + true "waiting 1 sec for Postgresql to shutdown..." + sleep 1 + done + # Note: it is important that this is the last command of this function. + # The negated return code of `pg_isready` is the return code of whole function. + ! "$PREFIX"/bin/pg_isready >/dev/null 2>&1 +} + safe_cp() { # "safe" alternative to `cp`. Tries `cp -al` first, and if it fails - `cp -a`. # Deletes partially-copied files if copy operation fails.