From 66c05b910e571285fdb16a784e85d2eb6841e5ca Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Fri, 21 Oct 2022 13:01:13 +0200 Subject: [PATCH] Implement on_primary hooks. This allows users of pg_auto_failover to set up their own scripts/actions to complement a failover. The hooks are run in a process that is separate from the main FSM, so as not to prevent the system from making progress. As a result, it's not possible for the hooks to change how things are implemented in pg_auto_failover itself. The hook system also allows running a user-defined "service", which is a long-running process or a daemon that belongs to the pg_autoctl process tree. - [x] Implement a new internal service for running user-defined hooks - [ ] Implement pg_autoctl enable|disable run-hooks - [ ] Implement pg_autoctl create listener - [ ] Implement support for running script/commands (man system) - [ ] Add unit testing support for user-defined hooks - [ ] Document the new hook system, including tutorial - [ ] Add documentation examples covering pgbouncer as a hooked system --- src/bin/pg_autoctl/cli_do_service.c | 94 +++++ src/bin/pg_autoctl/keeper_config.c | 15 + src/bin/pg_autoctl/keeper_config.h | 5 + src/bin/pg_autoctl/service_keeper.c | 8 + src/bin/pg_autoctl/service_keeper_init.c | 8 + src/bin/pg_autoctl/service_run_hooks.c | 469 +++++++++++++++++++++++ src/bin/pg_autoctl/service_run_hooks.h | 25 ++ src/bin/pg_autoctl/supervisor.h | 1 + 8 files changed, 625 insertions(+) create mode 100644 src/bin/pg_autoctl/service_run_hooks.c create mode 100644 src/bin/pg_autoctl/service_run_hooks.h diff --git a/src/bin/pg_autoctl/cli_do_service.c b/src/bin/pg_autoctl/cli_do_service.c index 94420b7c5..26d9165f5 100644 --- a/src/bin/pg_autoctl/cli_do_service.c +++ b/src/bin/pg_autoctl/cli_do_service.c @@ -27,6 +27,7 @@ #include "service_keeper.h" #include "service_monitor.h" #include "service_postgres_ctl.h" +#include "service_run_hooks.h" #include "signals.h" #include "supervisor.h" @@ -39,14 +40,17 @@ 
static void cli_do_service_getpid(const char *serviceName); static void cli_do_service_getpid_postgres(int argc, char **argv); static void cli_do_service_getpid_listener(int argc, char **argv); static void cli_do_service_getpid_node_active(int argc, char **argv); +static void cli_do_service_getpid_run_hooks(int argc, char **argv); static void cli_do_service_restart(const char *serviceName); static void cli_do_service_restart_postgres(int argc, char **argv); static void cli_do_service_restart_listener(int argc, char **argv); static void cli_do_service_restart_node_active(int argc, char **argv); +static void cli_do_service_restart_run_hooks(int argc, char **argv); static void cli_do_service_monitor_listener(int argc, char **argv); static void cli_do_service_node_active(int argc, char **argv); +static void cli_do_service_run_hooks(int argc, char **argv); CommandLine service_pgcontroller = make_command("pgcontroller", @@ -80,6 +84,14 @@ CommandLine service_node_active = cli_getopt_pgdata, cli_do_service_node_active); +CommandLine service_run_hooks = + make_command("run-hooks", + "pg_autoctl service that run hooks (scripts)", + CLI_PGDATA_USAGE, + CLI_PGDATA_OPTION, + cli_getopt_pgdata, + cli_do_service_run_hooks); + CommandLine service_getpid_postgres = make_command("postgres", "Get the pid of the pg_autoctl postgres controller service", @@ -104,10 +116,19 @@ CommandLine service_getpid_node_active = cli_getopt_pgdata, cli_do_service_getpid_node_active); +CommandLine service_getpid_run_hooks = + make_command("run-hooks", + "Get the pid of the pg_autoctl run-hooks service", + CLI_PGDATA_USAGE, + CLI_PGDATA_OPTION, + cli_getopt_pgdata, + cli_do_service_getpid_run_hooks); + static CommandLine *service_getpid[] = { &service_getpid_postgres, &service_getpid_listener, &service_getpid_node_active, + &service_getpid_run_hooks, NULL }; @@ -141,10 +162,19 @@ CommandLine service_restart_node_active = cli_getopt_pgdata, cli_do_service_restart_node_active); +CommandLine 
service_restart_run_hooks = + make_command("run-hooks", + "Restart the pg_autoctl run-hooks service", + CLI_PGDATA_USAGE, + CLI_PGDATA_OPTION, + cli_getopt_pgdata, + cli_do_service_restart_run_hooks); + static CommandLine *service_restart[] = { &service_restart_postgres, &service_restart_listener, &service_restart_node_active, + &service_restart_run_hooks, NULL }; @@ -160,6 +190,7 @@ static CommandLine *service[] = { &service_postgres, &service_monitor_listener, &service_node_active, + &service_run_hooks, NULL }; @@ -255,6 +286,16 @@ cli_do_service_getpid_node_active(int argc, char **argv) } +/* + * cli_do_service_getpid_run_hooks gets the run-hooks service pid. + */ +static void +cli_do_service_getpid_run_hooks(int argc, char **argv) +{ + (void) cli_do_service_getpid(SERVICE_NAME_RUN_HOOKS); +} + + /* * cli_do_service_restart sends the TERM signal to the given serviceName, which * is known to have the restart policy RP_PERMANENT (that's hard-coded). As a @@ -352,6 +393,18 @@ cli_do_service_restart_node_active(int argc, char **argv) } +/* + * cli_do_service_restart_run_hooks sends the TERM signal to the run-hooks service, + * which is known to have the restart policy RP_PERMANENT (that's hard-coded). + * As a consequence the supervisor will restart the service. + */ +static void +cli_do_service_restart_run_hooks(int argc, char **argv) +{ + (void) cli_do_service_restart(SERVICE_NAME_RUN_HOOKS); +} + + /* * cli_do_pgcontroller starts the process controller service within a supervision * tree. It is used for debug purposes only. When using this entry point we @@ -588,3 +641,44 @@ cli_do_service_node_active(int argc, char **argv) /* Start the node_active() protocol client */ (void) keeper_node_active_loop(&keeper, ppid); } + + +/* + * cli_do_service_run_hooks starts the run-hooks service. 
+ */ +static void +cli_do_service_run_hooks(int argc, char **argv) +{ + Keeper keeper = { 0 }; + + pid_t ppid = getppid(); + + bool exitOnQuit = true; + + keeper.config = keeperOptions; + + /* Establish a handler for signals. */ + (void) set_signal_handlers(exitOnQuit); + + /* display a user-friendly process name */ + (void) set_ps_title("pg_autoctl: run-hooks"); + + /* Prepare our Keeper and KeeperConfig from the CLI options */ + if (!service_run_hooks_init(&keeper)) + { + log_fatal("Failed to initialize the run-hooks service, " + "see above for details"); + exit(EXIT_CODE_INTERNAL_ERROR); + } + + /* create the service pidfile */ + if (!create_service_pidfile(keeper.config.pathnames.pid, + SERVICE_NAME_RUN_HOOKS)) + { + /* errors have already been logged */ + exit(EXIT_CODE_INTERNAL_ERROR); + } + + /* Start the run-hooks service main loop */ + (void) service_run_hooks_loop(&keeper, ppid); +} diff --git a/src/bin/pg_autoctl/keeper_config.c b/src/bin/pg_autoctl/keeper_config.c index 1459336df..4c96145c0 100644 --- a/src/bin/pg_autoctl/keeper_config.c +++ b/src/bin/pg_autoctl/keeper_config.c @@ -143,6 +143,18 @@ make_strbuf_option("replication", "backup_directory", NULL, \ false, MAXPGPATH, config->backupDirectory) +#define OPTION_HOOKS_ACTIVE(config) \ + make_int_option_default("hooks", "active", NULL, \ + false, &(config->enableHooks), 0) + +#define OPTION_HOOKS_ON_PRIMARY_CMD(config) \ + make_strbuf_option("hooks", "on_primary", NULL, \ + false, BUFSIZE, config->onPrimaryCmd) + +#define OPTION_HOOKS_SERVICE_START_CMD(config) \ + make_strbuf_option("hooks", "service", NULL, \ + false, BUFSIZE, config->serviceStartCmd) + #define OPTION_TIMEOUT_NETWORK_PARTITION(config) \ make_int_option_default("timeout", "network_partition_timeout", \ NULL, false, \ @@ -241,6 +253,9 @@ OPTION_REPLICATION_MAXIMUM_BACKUP_RATE(config), \ OPTION_REPLICATION_BACKUP_DIR(config), \ OPTION_REPLICATION_PASSWORD(config), \ + OPTION_HOOKS_ACTIVE(config), \ + 
OPTION_HOOKS_ON_PRIMARY_CMD(config), \ + OPTION_HOOKS_SERVICE_START_CMD(config), \ OPTION_TIMEOUT_NETWORK_PARTITION(config), \ OPTION_TIMEOUT_PREPARE_PROMOTION_CATCHUP(config), \ OPTION_TIMEOUT_PREPARE_PROMOTION_WALRECEIVER(config), \ diff --git a/src/bin/pg_autoctl/keeper_config.h b/src/bin/pg_autoctl/keeper_config.h index 3826923d9..e5ab6468e 100644 --- a/src/bin/pg_autoctl/keeper_config.h +++ b/src/bin/pg_autoctl/keeper_config.h @@ -56,6 +56,11 @@ typedef struct KeeperConfig char maximum_backup_rate[MAXIMUM_BACKUP_RATE_LEN]; char backupDirectory[MAXPGPATH]; + /* Hooks settings (JSON strings) */ + int enableHooks; /* hooks.active */ + char onPrimaryCmd[BUFSIZE]; + char serviceStartCmd[BUFSIZE]; + /* Citus specific options and settings */ char citusRoleStr[NAMEDATALEN]; CitusRole citusRole; diff --git a/src/bin/pg_autoctl/service_keeper.c b/src/bin/pg_autoctl/service_keeper.c index 5c3d4b557..9af0f67ee 100644 --- a/src/bin/pg_autoctl/service_keeper.c +++ b/src/bin/pg_autoctl/service_keeper.c @@ -27,6 +27,7 @@ #include "pidfile.h" #include "service_keeper.h" #include "service_postgres_ctl.h" +#include "service_run_hooks.h" #include "signals.h" #include "state.h" #include "string_utils.h" @@ -85,6 +86,13 @@ start_keeper(Keeper *keeper) -1, &service_keeper_start, (void *) keeper + }, + { + SERVICE_NAME_RUN_HOOKS, + RP_PERMANENT, + -1, + &service_run_hooks_start, + (void *) keeper } }; diff --git a/src/bin/pg_autoctl/service_keeper_init.c b/src/bin/pg_autoctl/service_keeper_init.c index 7604562dd..95f40c39d 100644 --- a/src/bin/pg_autoctl/service_keeper_init.c +++ b/src/bin/pg_autoctl/service_keeper_init.c @@ -28,6 +28,7 @@ #include "service_keeper.h" #include "service_keeper_init.h" #include "service_postgres_ctl.h" +#include "service_run_hooks.h" #include "signals.h" #include "string_utils.h" #include "supervisor.h" @@ -57,6 +58,13 @@ service_keeper_init(Keeper *keeper) -1, &service_keeper_init_start, (void *) keeper + }, + { + SERVICE_NAME_RUN_HOOKS, + 
RP_PERMANENT, + -1, + &service_run_hooks_start, + (void *) keeper } }; diff --git a/src/bin/pg_autoctl/service_run_hooks.c b/src/bin/pg_autoctl/service_run_hooks.c new file mode 100644 index 000000000..8cefc9d38 --- /dev/null +++ b/src/bin/pg_autoctl/service_run_hooks.c @@ -0,0 +1,469 @@ +/* + * src/bin/pg_autoctl/service_run_hooks.c + * The main loop of the pg_autoctl run-hooks service + * + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the PostgreSQL License. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "parson.h" + +#include "cli_common.h" +#include "cli_root.h" +#include "defaults.h" +#include "fsm.h" +#include "keeper.h" +#include "keeper_config.h" +#include "log.h" +#include "monitor.h" +#include "pidfile.h" +#include "service_run_hooks.h" +#include "signals.h" +#include "state.h" +#include "string_utils.h" +#include "supervisor.h" + +#include "runprogram.h" + + +static void reload_configuration(Keeper *keeper); +static bool service_run_hook(Keeper *keeper, NodeAddress *primary); +static bool service_run_hooks_start_service(Keeper *keeper, pid_t *pid); +static bool service_run_hooks_check_service(Keeper *keeper, pid_t *hookServicePid); + + +/* + * service_run_hooks_start starts a sub-process that listens to the monitor + * notifications and outputs them for the user. 
+ */ +bool +service_run_hooks_start(void *context, pid_t *pid) +{ + Keeper *keeper = (Keeper *) context; + + /* Flush stdio channels just before fork, to avoid double-output problems */ + fflush(stdout); + fflush(stderr); + + /* time to create the run-hooks sub-process */ + pid_t fpid = fork(); + + switch (fpid) + { + case -1: + { + log_error("Failed to fork the run-hooks process"); + return false; + } + + case 0: + { + /* here we call execv() so we never get back */ + (void) service_run_hooks_runprogram(keeper); + + /* unexpected */ + log_fatal("BUG: returned from service_run_hooks_runprogram()"); + exit(EXIT_CODE_INTERNAL_ERROR); + } + + default: + { + /* fork succeeded, in parent */ + log_debug("pg_autoctl run-hooks process started in subprocess %d", + fpid); + *pid = fpid; + return true; + } + } +} + + +/* + * service_run_hooks_runprogram runs the run-hooks service: + * + * $ pg_autoctl do service run-hooks --pgdata ... + * + * This function is intended to be called from the child process after a fork() + * has been successfully done at the parent process level: it's calling + * execve() and will never return. + */ +void +service_run_hooks_runprogram(Keeper *keeper) +{ + char *args[12]; + int argsIndex = 0; + + char command[BUFSIZE]; + + /* + * use --pgdata option rather than the config. + * + * On macOS when using /tmp, the file path is then redirected to being + * /private/tmp when using realpath(2) as we do in normalize_filename(). So + * for that case to be supported, we explicitly re-use whatever PGDATA or + * --pgdata was parsed from the main command line to start our sub-process. 
+ */ + char *pgdata = keeperOptions.pgSetup.pgdata; + + setenv(PG_AUTOCTL_DEBUG, "1", 1); + + args[argsIndex++] = (char *) pg_autoctl_program; + args[argsIndex++] = "do"; + args[argsIndex++] = "service"; + args[argsIndex++] = "run-hooks"; + args[argsIndex++] = "--pgdata"; + args[argsIndex++] = pgdata; + args[argsIndex++] = logLevelToString(log_get_level()); + args[argsIndex] = NULL; + + /* we do not want to call setsid() when running this program. */ + Program program = { 0 }; + (void) initialize_program(&program, args, false); + + program.capture = false; /* redirect output, don't capture */ + program.stdOutFd = STDOUT_FILENO; + program.stdErrFd = STDERR_FILENO; + + /* log the exact command line we're using */ + (void) snprintf_program_command_line(&program, command, BUFSIZE); + + log_info("%s", command); + + (void) execute_program(&program); +} + + +/* + * service_run_hooks_init initializes the pg_autoctl service for the run-hooks + * implementation. + */ +bool +service_run_hooks_init(Keeper *keeper) +{ + KeeperConfig *config = &(keeper->config); + + /* wait until the config file exists on-disk */ + ConnectionRetryPolicy retryPolicy = { 0 }; + + /* retry until we have a configuration file ready (create --run) */ + (void) pgsql_set_main_loop_retry_policy(&retryPolicy); + + while (!pgsql_retry_policy_expired(&retryPolicy)) + { + if (file_exists(config->pathnames.config)) + { + /* success: break out of the retry loop */ + break; + } + + if (asked_to_stop || asked_to_stop_fast) + { + return true; + } + + int sleepTimeMs = + pgsql_compute_connection_retry_sleep_time(&retryPolicy); + + log_debug("Checking if config file \"%s\" exists again in %dms", + config->pathnames.config, + sleepTimeMs); + + (void) pg_usleep(sleepTimeMs * 1000); + } + + bool monitorDisabledIsOk = false; + + if (!keeper_config_read_file_skip_pgsetup(config, monitorDisabledIsOk)) + { + /* errors have already been logged. 
*/ + exit(EXIT_CODE_BAD_CONFIG); + } + + if (!config->monitorDisabled) + { + if (!monitor_init(&keeper->monitor, config->monitor_pguri)) + { + log_fatal("Failed to initialize monitor, see above for details"); + return false; + } + } + + return true; +} + + +/* + * run_hooks_loop runs the main loop of the run-hooks service. + */ +bool +service_run_hooks_loop(Keeper *keeper, pid_t start_pid) +{ + Monitor *monitor = &(keeper->monitor); + KeeperConfig *config = &(keeper->config); + + char *formation = config->formation; + int groupId = config->groupId; + + pid_t hookServicePid = 0; + + /* + * At startup, call the registered command line with the current primary + * node. + */ + if (!IS_EMPTY_STRING_BUFFER(config->onPrimaryCmd)) + { + NodeAddress primary = { 0 }; + + if (!monitor_get_primary(monitor, formation, groupId, &primary)) + { + /* errors have already been logged */ + return false; + } + + if (!service_run_hook(keeper, &primary)) + { + /* errors have already been logged */ + return false; + } + } + + /* + * At startup, now that we have run the hooks.on_primary command (if any), + * now is a good time to run the service (if any). + */ + if (!service_run_hooks_start_service(keeper, &hookServicePid)) + { + /* errors have already been logged */ + return false; + } + + bool firstLoop = true; + + for (;; firstLoop = false) + { + if (asked_to_reload || firstLoop) + { + (void) reload_configuration(keeper); + } + else if (!firstLoop) + { + sleep(PG_AUTOCTL_KEEPER_SLEEP_TIME); + } + + if (asked_to_stop || asked_to_stop_fast) + { + log_info("Run-hooks service received signal %s, terminating", + signal_to_string(get_current_signal(SIGTERM))); + break; + } + + /* + * Consider the service disabled unless we have a command to run when a + * primary node is promoted. + */ + if (!config->enableHooks) + { + continue; + } + + /* + * Take care of our hooks.service command, which we restart when it + * fails. 
+ */ + if (!service_run_hooks_check_service(keeper, &hookServicePid)) + { + /* errors have already been logged */ + return false; + } + + if (!monitor_get_notifications(monitor, + + /* we want the time in milliseconds */ + PG_AUTOCTL_MONITOR_SLEEP_TIME * 1000)) + { + log_warn("Re-establishing connection. We might miss notifications."); + pgsql_finish(&(monitor->pgsql)); + pgsql_finish(&(monitor->notificationClient)); + + continue; + } + } + + return true; +} + + +/* + * reload_configuration reads the supposedly new configuration file and + * integrates accepted new values into the current setup. + */ +static void +reload_configuration(Keeper *keeper) +{ + KeeperConfig *config = &(keeper->config); + bool monitorDisabledIsOk = false; + + if (!keeper_config_read_file_skip_pgsetup(config, monitorDisabledIsOk)) + { + /* errors have already been logged */ + asked_to_reload = 0; + return; + } + + /* we are impacted by a monitor configuration change */ + if (!config->monitorDisabled) + { + if (!monitor_init(&keeper->monitor, config->monitor_pguri)) + { + log_fatal("Failed to initialize monitor, see above for details"); + asked_to_reload = 0; + return; + } + } + + /* only take care of the hooks section */ + if (!IS_EMPTY_STRING_BUFFER(config->onPrimaryCmd)) + { + JSON_Value *json = json_parse_string(config->onPrimaryCmd); + + if (json_type(json) != JSONString && + json_type(json) != JSONArray) + { + log_error("Failed to parse hooks.on_primary command \"%s\", " + "a JSON string or a JSON array is expected", + config->onPrimaryCmd); + } + } + + if (!IS_EMPTY_STRING_BUFFER(config->serviceStartCmd)) + { + JSON_Value *json = json_parse_string(config->serviceStartCmd); + + if (json_type(json) != JSONString && + json_type(json) != JSONArray) + { + log_error("Failed to parse hooks.service command \"%s\", " + "a JSON string or a JSON array is expected", + config->serviceStartCmd); + } + } + + /* we're done reloading now. 
*/ + asked_to_reload = 0; +} + + +/* + * service_run_hook runs the hooks.on_primary command. + */ +static bool +service_run_hook(Keeper *keeper, NodeAddress *primary) +{ + KeeperConfig *config = &(keeper->config); + + if (!config->enableHooks || IS_EMPTY_STRING_BUFFER(config->onPrimaryCmd)) + { + return true; + } + + log_warn("Running command: %s", config->onPrimaryCmd); + + return false; +} + + +/* + * service_run_hooks_start_service starts the service that's been setup with + * the hooks registration, if any. Could be a pgloader daemon, for instance. + */ +static bool +service_run_hooks_start_service(Keeper *keeper, pid_t *pid) +{ + KeeperConfig *config = &(keeper->config); + + if (!config->enableHooks || IS_EMPTY_STRING_BUFFER(config->serviceStartCmd)) + { + *pid = 0; + return true; + } + + log_warn("Starting service: %s", config->serviceStartCmd); + + return false; +} + + +/* + * service_run_hooks_check_service makes sure that the hooks service is still + * running, and restarts it otherwise. + */ +static bool +service_run_hooks_check_service(Keeper *keeper, pid_t *hookServicePid) +{ + if (*hookServicePid == 0) + { + return true; + } + + int status; + pid_t pid = waitpid(*hookServicePid, &status, WNOHANG); + + switch (pid) + { + case -1: + { + /* if our PostgresService stopped, just continue */ + if (errno != ECHILD) + { + log_error("Failed to call waitpid(): %m"); + } + break; + } + + case 0: + { + /* + * We're using WNOHANG, 0 means there are no stopped or exited + * children, it's all good. It's the expected case when + * everything is running smoothly, so enjoy and sleep for + * awhile. + */ + break; + } + + default: + { + /* we expect that pid is hookServicePid */ + if (pid != *hookServicePid) + { + log_error("BUG: service_run_hooks_loop waitpid() got %d, " + "expected hookServicePid %d", + pid, + *hookServicePid); + return false; + } + + char *verb = WIFEXITED(status) ? 
"exited" : "failed"; + log_info("waitpid(): hook service process %d has %s", pid, verb); + + + if (!service_run_hooks_start_service(keeper, hookServicePid)) + { + /* errors have already been logged */ + return false; + } + + break; + } + } + + return true; +} diff --git a/src/bin/pg_autoctl/service_run_hooks.h b/src/bin/pg_autoctl/service_run_hooks.h new file mode 100644 index 000000000..1fab62be2 --- /dev/null +++ b/src/bin/pg_autoctl/service_run_hooks.h @@ -0,0 +1,25 @@ + +/* + * src/bin/pg_autoctl/service_run_hooks.h + * Utilities to start the keeper services. + * + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the PostgreSQL License. + * + */ + +#ifndef SERVICE_RUN_HOOKS_H +#define SERVICE_RUN_HOOKS_H + +#include + +#include "keeper.h" +#include "keeper_config.h" + +bool service_run_hooks_start(void *context, pid_t *pid); +void service_run_hooks_runprogram(Keeper *keeper); +bool service_run_hooks_init(Keeper *keeper); +bool service_run_hooks_loop(Keeper *keeper, pid_t start_pid); + + +#endif /* SERVICE_RUN_HOOKS_H */ diff --git a/src/bin/pg_autoctl/supervisor.h b/src/bin/pg_autoctl/supervisor.h index 903b587ae..f4fd4e2f6 100644 --- a/src/bin/pg_autoctl/supervisor.h +++ b/src/bin/pg_autoctl/supervisor.h @@ -23,6 +23,7 @@ #define SERVICE_NAME_POSTGRES "postgres" #define SERVICE_NAME_KEEPER "node-active" #define SERVICE_NAME_MONITOR "listener" +#define SERVICE_NAME_RUN_HOOKS "hooks" /* * At pg_autoctl create time we use a transient service to initialize our local