diff --git a/diskquota.c b/diskquota.c index 2740f276..519da475 100644 --- a/diskquota.c +++ b/diskquota.c @@ -100,20 +100,27 @@ static DiskquotaLauncherShmemStruct *DiskquotaLauncherShmem; */ BackgroundWorkerHandle **bgworker_handles; +typedef enum +{ + SUCCESS, + INVALID_DB, + NO_FREE_WORKER, + UNKNOWN, +} StartWorkerState; /* functions of disk quota*/ void _PG_init(void); void _PG_fini(void); void disk_quota_worker_main(Datum); void disk_quota_launcher_main(Datum); -static void disk_quota_sigterm(SIGNAL_ARGS); -static void disk_quota_sighup(SIGNAL_ARGS); -static void define_guc_variables(void); -static bool start_worker(DiskquotaDBEntry *dbEntry); -static void create_monitor_db_table(void); -static void add_dbid_to_database_list(Oid dbid); -static void del_dbid_from_database_list(Oid dbid); -static void process_extension_ddl_message(void); +static void disk_quota_sigterm(SIGNAL_ARGS); +static void disk_quota_sighup(SIGNAL_ARGS); +static void define_guc_variables(void); +static StartWorkerState start_worker(DiskquotaDBEntry *dbEntry); +static void create_monitor_db_table(void); +static void add_dbid_to_database_list(Oid dbid); +static void del_dbid_from_database_list(Oid dbid); +static void process_extension_ddl_message(void); static void do_process_extension_ddl_message(MessageResult *code, ExtensionDDLMessage local_extension_ddl_message); static void terminate_all_workers(void); static void on_add_db(Oid dbid, MessageResult *code); @@ -594,6 +601,7 @@ disk_quota_launcher_main(Datum main_arg) Oid curDBId = 0; bool advance_one_db = true; bool timeout = false; + int try_times = 0; while (!got_sigterm) { int rc; @@ -601,8 +609,9 @@ disk_quota_launcher_main(Datum main_arg) /* pick a db to run */ if (advance_one_db) { - curDB = next_db(curDB); - timeout = false; + curDB = next_db(curDB); + timeout = false; + try_times = 0; if (curDB != NULL) { curDBId = curDB->dbid; @@ -728,10 +737,18 @@ disk_quota_launcher_main(Datum main_arg) */ if (TimestampDifferenceExceeds(curDB->next_run_time, GetCurrentTimestamp(), MIN_SLEEPTIME)) { - bool ret = start_worker(curDB); - advance_one_db = ret; - /* has exceeded the next_run_time of current db */ - timeout = true; + StartWorkerState ret = start_worker(curDB); + /* when start_worker successfully or db is invalid, pick up next db to run */ + advance_one_db = (ret == SUCCESS || ret == INVALID_DB) ? true : false; + if (!advance_one_db) + { + /* has exceeded the next_run_time of current db */ + timeout = true; + /* when start_worker return is not 2(no free worker), increase the try_times*/ + if (ret != NO_FREE_WORKER) try_times++; + /* only try to start bgworker for a database at most 3 times */ + if (try_times >= 3) advance_one_db = true; + } } else { @@ -1237,9 +1254,17 @@ terminate_all_workers(void) * Dynamically launch an disk quota worker process. * This function is called when launcher process * schedules a database's diskquota worker to run. + * + * return: + * SUCCESS means starting the bgworker sucessfully. + * INVALID_DB means the database is invalid + * NO_FREE_WORKER means there is no avaliable free workers + * UNKNOWN means registering or starting the bgworker + * failed, maybe there is no free bgworker, or + * forking a process failed and so on. */ -static bool +static StartWorkerState start_worker(DiskquotaDBEntry *dbEntry) { BackgroundWorker worker; @@ -1247,12 +1272,14 @@ start_worker(DiskquotaDBEntry *dbEntry) DiskQuotaWorkerEntry *dq_worker; MemoryContext old_ctx; char *dbname = NULL; + int result = SUCCESS; dq_worker = next_worker(); if (dq_worker == NULL) { elog(DEBUG1, "[diskquota] no free workers"); - return false; + result = NO_FREE_WORKER; + return result; } /* free the BackgroundWorkerHandle used by last database */ free_bgworker_handle(dq_worker->id); @@ -1279,7 +1306,11 @@ start_worker(DiskquotaDBEntry *dbEntry) sprintf(worker.bgw_library_name, DISKQUOTA_BINARY_NAME); sprintf(worker.bgw_function_name, "disk_quota_worker_main"); dbname = get_db_name(dbEntry->dbid); - if (dbname == NULL) goto Failed; + if (dbname == NULL) + { + result = INVALID_DB; + goto Failed; + } snprintf(worker.bgw_name, sizeof(worker.bgw_name), "%s", dbname); pfree(dbname); @@ -1293,6 +1324,7 @@ start_worker(DiskquotaDBEntry *dbEntry) if (!ret) { elog(WARNING, "Create bgworker failed"); + result = UNKNOWN; goto Failed; } BgwHandleStatus status; @@ -1302,6 +1334,7 @@ start_worker(DiskquotaDBEntry *dbEntry) { ereport(WARNING, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("could not start background process"), errhint("More details may be available in the server log."))); + result = UNKNOWN; goto Failed; } if (status == BGWH_POSTMASTER_DIED) @@ -1309,16 +1342,17 @@ start_worker(DiskquotaDBEntry *dbEntry) ereport(WARNING, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("cannot start background processes without postmaster"), errhint("Kill all remaining database processes and restart the database."))); + result = UNKNOWN; goto Failed; } Assert(status == BGWH_STARTED); - return true; + return result; Failed: elog(DEBUG1, "[diskquota] diskquota, starts diskquota failed"); FreeWorker(dq_worker); - return false; + return result; } /* @@ -1655,6 +1689,10 @@ next_db(DiskquotaDBEntry *curDB) nextSlot = curDB->id + 1; } + /* + * SearchSysCache should be run in a transaction + */ + StartTransactionCommand(); LWLockAcquire(diskquota_locks.dblist_lock, LW_SHARED); for (int i = 0; i < MAX_NUM_MONITORED_DB; i++) { @@ -1662,10 +1700,13 @@ next_db(DiskquotaDBEntry *curDB) DiskquotaDBEntry *dbEntry = &DiskquotaLauncherShmem->dbArray[nextSlot]; nextSlot++; if (!dbEntry->in_use || dbEntry->workerId != INVALID_WORKER_ID || dbEntry->dbid == InvalidOid) continue; + /* TODO: should release the invalid db related things */ + if (!is_valid_dbid(dbEntry->dbid)) continue; result = dbEntry; break; } LWLockRelease(diskquota_locks.dblist_lock); + CommitTransactionCommand(); return result; } diff --git a/tests/regress/diskquota_schedule b/tests/regress/diskquota_schedule index 3d34e02b..7722765d 100644 --- a/tests/regress/diskquota_schedule +++ b/tests/regress/diskquota_schedule @@ -40,5 +40,6 @@ test: test_ctas_tablespace_schema test: test_default_tablespace test: test_tablespace_diff_schema test: test_worker_schedule +test: test_worker_schedule_exception test: test_drop_extension test: reset_config diff --git a/tests/regress/expected/test_worker_schedule.out b/tests/regress/expected/test_worker_schedule.out index 8003a4e2..7c6fc7b8 100644 --- a/tests/regress/expected/test_worker_schedule.out +++ b/tests/regress/expected/test_worker_schedule.out @@ -2,6 +2,8 @@ \c DROP DATABASE IF EXISTS t1; NOTICE: database "t1" does not exist, skipping +DROP DATABASE IF EXISTS t2; +NOTICE: database "t2" does not exist, skipping DROP DATABASE IF EXISTS t3; NOTICE: database "t3" does not exist, skipping DROP DATABASE IF EXISTS t4; diff --git a/tests/regress/expected/test_worker_schedule_exception.out b/tests/regress/expected/test_worker_schedule_exception.out new file mode 100644 index 00000000..432e27f9 --- /dev/null +++ b/tests/regress/expected/test_worker_schedule_exception.out @@ -0,0 +1,123 @@ +-- start_ignore +\! gpconfig -c diskquota.max_workers -v 10; +20221209:16:01:17:089154 gpconfig:wxiaoranVKGWQ:wxiaoran-[INFO]:-completed successfully with parameters '-c diskquota.max_workers -v 10' +\! gpconfig -c diskquota.naptime -v 4; +20221209:16:01:19:089255 gpconfig:wxiaoranVKGWQ:wxiaoran-[INFO]:-completed successfully with parameters '-c diskquota.naptime -v 4' +\! gpstop -arf; +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Starting gpstop with args: -arf +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Gathering information and validating the environment... +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Obtaining Greenplum Master catalog information +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Obtaining Segment details from master... +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Greenplum Version: 'postgres (Greenplum Database) 6.22.1+dev.36.gedf0e003f8 build dev' +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Commencing Master instance shutdown with mode='fast' +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Master segment instance directory=/Users/wxiaoran/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1 +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Attempting forceful termination of any leftover master process +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Terminating processes for segment /Users/wxiaoran/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1 +20221209:18:21:23:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Stopping master standby host wxiaoranVKGWQ.vmware.com mode=fast +20221209:18:21:24:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Successfully shutdown standby process on wxiaoranVKGWQ.vmware.com +20221209:18:21:24:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Targeting dbid [2, 5, 3, 6, 4, 7] for shutdown +20221209:18:21:24:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Commencing parallel primary segment instance shutdown, please wait... +20221209:18:21:24:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-0.00% of jobs completed +20221209:18:21:25:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-100.00% of jobs completed +20221209:18:21:25:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Commencing parallel mirror segment instance shutdown, please wait... +20221209:18:21:25:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-0.00% of jobs completed +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-100.00% of jobs completed +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:----------------------------------------------------- +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:- Segments stopped successfully = 6 +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:- Segments with errors during stop = 0 +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:----------------------------------------------------- +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Successfully shutdown 6 of 6 segment instances +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Database successfully shutdown with no errors reported +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Cleaning up leftover gpmmon process +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-No leftover gpmmon process found +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Cleaning up leftover gpsmon processes +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-No leftover gpsmon processes on some hosts. not attempting forceful termination on these hosts +20221209:18:21:26:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Cleaning up leftover shared memory +20221209:18:21:27:045673 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Restarting System... +\c +DROP DATABASE IF EXISTS t1; +NOTICE: database "t1" does not exist, skipping +DROP DATABASE IF EXISTS t2; +NOTICE: database "t2" does not exist, skipping +--end_ignore +CREATE DATABASE t1; +CREATE DATABASE t2; +\c t1 +CREATE EXTENSION diskquota; +SELECT diskquota.wait_for_worker_new_epoch(); + wait_for_worker_new_epoch +--------------------------- + t +(1 row) + +\! pgrep -f "[p]ostgres.*bgworker.*t1" | xargs kill; +\! sleep 0.5 ; ps -ef | grep postgres | grep "\[diskquota]" | grep -v grep | wc -l +2 +-- start_ignore +\! ps -ef | grep postgres | grep "\[diskquota]" | grep -v grep + 503 89701 89678 0 4:01PM ?? 0:00.17 postgres: 6000, bgworker: [diskquota] - launcher + 503 89743 89678 0 4:01PM ?? 0:00.03 postgres: 6000, bgworker: [diskquota] contrib_regression cmd1 +--end_ignore +\c contrib_regression +DROP DATABASE t1; +\c t2 +CREATE EXTENSION diskquota; +SELECT diskquota.wait_for_worker_new_epoch(); + wait_for_worker_new_epoch +--------------------------- + t +(1 row) + +\c t2 +SELECT diskquota.pause(); + pause +------- + +(1 row) + +SELECT diskquota.wait_for_worker_new_epoch(); + wait_for_worker_new_epoch +--------------------------- + t +(1 row) + +DROP EXTENSION diskquota; +\c contrib_regression +DROP DATABASE t2; +--start_ignore +\! gpconfig -r diskquota.naptime; +20221209:16:02:10:089976 gpconfig:wxiaoranVKGWQ:wxiaoran-[INFO]:-completed successfully with parameters '-r diskquota.naptime' +\! gpconfig -r diskquota.max_workers; +20221209:16:02:12:090078 gpconfig:wxiaoranVKGWQ:wxiaoran-[INFO]:-completed successfully with parameters '-r diskquota.max_workers' +\! gpstop -arf; +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Starting gpstop with args: -arf +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Gathering information and validating the environment... +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Obtaining Greenplum Master catalog information +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Obtaining Segment details from master... +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Greenplum Version: 'postgres (Greenplum Database) 6.22.1+dev.36.gedf0e003f8 build dev' +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Commencing Master instance shutdown with mode='fast' +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Master segment instance directory=/Users/wxiaoran/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1 +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Attempting forceful termination of any leftover master process +20221209:16:02:12:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Terminating processes for segment /Users/wxiaoran/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1 +20221209:16:02:13:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Stopping master standby host wxiaoranVKGWQ.vmware.com mode=fast +20221209:16:02:14:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Successfully shutdown standby process on wxiaoranVKGWQ.vmware.com +20221209:16:02:14:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Targeting dbid [2, 5, 3, 6, 4, 7] for shutdown +20221209:16:02:14:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Commencing parallel primary segment instance shutdown, please wait... +20221209:16:02:14:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-0.00% of jobs completed +20221209:16:02:14:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-100.00% of jobs completed +20221209:16:02:14:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Commencing parallel mirror segment instance shutdown, please wait... +20221209:16:02:14:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-0.00% of jobs completed +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-100.00% of jobs completed +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:----------------------------------------------------- +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:- Segments stopped successfully = 6 +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:- Segments with errors during stop = 0 +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:----------------------------------------------------- +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Successfully shutdown 6 of 6 segment instances +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Database successfully shutdown with no errors reported +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Cleaning up leftover gpmmon process +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-No leftover gpmmon process found +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Cleaning up leftover gpsmon processes +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-No leftover gpsmon processes on some hosts. not attempting forceful termination on these hosts +20221209:16:02:15:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Cleaning up leftover shared memory +20221209:16:02:17:090179 gpstop:wxiaoranVKGWQ:wxiaoran-[INFO]:-Restarting System... +--end_ignore diff --git a/tests/regress/sql/test_worker_schedule.sql b/tests/regress/sql/test_worker_schedule.sql index f63e02f4..94d27e93 100644 --- a/tests/regress/sql/test_worker_schedule.sql +++ b/tests/regress/sql/test_worker_schedule.sql @@ -2,6 +2,7 @@ \c DROP DATABASE IF EXISTS t1; +DROP DATABASE IF EXISTS t2; DROP DATABASE IF EXISTS t3; DROP DATABASE IF EXISTS t4; DROP DATABASE IF EXISTS t5; diff --git a/tests/regress/sql/test_worker_schedule_exception.sql b/tests/regress/sql/test_worker_schedule_exception.sql new file mode 100644 index 00000000..83fe7faf --- /dev/null +++ b/tests/regress/sql/test_worker_schedule_exception.sql @@ -0,0 +1,38 @@ +-- start_ignore +\! gpconfig -c diskquota.max_workers -v 10; +\! gpconfig -c diskquota.naptime -v 4; +\! gpstop -arf; +\c +DROP DATABASE IF EXISTS t1; +DROP DATABASE IF EXISTS t2; +--end_ignore + +CREATE DATABASE t1; +CREATE DATABASE t2; +\c t1 +CREATE EXTENSION diskquota; +SELECT diskquota.wait_for_worker_new_epoch(); + +\! pgrep -f "[p]ostgres.*bgworker.*t1" | xargs kill; +\! sleep 0.5 ; ps -ef | grep postgres | grep "\[diskquota]" | grep -v grep | wc -l +-- start_ignore +\! ps -ef | grep postgres | grep "\[diskquota]" | grep -v grep +--end_ignore +\c contrib_regression +DROP DATABASE t1; +\c t2 +CREATE EXTENSION diskquota; +SELECT diskquota.wait_for_worker_new_epoch(); + +\c t2 +SELECT diskquota.pause(); +SELECT diskquota.wait_for_worker_new_epoch(); +DROP EXTENSION diskquota; + +\c contrib_regression +DROP DATABASE t2; +--start_ignore +\! gpconfig -r diskquota.naptime; +\! gpconfig -r diskquota.max_workers; +\! gpstop -arf; +--end_ignore