Skip to content

Commit

Permalink
Limit diskquota hash table's size according to initial request
Browse files Browse the repository at this point in the history
Diskquota does not control the size of its hash tables in shared memory and may
consume shared memory that is not intended for it, which may affect other
database subsystems. Hash tables can also grow indefinitely if the background
process on the coordinator has not started, which can happen for a number of
reasons: it exited with an error, it was paused, or it was never started.

This patch adds a limit on the size of hash tables in shared memory by adding a
wrapper over hash_search that controls the size of the table, stops adding
elements when the number of elements allocated at the beginning is reached, and
reports a warning. A GUC is also added, which limits the frequency of the
warning reports, since in some cases they may be reported too often.
  • Loading branch information
KnightMurloc committed Nov 27, 2023
1 parent d0b41d6 commit f7b0ca1
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 56 deletions.
4 changes: 4 additions & 0 deletions src/diskquota.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ int diskquota_max_workers = 10;
int diskquota_max_table_segments = 0;
int diskquota_max_monitored_databases = 0;
int diskquota_max_quota_probes = 0;
time_t diskquota_hashmap_overflow_report_timeout = 0;

DiskQuotaLocks diskquota_locks;
ExtensionDDLMessage *extension_ddl_message = NULL;
Expand Down Expand Up @@ -414,6 +415,9 @@ define_guc_variables(void)
DefineCustomIntVariable("diskquota.max_quota_probes", "Max number of quotas on the cluster.", NULL,
&diskquota_max_quota_probes, 1024 * 1024, 1024 * INIT_QUOTA_MAP_ENTRIES, INT_MAX,
PGC_POSTMASTER, 0, NULL, NULL, NULL);
DefineCustomIntVariable("diskquota.hashmap_overflow_report_timeout",
"Time interval in seconds between shared hash map overflow report.", NULL,
&diskquota_hashmap_overflow_report_timeout, 60, 0, INT_MAX, PGC_SUSET, 0, NULL, NULL, NULL);
}

/* ---- Functions for disk quota worker process ---- */
Expand Down
3 changes: 3 additions & 0 deletions src/diskquota.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ extern Datum diskquota_fetch_table_stat(PG_FUNCTION_ARGS);
extern int diskquota_naptime;
extern int diskquota_max_active_tables;
extern bool diskquota_hardlimit;
extern time_t diskquota_hashmap_overflow_report_timeout;

extern int SEGCOUNT;
extern int worker_spi_get_extension_version(int *major, int *minor);
Expand Down Expand Up @@ -316,4 +317,6 @@ extern HTAB *diskquota_hash_create(const char *tabname, long nelem, HASHC
extern HTAB *DiskquotaShmemInitHash(const char *name, long init_size, long max_size, HASHCTL *infoP, int hash_flags,
DiskquotaHashFunction hash_function);
extern void refresh_monitored_dbid_cache(void);
extern void *shm_hash_enter(HTAB *hashp, void *keyPtr, bool *foundPtr, uint max_size, const char *warning_message,
time_t *last_overflow_report);
#endif
24 changes: 24 additions & 0 deletions src/diskquota_utility.c
Original file line number Diff line number Diff line change
Expand Up @@ -1683,3 +1683,27 @@ DiskquotaShmemInitHash(const char *name, /* table string name fo
return ShmemInitHash(name, init_size, max_size, infoP, hash_flags | HASH_BLOBS);
#endif /* GP_VERSION_NUM */
}

/*
 * Add or find an entry in a shared-memory hash table with a size limit.
 *
 * Behaves like hash_search(..., HASH_ENTER) while the table holds fewer than
 * max_size entries. Once the limit is reached the call degrades to a plain
 * lookup (HASH_FIND), so the caller may receive NULL for keys that are not
 * already present and must be prepared to handle that.
 *
 * Whenever the table is full and the key is not an existing entry — either
 * this insertion just filled the table, or an insertion was refused — the
 * printf-style warning_message is reported with max_size as its argument,
 * but no more often than diskquota_hashmap_overflow_report_timeout seconds.
 * The time of the previous report is kept in *last_overflow_report.
 */
void *
shm_hash_enter(HTAB *hashp, void *keyPtr, bool *foundPtr, uint max_size, const char *warning_message,
               time_t *last_overflow_report)
{
	bool  found = false;
	bool  full  = hash_get_num_entries(hashp) >= (long)max_size;
	void *result = hash_search(hashp, keyPtr, full ? HASH_FIND : HASH_ENTER, &found);

	if (foundPtr) *foundPtr = found;

	/*
	 * Warn both when this insertion fills the table and when a later insertion
	 * is refused because the table is already full (full && !found): in the
	 * latter case the caller is silently losing data. The previous version
	 * warned only at the moment the table became full, so overflow attempts
	 * after that point went unreported and the rate-limit GUC had no effect.
	 */
	if (!found && hash_get_num_entries(hashp) >= (long)max_size &&
	    time(NULL) - *last_overflow_report >= diskquota_hashmap_overflow_report_timeout)
	{
		ereport(WARNING, (errmsg(warning_message, max_size)));
		*last_overflow_report = time(NULL);
	}
	return result;
}
32 changes: 20 additions & 12 deletions src/gp_activetable.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ typedef struct DiskQuotaSetOFCache
} DiskQuotaSetOFCache;

HTAB *active_tables_map = NULL; // Set<DiskQuotaActiveTableFileEntry>
time_t active_tables_last_overflow_report = 0;

/*
 * Insert *keyPtr into active_tables_map via shm_hash_enter, capped at
 * diskquota_max_active_tables entries; on overflow only a lookup is performed
 * and a rate-limited warning is reported (last report time tracked in
 * active_tables_last_overflow_report). %d is filled with the cap.
 */
#define ACTIVE_TABLE_ENTER(keyPtr, foundPtr) \
shm_hash_enter(active_tables_map, keyPtr, foundPtr, diskquota_max_active_tables, \
"[diskquota] the number of active tables reached the limit, please increase " \
"the GUC value for diskquota.max_active_tables. Current " \
"diskquota.max_active_tables value: %d", \
&active_tables_last_overflow_report)

/*
* monitored_dbid_cache is a allow list for diskquota
Expand All @@ -61,6 +69,14 @@ HTAB *active_tables_map = NULL; // Set<DiskQuotaActiveTableFileEntry>
* dbid will be removed from it when droping diskquota extension
*/
HTAB *altered_reloid_cache = NULL; // Set<Oid>
time_t altered_reloid_cache_last_overflow_report = 0;

/*
 * Insert *keyPtr into altered_reloid_cache via shm_hash_enter. Note the cache
 * shares the diskquota_max_active_tables cap but uses its own report
 * timestamp (altered_reloid_cache_last_overflow_report). %d is filled with
 * the cap.
 */
#define ALTERED_RELOID_CACHE_ENTER(keyPtr, foundPtr) \
shm_hash_enter(altered_reloid_cache, keyPtr, foundPtr, diskquota_max_active_tables, \
"[diskquota] the number of altered reloid cache entries reached the limit, please increase " \
"the GUC value for diskquota.max_active_tables. Current " \
"diskquota.max_active_tables value: %d", \
&altered_reloid_cache_last_overflow_report)

/* active table hooks which detect the disk file size change. */
static file_create_hook_type prev_file_create_hook = NULL;
Expand Down Expand Up @@ -236,7 +252,7 @@ report_altered_reloid(Oid reloid)
if (IsRoleMirror() || IS_QUERY_DISPATCHER()) return;

LWLockAcquire(diskquota_locks.altered_reloid_cache_lock, LW_EXCLUSIVE);
hash_search(altered_reloid_cache, &reloid, HASH_ENTER, NULL);
ALTERED_RELOID_CACHE_ENTER(&reloid, NULL);
LWLockRelease(diskquota_locks.altered_reloid_cache_lock);
}

Expand Down Expand Up @@ -318,17 +334,9 @@ report_active_table_helper(const RelFileNodeBackend *relFileNode)
item.tablespaceoid = relFileNode->node.spcNode;

LWLockAcquire(diskquota_locks.active_table_lock, LW_EXCLUSIVE);
entry = hash_search(active_tables_map, &item, HASH_ENTER_NULL, &found);
entry = ACTIVE_TABLE_ENTER(&item, &found);
if (entry && !found) *entry = item;

if (!found && entry == NULL)
{
/*
* We may miss the file size change of this relation at current
* refresh interval.
*/
ereport(WARNING, (errmsg("Share memory is not enough for active tables.")));
}
LWLockRelease(diskquota_locks.active_table_lock);
}

Expand Down Expand Up @@ -857,7 +865,7 @@ get_active_tables_oid(void)
while ((active_table_file_entry = (DiskQuotaActiveTableFileEntry *)hash_seq_search(&iter)) != NULL)
{
/* TODO: handle possible ERROR here so that the bgworker will not go down. */
hash_search(active_tables_map, active_table_file_entry, HASH_ENTER, NULL);
ACTIVE_TABLE_ENTER(active_table_file_entry, NULL);
}
/* TODO: hash_seq_term(&iter); */
LWLockRelease(diskquota_locks.active_table_lock);
Expand Down Expand Up @@ -919,7 +927,7 @@ get_active_tables_oid(void)
LWLockAcquire(diskquota_locks.active_table_lock, LW_EXCLUSIVE);
while ((active_table_file_entry = (DiskQuotaActiveTableFileEntry *)hash_seq_search(&iter)) != NULL)
{
entry = hash_search(active_tables_map, active_table_file_entry, HASH_ENTER_NULL, &found);
entry = ACTIVE_TABLE_ENTER(active_table_file_entry, &found);
if (entry) *entry = *active_table_file_entry;
}
LWLockRelease(diskquota_locks.active_table_lock);
Expand Down
32 changes: 23 additions & 9 deletions src/quotamodel.c
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,21 @@ static HTAB *table_size_map = NULL;
static HTAB *disk_quota_reject_map = NULL;
static HTAB *local_disk_quota_reject_map = NULL;

static time_t disk_quota_reject_last_overflow_report = 0;
static time_t local_disk_quota_reject_last_overflow_report = 0;

/*
 * Insert *keyPtr into the shared disk_quota_reject_map, capped at
 * MAX_DISK_QUOTA_REJECT_ENTRIES; on overflow only a lookup is performed and a
 * rate-limited warning is reported.
 *
 * NOTE(review): adjacent string literals lack separating spaces, so the
 * emitted message reads "...reached.Some out-of-limit schemas or roles will
 * be lostin rejectmap." — left as-is because regression expected-output files
 * may match the exact text; confirm before fixing.
 */
#define REJECT_MAP_ENTER(keyPtr, foundPtr) \
shm_hash_enter(disk_quota_reject_map, keyPtr, foundPtr, MAX_DISK_QUOTA_REJECT_ENTRIES, \
"[diskquota] Shared disk quota reject map size limit reached." \
"Some out-of-limit schemas or roles will be lost" \
"in rejectmap.", \
&disk_quota_reject_last_overflow_report)

/*
 * Insert *keyPtr into local_disk_quota_reject_map, capped at
 * MAX_DISK_QUOTA_REJECT_ENTRIES; %d is filled with the cap by
 * shm_hash_enter. Uses its own report timestamp so local- and shared-map
 * warnings are rate-limited independently.
 */
#define LOCAL_REJECT_MAP_ENTER(keyPtr, foundPtr) \
shm_hash_enter(local_disk_quota_reject_map, keyPtr, foundPtr, MAX_DISK_QUOTA_REJECT_ENTRIES, \
"[diskquota] the number of local reject map entries reached the limit (%d)", \
&local_disk_quota_reject_last_overflow_report)

static shmem_startup_hook_type prev_shmem_startup_hook = NULL;

/* functions to maintain the quota maps */
Expand Down Expand Up @@ -326,9 +341,12 @@ add_quota_to_rejectmap(QuotaType type, Oid targetOid, Oid tablespaceoid, bool se
keyitem.tablespaceoid = tablespaceoid;
keyitem.targettype = (uint32)type;
ereport(DEBUG1, (errmsg("[diskquota] Put object %u to rejectmap", targetOid)));
localrejectentry = (LocalRejectMapEntry *)hash_search(local_disk_quota_reject_map, &keyitem, HASH_ENTER, NULL);
localrejectentry->isexceeded = true;
localrejectentry->segexceeded = segexceeded;
localrejectentry = (LocalRejectMapEntry *)LOCAL_REJECT_MAP_ENTER(&keyitem, NULL);
if (localrejectentry)
{
localrejectentry->isexceeded = true;
localrejectentry->segexceeded = segexceeded;
}
}

/*
Expand Down Expand Up @@ -1305,13 +1323,9 @@ flush_local_reject_map(void)
*/
if (localrejectentry->isexceeded)
{
rejectentry = (GlobalRejectMapEntry *)hash_search(disk_quota_reject_map, (void *)&localrejectentry->keyitem,
HASH_ENTER_NULL, &found);
rejectentry = (GlobalRejectMapEntry *)REJECT_MAP_ENTER(&localrejectentry->keyitem, &found);
if (rejectentry == NULL)
{
ereport(WARNING, (errmsg("[diskquota] Shared disk quota reject map size limit reached."
"Some out-of-limit schemas or roles will be lost"
"in rejectmap.")));
continue;
}
/* new db objects which exceed quota limit */
Expand Down Expand Up @@ -2145,7 +2159,7 @@ refresh_rejectmap(PG_FUNCTION_ARGS)
*/
if (OidIsValid(rejectmapentry->keyitem.targetoid)) continue;

new_entry = hash_search(disk_quota_reject_map, &rejectmapentry->keyitem, HASH_ENTER_NULL, &found);
new_entry = REJECT_MAP_ENTER(&rejectmapentry->keyitem, &found);
if (!found && new_entry) memcpy(new_entry, rejectmapentry, sizeof(GlobalRejectMapEntry));
}
LWLockRelease(diskquota_locks.reject_map_lock);
Expand Down
31 changes: 28 additions & 3 deletions src/relation_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,22 @@
HTAB *relation_cache = NULL;
HTAB *relid_cache = NULL;

extern time_t active_tables_last_overflow_report;

/*
 * Insert *keyPtr into relation_cache via shm_hash_enter, capped at
 * diskquota_max_active_tables. Shares active_tables_last_overflow_report
 * (declared extern above) with the active-tables map, so all caches bounded
 * by this GUC are rate-limited by one timestamp. %d is filled with the cap.
 */
#define RELATION_CACHE_ENTER(keyPtr, foundPtr) \
shm_hash_enter(relation_cache, keyPtr, foundPtr, diskquota_max_active_tables, \
"[diskquota] the number of relation cache entries reached the limit, please increase " \
"the GUC value for diskquota.max_active_tables. Current " \
"diskquota.max_active_tables value: %d", \
&active_tables_last_overflow_report)

/*
 * Insert *keyPtr into relid_cache via shm_hash_enter, capped at
 * diskquota_max_active_tables; reuses active_tables_last_overflow_report for
 * warning rate-limiting, same as RELATION_CACHE_ENTER. %d is filled with the
 * cap.
 */
#define RELID_CACHE_ENTER(keyPtr, foundPtr) \
shm_hash_enter(relid_cache, keyPtr, foundPtr, diskquota_max_active_tables, \
"[diskquota] the number of relid cache entries reached the limit, please increase " \
"the GUC value for diskquota.max_active_tables. Current " \
"diskquota.max_active_tables value: %d", \
&active_tables_last_overflow_report)

static void update_relation_entry(Oid relid, DiskQuotaRelationCacheEntry *relation_entry,
DiskQuotaRelidCacheEntry *relid_entry);

Expand Down Expand Up @@ -177,10 +193,19 @@ update_relation_cache(Oid relid)
update_relation_entry(relid, &relation_entry_data, &relid_entry_data);

LWLockAcquire(diskquota_locks.relation_cache_lock, LW_EXCLUSIVE);
relation_entry = hash_search(relation_cache, &relation_entry_data.relid, HASH_ENTER, NULL);
relation_entry = RELATION_CACHE_ENTER(&relation_entry_data.relid, NULL);
if (relation_entry == NULL)
{
LWLockRelease(diskquota_locks.relation_cache_lock);
return;
}
memcpy(relation_entry, &relation_entry_data, sizeof(DiskQuotaRelationCacheEntry));

relid_entry = hash_search(relid_cache, &relid_entry_data.relfilenode, HASH_ENTER, NULL);
relid_entry = RELID_CACHE_ENTER(&relid_entry_data.relfilenode, NULL);
if (relid_entry == NULL)
{
LWLockRelease(diskquota_locks.relation_cache_lock);
return;
}
memcpy(relid_entry, &relid_entry_data, sizeof(DiskQuotaRelidCacheEntry));
LWLockRelease(diskquota_locks.relation_cache_lock);

Expand Down
42 changes: 25 additions & 17 deletions tests/regress/expected/test_activetable_limit.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-- table in 'diskquota not enabled database' should not be activetable
\! gpconfig -c diskquota.max_active_tables -v 2 > /dev/null
\! gpconfig -c diskquota.max_active_tables -v 5 > /dev/null
\! gpconfig -c diskquota.naptime -v 2 > /dev/null
\! gpstop -arf > /dev/null
\c
CREATE DATABASE test_tablenum_limit_01;
Expand All @@ -26,31 +27,38 @@ SELECT diskquota.wait_for_worker_new_epoch();
t
(1 row)

CREATE TABLE s.t1(i int) DISTRIBUTED BY (i); -- activetable = 1
INSERT INTO s.t1 SELECT generate_series(1, 100000); -- ok. diskquota soft limit does not check when first write
-- the other two active tables are in a different database
CREATE TABLE s.t1(i int) DISTRIBUTED BY (i); -- activetable = 3
CREATE TABLE s.t2(i int) DISTRIBUTED BY (i); -- activetable = 4
CREATE TABLE s.t3(i int) DISTRIBUTED BY (i); -- activetable = 5. expected warning.
WARNING: [diskquota] the number of active tables reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg0 127.0.1.1:6002 pid=995937)
WARNING: [diskquota] the number of active tables reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg2 127.0.1.1:6004 pid=995938)
WARNING: [diskquota] the number of active tables reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg1 127.0.1.1:6003 pid=995939)
CREATE TABLE s.t4(i int) DISTRIBUTED BY (i);
INSERT INTO s.t4 SELECT generate_series(1, 100000);
SELECT diskquota.wait_for_worker_new_epoch();
wait_for_worker_new_epoch
---------------------------
t
(1 row)

CREATE TABLE s.t2(i int) DISTRIBUTED BY (i); -- activetable = 2
INSERT INTO s.t2 SELECT generate_series(1, 10); -- expect failed
ERROR: schema's disk space quota exceeded with name: s
CREATE TABLE s.t3(i int) DISTRIBUTED BY (i); -- activetable = 3 should not crash.
INSERT INTO s.t3 SELECT generate_series(1, 10); -- expect failed
ERROR: schema's disk space quota exceeded with name: s
-- Q: why diskquota still works when activetable = 3?
-- A: the activetable limit by shmem size, calculate by hash_estimate_size()
-- the result will bigger than sizeof(DiskQuotaActiveTableEntry) * max_active_tables
-- the real capacity of this data structure based on the hash conflict probability.
-- so we can not predict when the data structure will be fill in fully.
--
-- this test case is useless, remove this if anyone dislike it.
-- but the hash capacity is smaller than 6, so the test case works for issue 51
INSERT INTO s.t1 SELECT generate_series(1, 10); -- should be successful
select count(*) from s.t1;
count
-------
10
(1 row)

-- altered reloid cache overflow check. expected warning.
vacuum full;
WARNING: [diskquota] the number of active tables reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5
WARNING: [diskquota] the number of altered reloid cache entries reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg0 127.0.1.1:6002 pid=995937)
WARNING: [diskquota] the number of altered reloid cache entries reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg1 127.0.1.1:6003 pid=995939)
WARNING: [diskquota] the number of altered reloid cache entries reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg2 127.0.1.1:6004 pid=995938)
DROP EXTENSION diskquota;
\c contrib_regression
DROP DATABASE test_tablenum_limit_01;
DROP DATABASE test_tablenum_limit_02;
\! gpconfig -r diskquota.max_active_tables > /dev/null
\! gpconfig -c diskquota.naptime -v 0 > /dev/null
\! gpstop -arf > /dev/null
29 changes: 14 additions & 15 deletions tests/regress/sql/test_activetable_limit.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-- table in 'diskquota not enabled database' should not be activetable
\! gpconfig -c diskquota.max_active_tables -v 2 > /dev/null
\! gpconfig -c diskquota.max_active_tables -v 5 > /dev/null
\! gpconfig -c diskquota.naptime -v 2 > /dev/null
\! gpstop -arf > /dev/null

\c
Expand All @@ -24,24 +25,21 @@ SELECT diskquota.set_schema_quota('s', '1 MB');

SELECT diskquota.wait_for_worker_new_epoch();

CREATE TABLE s.t1(i int) DISTRIBUTED BY (i); -- activetable = 1
INSERT INTO s.t1 SELECT generate_series(1, 100000); -- ok. diskquota soft limit does not check when first write
-- the other two active tables are in a different database
CREATE TABLE s.t1(i int) DISTRIBUTED BY (i); -- activetable = 3
CREATE TABLE s.t2(i int) DISTRIBUTED BY (i); -- activetable = 4
CREATE TABLE s.t3(i int) DISTRIBUTED BY (i); -- activetable = 5. expected warning.
CREATE TABLE s.t4(i int) DISTRIBUTED BY (i);

INSERT INTO s.t4 SELECT generate_series(1, 100000);

SELECT diskquota.wait_for_worker_new_epoch();

CREATE TABLE s.t2(i int) DISTRIBUTED BY (i); -- activetable = 2
INSERT INTO s.t2 SELECT generate_series(1, 10); -- expect failed
CREATE TABLE s.t3(i int) DISTRIBUTED BY (i); -- activetable = 3 should not crash.
INSERT INTO s.t3 SELECT generate_series(1, 10); -- expect failed
INSERT INTO s.t1 SELECT generate_series(1, 10); -- should be successful
select count(*) from s.t1;

-- Q: why diskquota still works when activetable = 3?
-- A: the activetable limit by shmem size, calculate by hash_estimate_size()
-- the result will bigger than sizeof(DiskQuotaActiveTableEntry) * max_active_tables
-- the real capacity of this data structure based on the hash conflict probability.
-- so we can not predict when the data structure will be fill in fully.
--
-- this test case is useless, remove this if anyone dislike it.
-- but the hash capacity is smaller than 6, so the test case works for issue 51
-- altered reloid cache overflow check. expected warning.
vacuum full;

DROP EXTENSION diskquota;

Expand All @@ -50,4 +48,5 @@ DROP DATABASE test_tablenum_limit_01;
DROP DATABASE test_tablenum_limit_02;

\! gpconfig -r diskquota.max_active_tables > /dev/null
\! gpconfig -c diskquota.naptime -v 0 > /dev/null
\! gpstop -arf > /dev/null

0 comments on commit f7b0ca1

Please sign in to comment.