Skip to content

Commit

Permalink
Limit diskquota hash table's size according to initial request
Browse files Browse the repository at this point in the history
Diskquota does not control the size of its hash tables in shared memory and may
consume shared memory that is not intended for it, which may affect other
database subsystems. Hash tables can also grow indefinitely if the background
process on the coordinator has not started, which can happen for a number of
reasons: it exited with an error, it was paused, or it was never started.

This patch adds a limit on the size of hash tables in shared memory by adding a
wrapper over hash_search that controls the size of the table, stops adding
elements when the number of elements allocated at the beginning is reached, and
reports a warning. A GUC is also added, which limits the frequency of the
warning reports, since in some cases they may be reported too often.
  • Loading branch information
KnightMurloc committed Nov 27, 2023
1 parent d0b41d6 commit f7b0ca1
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 56 deletions.
4 changes: 4 additions & 0 deletions src/diskquota.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ int diskquota_max_workers = 10;
int diskquota_max_table_segments = 0;
int diskquota_max_monitored_databases = 0;
int diskquota_max_quota_probes = 0;
time_t diskquota_hashmap_overflow_report_timeout = 0;

DiskQuotaLocks diskquota_locks;
ExtensionDDLMessage *extension_ddl_message = NULL;
Expand Down Expand Up @@ -414,6 +415,9 @@ define_guc_variables(void)
DefineCustomIntVariable("diskquota.max_quota_probes", "Max number of quotas on the cluster.", NULL,
&diskquota_max_quota_probes, 1024 * 1024, 1024 * INIT_QUOTA_MAP_ENTRIES, INT_MAX,
PGC_POSTMASTER, 0, NULL, NULL, NULL);
DefineCustomIntVariable("diskquota.hashmap_overflow_report_timeout",
"Time interval in seconds between shared hash map overflow report.", NULL,
&diskquota_hashmap_overflow_report_timeout, 60, 0, INT_MAX, PGC_SUSET, 0, NULL, NULL, NULL);
}

/* ---- Functions for disk quota worker process ---- */
Expand Down
3 changes: 3 additions & 0 deletions src/diskquota.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ extern Datum diskquota_fetch_table_stat(PG_FUNCTION_ARGS);
extern int diskquota_naptime;
extern int diskquota_max_active_tables;
extern bool diskquota_hardlimit;
extern time_t diskquota_hashmap_overflow_report_timeout;

extern int SEGCOUNT;
extern int worker_spi_get_extension_version(int *major, int *minor);
Expand Down Expand Up @@ -316,4 +317,6 @@ extern HTAB *diskquota_hash_create(const char *tabname, long nelem, HASHC
extern HTAB *DiskquotaShmemInitHash(const char *name, long init_size, long max_size, HASHCTL *infoP, int hash_flags,
DiskquotaHashFunction hash_function);
extern void refresh_monitored_dbid_cache(void);
extern void *shm_hash_enter(HTAB *hashp, void *keyPtr, bool *foundPtr, uint max_size, const char *warning_message,
time_t *last_overflow_report);
#endif
24 changes: 24 additions & 0 deletions src/diskquota_utility.c
Original file line number Diff line number Diff line change
Expand Up @@ -1683,3 +1683,27 @@ DiskquotaShmemInitHash(const char *name, /* table string name fo
return ShmemInitHash(name, init_size, max_size, infoP, hash_flags | HASH_BLOBS);
#endif /* GP_VERSION_NUM */
}

/*
 * Add or find an entry in a shared-memory hash table with a size limit.
 *
 * Behaves like hash_search(..., HASH_ENTER) while the table holds fewer than
 * max_size entries. Once the limit is reached the call degrades to a plain
 * lookup (HASH_FIND), so the caller may receive NULL for keys that are not
 * already present and must be prepared to handle that.
 *
 * Whenever the table is full and the key is not an existing entry — either
 * this insertion just filled the table, or an insertion was refused — the
 * printf-style warning_message is reported with max_size as its argument,
 * but no more often than diskquota_hashmap_overflow_report_timeout seconds.
 * The time of the previous report is kept in *last_overflow_report.
 */
void *
shm_hash_enter(HTAB *hashp, void *keyPtr, bool *foundPtr, uint max_size, const char *warning_message,
               time_t *last_overflow_report)
{
	bool  found = false;
	bool  full  = hash_get_num_entries(hashp) >= (long)max_size;
	void *result = hash_search(hashp, keyPtr, full ? HASH_FIND : HASH_ENTER, &found);

	if (foundPtr) *foundPtr = found;

	/*
	 * Warn both when this insertion fills the table and when a later insertion
	 * is refused because the table is already full (full && !found): in the
	 * latter case the caller is silently losing data. The previous version
	 * warned only at the moment the table became full, so overflow attempts
	 * after that point went unreported and the rate-limit GUC had no effect.
	 */
	if (!found && hash_get_num_entries(hashp) >= (long)max_size &&
	    time(NULL) - *last_overflow_report >= diskquota_hashmap_overflow_report_timeout)
	{
		ereport(WARNING, (errmsg(warning_message, max_size)));
		*last_overflow_report = time(NULL);
	}
	return result;
}
32 changes: 20 additions & 12 deletions src/gp_activetable.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ typedef struct DiskQuotaSetOFCache
} DiskQuotaSetOFCache;

HTAB *active_tables_map = NULL; // Set<DiskQuotaActiveTableFileEntry>
time_t active_tables_last_overflow_report = 0;

/*
 * Insert *keyPtr into active_tables_map via shm_hash_enter, capped at
 * diskquota_max_active_tables entries; on overflow only a lookup is performed
 * and a rate-limited warning is reported (last report time tracked in
 * active_tables_last_overflow_report). %d is filled with the cap.
 */
#define ACTIVE_TABLE_ENTER(keyPtr, foundPtr) \
shm_hash_enter(active_tables_map, keyPtr, foundPtr, diskquota_max_active_tables, \
"[diskquota] the number of active tables reached the limit, please increase " \
"the GUC value for diskquota.max_active_tables. Current " \
"diskquota.max_active_tables value: %d", \
&active_tables_last_overflow_report)

/*
* monitored_dbid_cache is a allow list for diskquota
Expand All @@ -61,6 +69,14 @@ HTAB *active_tables_map = NULL; // Set<DiskQuotaActiveTableFileEntry>
* dbid will be removed from it when droping diskquota extension
*/
HTAB *altered_reloid_cache = NULL; // Set<Oid>
time_t altered_reloid_cache_last_overflow_report = 0;

/*
 * Insert *keyPtr into altered_reloid_cache via shm_hash_enter. Note the cache
 * shares the diskquota_max_active_tables cap but uses its own report
 * timestamp (altered_reloid_cache_last_overflow_report). %d is filled with
 * the cap.
 */
#define ALTERED_RELOID_CACHE_ENTER(keyPtr, foundPtr) \
shm_hash_enter(altered_reloid_cache, keyPtr, foundPtr, diskquota_max_active_tables, \
"[diskquota] the number of altered reloid cache entries reached the limit, please increase " \
"the GUC value for diskquota.max_active_tables. Current " \
"diskquota.max_active_tables value: %d", \
&altered_reloid_cache_last_overflow_report)

/* active table hooks which detect the disk file size change. */
static file_create_hook_type prev_file_create_hook = NULL;
Expand Down Expand Up @@ -236,7 +252,7 @@ report_altered_reloid(Oid reloid)
if (IsRoleMirror() || IS_QUERY_DISPATCHER()) return;

LWLockAcquire(diskquota_locks.altered_reloid_cache_lock, LW_EXCLUSIVE);
hash_search(altered_reloid_cache, &reloid, HASH_ENTER, NULL);
ALTERED_RELOID_CACHE_ENTER(&reloid, NULL);
LWLockRelease(diskquota_locks.altered_reloid_cache_lock);
}

Expand Down Expand Up @@ -318,17 +334,9 @@ report_active_table_helper(const RelFileNodeBackend *relFileNode)
item.tablespaceoid = relFileNode->node.spcNode;

LWLockAcquire(diskquota_locks.active_table_lock, LW_EXCLUSIVE);
entry = hash_search(active_tables_map, &item, HASH_ENTER_NULL, &found);
entry = ACTIVE_TABLE_ENTER(&item, &found);
if (entry && !found) *entry = item;

if (!found && entry == NULL)
{
/*
* We may miss the file size change of this relation at current
* refresh interval.
*/
ereport(WARNING, (errmsg("Share memory is not enough for active tables.")));
}
LWLockRelease(diskquota_locks.active_table_lock);
}

Expand Down Expand Up @@ -857,7 +865,7 @@ get_active_tables_oid(void)
while ((active_table_file_entry = (DiskQuotaActiveTableFileEntry *)hash_seq_search(&iter)) != NULL)
{
/* TODO: handle possible ERROR here so that the bgworker will not go down. */
hash_search(active_tables_map, active_table_file_entry, HASH_ENTER, NULL);
ACTIVE_TABLE_ENTER(active_table_file_entry, NULL);
}
/* TODO: hash_seq_term(&iter); */
LWLockRelease(diskquota_locks.active_table_lock);
Expand Down Expand Up @@ -919,7 +927,7 @@ get_active_tables_oid(void)
LWLockAcquire(diskquota_locks.active_table_lock, LW_EXCLUSIVE);
while ((active_table_file_entry = (DiskQuotaActiveTableFileEntry *)hash_seq_search(&iter)) != NULL)
{
entry = hash_search(active_tables_map, active_table_file_entry, HASH_ENTER_NULL, &found);
entry = ACTIVE_TABLE_ENTER(active_table_file_entry, &found);
if (entry) *entry = *active_table_file_entry;
}
LWLockRelease(diskquota_locks.active_table_lock);
Expand Down
32 changes: 23 additions & 9 deletions src/quotamodel.c
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,21 @@ static HTAB *table_size_map = NULL;
static HTAB *disk_quota_reject_map = NULL;
static HTAB *local_disk_quota_reject_map = NULL;

static time_t disk_quota_reject_last_overflow_report = 0;
static time_t local_disk_quota_reject_last_overflow_report = 0;

/*
 * Insert *keyPtr into the shared disk_quota_reject_map, capped at
 * MAX_DISK_QUOTA_REJECT_ENTRIES; on overflow only a lookup is performed and a
 * rate-limited warning is reported.
 *
 * NOTE(review): adjacent string literals lack separating spaces, so the
 * emitted message reads "...reached.Some out-of-limit schemas or roles will
 * be lostin rejectmap." — left as-is because regression expected-output files
 * may match the exact text; confirm before fixing.
 */
#define REJECT_MAP_ENTER(keyPtr, foundPtr) \
shm_hash_enter(disk_quota_reject_map, keyPtr, foundPtr, MAX_DISK_QUOTA_REJECT_ENTRIES, \
"[diskquota] Shared disk quota reject map size limit reached." \
"Some out-of-limit schemas or roles will be lost" \
"in rejectmap.", \
&disk_quota_reject_last_overflow_report)

/*
 * Insert *keyPtr into local_disk_quota_reject_map, capped at
 * MAX_DISK_QUOTA_REJECT_ENTRIES; %d is filled with the cap by
 * shm_hash_enter. Uses its own report timestamp so local- and shared-map
 * warnings are rate-limited independently.
 */
#define LOCAL_REJECT_MAP_ENTER(keyPtr, foundPtr) \
shm_hash_enter(local_disk_quota_reject_map, keyPtr, foundPtr, MAX_DISK_QUOTA_REJECT_ENTRIES, \
"[diskquota] the number of local reject map entries reached the limit (%d)", \
&local_disk_quota_reject_last_overflow_report)

static shmem_startup_hook_type prev_shmem_startup_hook = NULL;

/* functions to maintain the quota maps */
Expand Down Expand Up @@ -326,9 +341,12 @@ add_quota_to_rejectmap(QuotaType type, Oid targetOid, Oid tablespaceoid, bool se
keyitem.tablespaceoid = tablespaceoid;
keyitem.targettype = (uint32)type;
ereport(DEBUG1, (errmsg("[diskquota] Put object %u to rejectmap", targetOid)));
localrejectentry = (LocalRejectMapEntry *)hash_search(local_disk_quota_reject_map, &keyitem, HASH_ENTER, NULL);
localrejectentry->isexceeded = true;
localrejectentry->segexceeded = segexceeded;
localrejectentry = (LocalRejectMapEntry *)LOCAL_REJECT_MAP_ENTER(&keyitem, NULL);
if (localrejectentry)
{
localrejectentry->isexceeded = true;
localrejectentry->segexceeded = segexceeded;
}
}

/*
Expand Down Expand Up @@ -1305,13 +1323,9 @@ flush_local_reject_map(void)
*/
if (localrejectentry->isexceeded)
{
rejectentry = (GlobalRejectMapEntry *)hash_search(disk_quota_reject_map, (void *)&localrejectentry->keyitem,
HASH_ENTER_NULL, &found);
rejectentry = (GlobalRejectMapEntry *)REJECT_MAP_ENTER(&localrejectentry->keyitem, &found);
if (rejectentry == NULL)
{
ereport(WARNING, (errmsg("[diskquota] Shared disk quota reject map size limit reached."
"Some out-of-limit schemas or roles will be lost"
"in rejectmap.")));
continue;
}
/* new db objects which exceed quota limit */
Expand Down Expand Up @@ -2145,7 +2159,7 @@ refresh_rejectmap(PG_FUNCTION_ARGS)
*/
if (OidIsValid(rejectmapentry->keyitem.targetoid)) continue;

new_entry = hash_search(disk_quota_reject_map, &rejectmapentry->keyitem, HASH_ENTER_NULL, &found);
new_entry = REJECT_MAP_ENTER(&rejectmapentry->keyitem, &found);
if (!found && new_entry) memcpy(new_entry, rejectmapentry, sizeof(GlobalRejectMapEntry));
}
LWLockRelease(diskquota_locks.reject_map_lock);
Expand Down
31 changes: 28 additions & 3 deletions src/relation_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,22 @@
HTAB *relation_cache = NULL;
HTAB *relid_cache = NULL;

extern time_t active_tables_last_overflow_report;

/*
 * Insert *keyPtr into relation_cache via shm_hash_enter, capped at
 * diskquota_max_active_tables. Shares active_tables_last_overflow_report
 * (declared extern above) with the active-tables map, so all caches bounded
 * by this GUC are rate-limited by one timestamp. %d is filled with the cap.
 */
#define RELATION_CACHE_ENTER(keyPtr, foundPtr) \
shm_hash_enter(relation_cache, keyPtr, foundPtr, diskquota_max_active_tables, \
"[diskquota] the number of relation cache entries reached the limit, please increase " \
"the GUC value for diskquota.max_active_tables. Current " \
"diskquota.max_active_tables value: %d", \
&active_tables_last_overflow_report)

/*
 * Insert *keyPtr into relid_cache via shm_hash_enter, capped at
 * diskquota_max_active_tables; reuses active_tables_last_overflow_report for
 * warning rate-limiting, same as RELATION_CACHE_ENTER. %d is filled with the
 * cap.
 */
#define RELID_CACHE_ENTER(keyPtr, foundPtr) \
shm_hash_enter(relid_cache, keyPtr, foundPtr, diskquota_max_active_tables, \
"[diskquota] the number of relid cache entries reached the limit, please increase " \
"the GUC value for diskquota.max_active_tables. Current " \
"diskquota.max_active_tables value: %d", \
&active_tables_last_overflow_report)

static void update_relation_entry(Oid relid, DiskQuotaRelationCacheEntry *relation_entry,
DiskQuotaRelidCacheEntry *relid_entry);

Expand Down Expand Up @@ -177,10 +193,19 @@ update_relation_cache(Oid relid)
update_relation_entry(relid, &relation_entry_data, &relid_entry_data);

LWLockAcquire(diskquota_locks.relation_cache_lock, LW_EXCLUSIVE);
relation_entry = hash_search(relation_cache, &relation_entry_data.relid, HASH_ENTER, NULL);
relation_entry = RELATION_CACHE_ENTER(&relation_entry_data.relid, NULL);
if (relation_entry == NULL)
{
LWLockRelease(diskquota_locks.relation_cache_lock);
return;
}
memcpy(relation_entry, &relation_entry_data, sizeof(DiskQuotaRelationCacheEntry));

relid_entry = hash_search(relid_cache, &relid_entry_data.relfilenode, HASH_ENTER, NULL);
relid_entry = RELID_CACHE_ENTER(&relid_entry_data.relfilenode, NULL);
if (relid_entry == NULL)
{
LWLockRelease(diskquota_locks.relation_cache_lock);
return;
}
memcpy(relid_entry, &relid_entry_data, sizeof(DiskQuotaRelidCacheEntry));
LWLockRelease(diskquota_locks.relation_cache_lock);

Expand Down
42 changes: 25 additions & 17 deletions tests/regress/expected/test_activetable_limit.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-- table in 'diskquota not enabled database' should not be activetable
\! gpconfig -c diskquota.max_active_tables -v 2 > /dev/null
\! gpconfig -c diskquota.max_active_tables -v 5 > /dev/null
\! gpconfig -c diskquota.naptime -v 2 > /dev/null
\! gpstop -arf > /dev/null
\c
CREATE DATABASE test_tablenum_limit_01;
Expand All @@ -26,31 +27,38 @@ SELECT diskquota.wait_for_worker_new_epoch();
t
(1 row)

CREATE TABLE s.t1(i int) DISTRIBUTED BY (i); -- activetable = 1
INSERT INTO s.t1 SELECT generate_series(1, 100000); -- ok. diskquota soft limit does not check when first write
-- the other two active tables are in a different database
CREATE TABLE s.t1(i int) DISTRIBUTED BY (i); -- activetable = 3
CREATE TABLE s.t2(i int) DISTRIBUTED BY (i); -- activetable = 4
CREATE TABLE s.t3(i int) DISTRIBUTED BY (i); -- activetable = 5. expected warning.
WARNING: [diskquota] the number of active tables reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg0 127.0.1.1:6002 pid=995937)
WARNING: [diskquota] the number of active tables reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg2 127.0.1.1:6004 pid=995938)
WARNING: [diskquota] the number of active tables reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg1 127.0.1.1:6003 pid=995939)
CREATE TABLE s.t4(i int) DISTRIBUTED BY (i);
INSERT INTO s.t4 SELECT generate_series(1, 100000);
SELECT diskquota.wait_for_worker_new_epoch();
wait_for_worker_new_epoch
---------------------------
t
(1 row)

CREATE TABLE s.t2(i int) DISTRIBUTED BY (i); -- activetable = 2
INSERT INTO s.t2 SELECT generate_series(1, 10); -- expect failed
ERROR: schema's disk space quota exceeded with name: s
CREATE TABLE s.t3(i int) DISTRIBUTED BY (i); -- activetable = 3 should not crash.
INSERT INTO s.t3 SELECT generate_series(1, 10); -- expect failed
ERROR: schema's disk space quota exceeded with name: s
-- Q: why diskquota still works when activetable = 3?
-- A: the activetable limit by shmem size, calculate by hash_estimate_size()
-- the result will bigger than sizeof(DiskQuotaActiveTableEntry) * max_active_tables
-- the real capacity of this data structure based on the hash conflict probability.
-- so we can not predict when the data structure will be fill in fully.
--
-- this test case is useless, remove this if anyone dislike it.
-- but the hash capacity is smaller than 6, so the test case works for issue 51
INSERT INTO s.t1 SELECT generate_series(1, 10); -- should be successful
select count(*) from s.t1;
count
-------
10
(1 row)

-- altered reloid cache overflow check. expected warning.
vacuum full;
WARNING: [diskquota] the number of active tables reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5
WARNING: [diskquota] the number of altered reloid cache entries reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg0 127.0.1.1:6002 pid=995937)
WARNING: [diskquota] the number of altered reloid cache entries reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg1 127.0.1.1:6003 pid=995939)
WARNING: [diskquota] the number of altered reloid cache entries reached the limit, please increase the GUC value for diskquota.max_active_tables. Current diskquota.max_active_tables value: 5 (seg2 127.0.1.1:6004 pid=995938)
DROP EXTENSION diskquota;
\c contrib_regression
DROP DATABASE test_tablenum_limit_01;
DROP DATABASE test_tablenum_limit_02;
\! gpconfig -r diskquota.max_active_tables > /dev/null
\! gpconfig -c diskquota.naptime -v 0 > /dev/null
\! gpstop -arf > /dev/null
29 changes: 14 additions & 15 deletions tests/regress/sql/test_activetable_limit.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-- table in 'diskquota not enabled database' should not be activetable
\! gpconfig -c diskquota.max_active_tables -v 2 > /dev/null
\! gpconfig -c diskquota.max_active_tables -v 5 > /dev/null
\! gpconfig -c diskquota.naptime -v 2 > /dev/null
\! gpstop -arf > /dev/null

\c
Expand All @@ -24,24 +25,21 @@ SELECT diskquota.set_schema_quota('s', '1 MB');

SELECT diskquota.wait_for_worker_new_epoch();

CREATE TABLE s.t1(i int) DISTRIBUTED BY (i); -- activetable = 1
INSERT INTO s.t1 SELECT generate_series(1, 100000); -- ok. diskquota soft limit does not check when first write
-- the other two active tables are in a different database
CREATE TABLE s.t1(i int) DISTRIBUTED BY (i); -- activetable = 3
CREATE TABLE s.t2(i int) DISTRIBUTED BY (i); -- activetable = 4
CREATE TABLE s.t3(i int) DISTRIBUTED BY (i); -- activetable = 5. expected warning.
CREATE TABLE s.t4(i int) DISTRIBUTED BY (i);

INSERT INTO s.t4 SELECT generate_series(1, 100000);

SELECT diskquota.wait_for_worker_new_epoch();

CREATE TABLE s.t2(i int) DISTRIBUTED BY (i); -- activetable = 2
INSERT INTO s.t2 SELECT generate_series(1, 10); -- expect failed
CREATE TABLE s.t3(i int) DISTRIBUTED BY (i); -- activetable = 3 should not crash.
INSERT INTO s.t3 SELECT generate_series(1, 10); -- expect failed
INSERT INTO s.t1 SELECT generate_series(1, 10); -- should be successful
select count(*) from s.t1;

-- Q: why diskquota still works when activetable = 3?
-- A: the activetable limit by shmem size, calculate by hash_estimate_size()
-- the result will bigger than sizeof(DiskQuotaActiveTableEntry) * max_active_tables
-- the real capacity of this data structure based on the hash conflict probability.
-- so we can not predict when the data structure will be fill in fully.
--
-- this test case is useless, remove this if anyone dislike it.
-- but the hash capacity is smaller than 6, so the test case works for issue 51
-- altered reloid cache overflow check. expected warning.
vacuum full;

DROP EXTENSION diskquota;

Expand All @@ -50,4 +48,5 @@ DROP DATABASE test_tablenum_limit_01;
DROP DATABASE test_tablenum_limit_02;

\! gpconfig -r diskquota.max_active_tables > /dev/null
\! gpconfig -c diskquota.naptime -v 0 > /dev/null
\! gpstop -arf > /dev/null

0 comments on commit f7b0ca1

Please sign in to comment.