diff --git a/src/backend/utils/mmgr/vmem_tracker.c b/src/backend/utils/mmgr/vmem_tracker.c
index d2b729c97b38..1996d1917da6 100644
--- a/src/backend/utils/mmgr/vmem_tracker.c
+++ b/src/backend/utils/mmgr/vmem_tracker.c
@@ -633,6 +633,12 @@ VmemTracker_ReleaseVmem(int64 toBeFreedRequested)
 	}
 }
 
+int32
+VmemTracker_GetStartupChunks(void)
+{
+	return startupChunks;
+}
+
 /*
  * Register the startup memory to vmem tracker.
  *
@@ -670,6 +676,8 @@ VmemTracker_RegisterStartupMemory(int64 bytes)
 	pg_atomic_add_fetch_u32((pg_atomic_uint32 *) segmentVmemChunks,
 							startupChunks);
 
+	ResGroupProcAddStartupChunks(startupChunks);
+
 	/*
 	 * Step 2, check if an OOM error should be raised by allocating 0 chunk.
 	 */
diff --git a/src/backend/utils/resgroup/resgroup.c b/src/backend/utils/resgroup/resgroup.c
index 65e52db46ca9..91c23958e9b0 100644
--- a/src/backend/utils/resgroup/resgroup.c
+++ b/src/backend/utils/resgroup/resgroup.c
@@ -1594,6 +1594,17 @@ selfDetachResGroup(ResGroupData *group, ResGroupSlotData *slot)
 	selfUnsetGroup();
 }
 
+/*
+ * Add startup memory before a resource group is assigned. This memory
+ * will later be added to the group via selfAttachResGroup().
+ */
+void
+ResGroupProcAddStartupChunks(int32 chunks)
+{
+	if (IsResGroupEnabled())
+		self->memUsage += chunks;
+}
+
 /*
  * Initialize the members of a slot
  */
diff --git a/src/include/utils/resgroup.h b/src/include/utils/resgroup.h
index 87b57d65c66b..c20bc8674cc0 100644
--- a/src/include/utils/resgroup.h
+++ b/src/include/utils/resgroup.h
@@ -235,6 +235,8 @@ extern Oid ResGroupGetGroupIdBySessionId(int sessionId);
 extern char *getCpuSetByRole(const char *cpuset);
 extern void checkCpuSetByRole(const char *cpuset);
 
+extern void ResGroupProcAddStartupChunks(int32 chunks);
+
 #define LOG_RESGROUP_DEBUG(...) \
 	do {if (Debug_resource_group) elog(__VA_ARGS__); } while(false);
 
diff --git a/src/include/utils/vmem_tracker.h b/src/include/utils/vmem_tracker.h
index 448f3a6bf833..637e53ce0a44 100644
--- a/src/include/utils/vmem_tracker.h
+++ b/src/include/utils/vmem_tracker.h
@@ -66,6 +66,7 @@ extern void VmemTracker_UnregisterStartupMemory(void);
 extern void VmemTracker_RequestWaiver(int64 waiver_bytes);
 extern void VmemTracker_ResetWaiver(void);
 extern int64 VmemTracker_Fault(int32 reason, int64 arg);
+extern int32 VmemTracker_GetStartupChunks(void);
 
 extern int32 RedZoneHandler_GetRedZoneLimitChunks(void);
 extern int32 RedZoneHandler_GetRedZoneLimitMB(void);
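Note on ordering (reviewer aside, not part of the patch): VmemTracker_RegisterStartupMemory() runs during backend startup, before the backend is attached to any resource group, so the new call parks the startup chunks in self->memUsage; per the comment above, selfAttachResGroup() later folds that into whichever group the backend joins. A minimal standalone sketch of that two-step bookkeeping, with proc_mem_usage/group_mem_usage as hypothetical stand-ins for self->memUsage and the group's counter:

    /* Standalone sketch of the two-step accounting; not GPDB code. */
    #include <stdio.h>

    static int proc_mem_usage = 0;   /* stand-in for self->memUsage */
    static int group_mem_usage = 0;  /* stand-in for the group's counter */

    static void
    register_startup_chunks(int chunks)
    {
        /* Runs before any group is assigned: only the process counter moves. */
        proc_mem_usage += chunks;
    }

    static void
    attach_to_group(void)
    {
        /* On attach, the accumulated process usage is charged to the group. */
        group_mem_usage += proc_mem_usage;
    }

    int
    main(void)
    {
        register_startup_chunks(12);    /* e.g. twelve 1 MB startup chunks */
        attach_to_group();
        printf("group usage after attach: %d chunks\n", group_mem_usage);
        return 0;
    }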
diff --git a/src/test/isolation2/input/resgroup/resgroup_startup_memory.source b/src/test/isolation2/input/resgroup/resgroup_startup_memory.source
new file mode 100644
index 000000000000..4590d76aaf88
--- /dev/null
+++ b/src/test/isolation2/input/resgroup/resgroup_startup_memory.source
@@ -0,0 +1,113 @@
+-- start_ignore
+drop table if exists t1;
+drop role if exists test1;
+drop role if exists test2;
+drop resource group rg1;
+drop resource group rg2;
+-- end_ignore
+
+create extension if not exists gp_inject_fault;
+
+create or replace function resGroupPalloc(float) returns int as '@abs_builddir@/../regress/regress@DLSUFFIX@', 'resGroupPalloc' language C reads sql data;
+create or replace function hold_memory_by_percent_on_qe(float) returns int as $$
+select resGroupPalloc($1) from gp_dist_random('gp_id')
+$$ language sql;
+
+create or replace function repeatPalloc(int, int) returns int as
+'@abs_builddir@/../regress/regress@DLSUFFIX@', 'repeatPalloc'
+ language C reads sql data;
+
+-- In the next test, we need to be able to allocate 100% of the available
+-- memory. Therefore, we set the memory_limit for the built-in groups to 0.
+-- memory_spill_ratio must also be set to 0.
+alter resource group admin_group set memory_spill_ratio 0;
+alter resource group default_group set memory_spill_ratio 0;
+alter resource group admin_group set memory_limit 0;
+alter resource group default_group set memory_limit 0;
+
+create resource group rg1 with (cpu_rate_limit=20, memory_limit=15, memory_shared_quota=100);
+create resource group rg2 with (cpu_rate_limit=20, memory_limit=85, memory_shared_quota=100);
+
+create role test1 with resource group rg1;
+create role test2 with resource group rg2;
+
+1: set role test1;
+1: create table t1 (a int) distributed randomly;
+-- We need a random distribution so that each select in the join is
+-- performed in a separate slice.
+-- Insert exactly one row on each segment.
+0U: insert into t1 values (1);
+1U: insert into t1 values (2);
+2U: insert into t1 values (3);
+0Uq:
+1Uq:
+2Uq:
+
+-- Force nested loop joins to avoid unnecessary memory allocations.
+1: set enable_nestloop = on;
+1: set enable_hashjoin = off;
+
+set role test2;
+begin;
+
+-- Allocate all the memory in the second resource group.
+select hold_memory_by_percent_on_qe(1);
+-- Wait for all backends to start before executing the query. This is necessary
+-- to avoid running hold_memory before all backends are up.
+select gp_inject_fault_infinite('executor_pre_tuple_processed', 'suspend', dbid)
+	from gp_segment_configuration where role = 'p' and content > -1;
+-- The first resource group has 682 * 0.15 = 102 MB of memory available.
+-- 6 slices consume 12 * 6 = 72 MB, plus one slice explicitly allocates 40 MB of
+-- memory, so 72 + 40 = 112 MB in total. The query is expected to be canceled
+-- due to lack of memory in the resource group, but not in the vmem tracker.
+1&: with sleep_cte as
+(
+	select a, pg_sleep(60) is null from t1
+),
+alloc_cte as
+(
+	select * from sleep_cte where repeatPalloc(40, 1) is not null
+)
+select * from alloc_cte a1
+	join sleep_cte a2 using(a)
+	join sleep_cte a3 using(a)
+	join sleep_cte a4 using(a)
+	join sleep_cte a5 using(a);
+select gp_wait_until_triggered_fault('executor_pre_tuple_processed', 6, dbid)
+	from gp_segment_configuration where role = 'p' and content > -1;
+select gp_inject_fault('executor_pre_tuple_processed', 'reset', dbid)
+	from gp_segment_configuration where role = 'p' and content > -1;
+1<:
+rollback;
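Reviewer aside: the arithmetic in the comment above can be sanity-checked in isolation. The 682 MB segment vmem limit and the 12 MB per-backend startup memory are environment-specific figures taken from the test comment, not constants:

    #include <stdio.h>

    /* Back-of-envelope check of the test comment's numbers; the inputs are
     * assumptions copied from the comment, not derived values. */
    int
    main(void)
    {
        int quota   = (int) (682 * 0.15);  /* rg1's share: 102 MB */
        int startup = 6 * 12;              /* six slices' startup memory: 72 MB */
        int palloc  = 40;                  /* the explicit repeatPalloc(40, 1) */

        printf("needed %d MB vs group quota %d MB\n", startup + palloc, quota);
        /* 112 > 102: the group OOMs while the 682 MB vmem limit is never reached */
        return 0;
    }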
+-- Test that the startup memory is visible to the resource group.
+select gp_inject_fault('executor_pre_tuple_processed', 'suspend', dbid)
+	from gp_segment_configuration where role = 'p' and content > -1;
+1&: select count(*) from t1;
+select gp_wait_until_triggered_fault('executor_pre_tuple_processed', 1, dbid)
+	from gp_segment_configuration where role = 'p' and content > -1;
+
+select segment, mem.* from gp_toolkit.gp_resgroup_status, json_object_keys(memory_usage)
+	as segment, json_to_record(memory_usage -> segment) mem (used int) where rsgname = 'rg1';
+
+select gp_inject_fault('executor_pre_tuple_processed', 'reset', dbid)
+	from gp_segment_configuration where role = 'p' and content > -1;
+1<:
+1q:
+
+reset role;
+drop table t1;
+drop role test1;
+drop role test2;
+drop resource group rg1;
+drop resource group rg2;
+
+alter resource group default_group set memory_limit 30;
+alter resource group default_group set memory_spill_ratio 10;
+
+alter resource group admin_group set memory_limit 10;
+alter resource group admin_group set memory_spill_ratio 10;
+
+drop function repeatPalloc(int, int);
+drop function hold_memory_by_percent_on_qe(float);
+drop function resGroupPalloc(float);
diff --git a/src/test/isolation2/isolation2_resgroup_schedule b/src/test/isolation2/isolation2_resgroup_schedule
index 065067e14d45..17e191bde5a5 100644
--- a/src/test/isolation2/isolation2_resgroup_schedule
+++ b/src/test/isolation2/isolation2_resgroup_schedule
@@ -59,4 +59,6 @@ test: resgroup/resgroup_dumpinfo
 # test larget group id
 test: resgroup/resgroup_large_group_id
 
+test: resgroup/resgroup_startup_memory
+
 test: resgroup/disable_resgroup
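The expected-output updates below all shift by the same per-backend startup figure (12 MB in this environment): once a backend attaches to a group, its startup memory now shows up in the group's usage. The per-segment numbers can be inspected with the same gp_toolkit query the new test uses; for example, pointed at the bypass test's group (the rsgname value here is an assumption for illustration):

    select segment, mem.used
    from gp_toolkit.gp_resgroup_status,
         json_object_keys(memory_usage) as segment,
         json_to_record(memory_usage -> segment) mem (used int)
    where rsgname = 'rg_bypass_test';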
diff --git a/src/test/isolation2/output/resgroup/resgroup_bypass.source b/src/test/isolation2/output/resgroup/resgroup_bypass.source
index d4c311d316c5..793f100ef57e 100644
--- a/src/test/isolation2/output/resgroup/resgroup_bypass.source
+++ b/src/test/isolation2/output/resgroup/resgroup_bypass.source
@@ -161,7 +161,7 @@ SELECT * FROM memory_result;
     rsgname     | ismaster | avg_mem 
 ----------------+----------+---------
  rg_bypass_test | 0        | 0.0
- rg_bypass_test | 1        | 12.0
+ rg_bypass_test | 1        | 24.0
 (2 rows)
 61: SELECT * FROM eat_memory_on_qd_large;
 ERROR:  Out of memory
@@ -202,8 +202,8 @@ BEGIN
 SELECT * FROM memory_result;
     rsgname     | ismaster | avg_mem 
 ----------------+----------+---------
- rg_bypass_test | 0        | 4.0
- rg_bypass_test | 1        | 0.0
+ rg_bypass_test | 0        | 16.0
+ rg_bypass_test | 1        | 12.0
 (2 rows)
 61: SELECT * FROM eat_memory_on_one_slice;
  count 
 -------
 (3 rows)
 SELECT * FROM memory_result;
 SELECT * FROM memory_result;
     rsgname     | ismaster | avg_mem 
 ----------------+----------+---------
- rg_bypass_test | 0        | 8.0
- rg_bypass_test | 1        | 0.0
+ rg_bypass_test | 0        | 20.0
+ rg_bypass_test | 1        | 12.0
 (2 rows)
 61: SELECT * FROM eat_memory_on_one_slice;
 ERROR:  Out of memory (seg0 slice1 127.0.0.1:25432 pid=336)
@@ -234,7 +234,7 @@ SELECT * FROM memory_result;
     rsgname     | ismaster | avg_mem 
 ----------------+----------+---------
  rg_bypass_test | 0        | 0.0
- rg_bypass_test | 1        | 0.0
+ rg_bypass_test | 1        | 12.0
 (2 rows)
 61q: ...
@@ -256,8 +256,8 @@ BEGIN
 SELECT * FROM memory_result;
     rsgname     | ismaster | avg_mem 
 ----------------+----------+---------
- rg_bypass_test | 0        | 4.0
- rg_bypass_test | 1        | 0.0
+ rg_bypass_test | 0        | 16.0
+ rg_bypass_test | 1        | 12.0
 (2 rows)
 61: SELECT * FROM eat_memory_on_slices;
  count 
 -------
 (3 rows)
 SELECT * FROM memory_result;
 SELECT * FROM memory_result;
     rsgname     | ismaster | avg_mem 
 ----------------+----------+---------
- rg_bypass_test | 0        | 8.0
- rg_bypass_test | 1        | 0.0
+ rg_bypass_test | 0        | 20.0
+ rg_bypass_test | 1        | 12.0
 (2 rows)
 61: SELECT * FROM eat_memory_on_slices;
 ERROR:  Out of memory (seg0 slice2 127.0.0.1:25432 pid=354)
@@ -288,7 +288,7 @@ SELECT * FROM memory_result;
     rsgname     | ismaster | avg_mem 
 ----------------+----------+---------
  rg_bypass_test | 0        | 0.0
- rg_bypass_test | 1        | 0.0
+ rg_bypass_test | 1        | 12.0
 (2 rows)
 61q: ...
diff --git a/src/test/isolation2/output/resgroup/resgroup_memory_statistic.source b/src/test/isolation2/output/resgroup/resgroup_memory_statistic.source
index 052db8cdced2..6b679bb59865 100644
--- a/src/test/isolation2/output/resgroup/resgroup_memory_statistic.source
+++ b/src/test/isolation2/output/resgroup/resgroup_memory_statistic.source
@@ -80,7 +80,7 @@ SELECT * FROM memory_result;
      rsgname     | ismaster | avg_mem 
 -----------------+----------+---------
  rg1_memory_test | 0        | 0.0
- rg1_memory_test | 1        | 20.0
+ rg1_memory_test | 1        | 30.0
  rg2_memory_test | 0        | 0.0
  rg2_memory_test | 1        | 0.0
 (4 rows)
@@ -144,8 +144,8 @@ BEGIN
 SELECT * FROM memory_result;
      rsgname     | ismaster | avg_mem 
 -----------------+----------+---------
- rg1_memory_test | 0        | 20.0
- rg1_memory_test | 1        | 0.0
+ rg1_memory_test | 0        | 30.0
+ rg1_memory_test | 1        | 10.0
  rg2_memory_test | 0        | 0.0
  rg2_memory_test | 1        | 0.0
 (4 rows)
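Where the recurring 12 MB baseline comes from: with the accessor added by this patch it can be recomputed the same way resGroupPalloc does further below. A sketch, assuming BITS_IN_MB and the tracker accessors are visible as they are in the regress module; the concrete value is environment-dependent:

    #include "postgres.h"
    #include "utils/vmem_tracker.h"

    /* Per-backend startup memory in MB, mirroring the conversion used in
     * resGroupPalloc below; it evaluates to 12 in the environment these
     * expected outputs were recorded in. */
    static int32
    startup_memory_mb(void)
    {
        return VmemTracker_GetStartupChunks()
            << (VmemTracker_GetChunkSizeInBits() - BITS_IN_MB);
    }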
diff --git a/src/test/isolation2/output/resgroup/resgroup_startup_memory.source b/src/test/isolation2/output/resgroup/resgroup_startup_memory.source
new file mode 100644
index 000000000000..fb3c07a3c708
--- /dev/null
+++ b/src/test/isolation2/output/resgroup/resgroup_startup_memory.source
@@ -0,0 +1,178 @@
+-- start_ignore
+drop table if exists t1;
+drop role if exists test1;
+drop role if exists test2;
+drop resource group rg1;
+drop resource group rg2;
+-- end_ignore
+
+create extension if not exists gp_inject_fault;
+CREATE
+
+create or replace function resGroupPalloc(float) returns int as '/home/gpadmin/gpdb_src/src/test/isolation2/../regress/regress.so', 'resGroupPalloc' language C reads sql data;
+CREATE
+create or replace function hold_memory_by_percent_on_qe(float) returns int as $$ select resGroupPalloc($1) from gp_dist_random('gp_id') $$ language sql;
+CREATE
+
+create or replace function repeatPalloc(int, int) returns int as '/home/gpadmin/gpdb_src/src/test/isolation2/../regress/regress.so', 'repeatPalloc' language C reads sql data;
+CREATE
+
+-- In the next test, we need to be able to allocate 100% of the available
+-- memory. Therefore, we set the memory_limit for the built-in groups to 0.
+-- memory_spill_ratio must also be set to 0.
+alter resource group admin_group set memory_spill_ratio 0;
+ALTER
+alter resource group default_group set memory_spill_ratio 0;
+ALTER
+alter resource group admin_group set memory_limit 0;
+ALTER
+alter resource group default_group set memory_limit 0;
+ALTER
+
+create resource group rg1 with (cpu_rate_limit=20, memory_limit=15, memory_shared_quota=100);
+CREATE
+create resource group rg2 with (cpu_rate_limit=20, memory_limit=85, memory_shared_quota=100);
+CREATE
+
+create role test1 with resource group rg1;
+CREATE
+create role test2 with resource group rg2;
+CREATE
+
+1: set role test1;
+SET
+1: create table t1 (a int) distributed randomly;
+CREATE
+-- We need a random distribution so that each select in the join is
+-- performed in a separate slice.
+-- Insert exactly one row on each segment.
+0U: insert into t1 values (1);
+INSERT 1
+1U: insert into t1 values (2);
+INSERT 1
+2U: insert into t1 values (3);
+INSERT 1
+0Uq: ...
+1Uq: ...
+2Uq: ...
+
+-- Force nested loop joins to avoid unnecessary memory allocations.
+1: set enable_nestloop = on;
+SET
+1: set enable_hashjoin = off;
+SET
+
+set role test2;
+SET
+begin;
+BEGIN
+
+-- Allocate all the memory in the second resource group.
+select hold_memory_by_percent_on_qe(1);
+ hold_memory_by_percent_on_qe 
+------------------------------
+ 0
+(1 row)
+-- Wait for all backends to start before executing the query. This is necessary
+-- to avoid running hold_memory before all backends are up.
+select gp_inject_fault_infinite('executor_pre_tuple_processed', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content > -1;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+-- The first resource group has 682 * 0.15 = 102 MB of memory available.
+-- 6 slices consume 12 * 6 = 72 MB, plus one slice explicitly allocates 40 MB of
+-- memory, so 72 + 40 = 112 MB in total. The query is expected to be canceled
+-- due to lack of memory in the resource group, but not in the vmem tracker.
+1&: with sleep_cte as ( select a, pg_sleep(60) is null from t1 ), alloc_cte as ( select * from sleep_cte where repeatPalloc(40, 1) is not null ) select * from alloc_cte a1 join sleep_cte a2 using(a) join sleep_cte a3 using(a) join sleep_cte a4 using(a) join sleep_cte a5 using(a);
+select gp_wait_until_triggered_fault('executor_pre_tuple_processed', 6, dbid) from gp_segment_configuration where role = 'p' and content > -1;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+select gp_inject_fault('executor_pre_tuple_processed', 'reset', dbid) from gp_segment_configuration where role = 'p' and content > -1;
+ gp_inject_fault 
+-----------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+1<:  <... completed>
+ERROR:  Out of memory (seg0 slice3 172.22.0.2:6002 pid=4876)
+DETAIL:  Resource group memory limit reached
+rollback;
+ROLLBACK
+-- Test that the startup memory is visible to the resource group.
+select gp_inject_fault('executor_pre_tuple_processed', 'suspend', dbid) from gp_segment_configuration where role = 'p' and content > -1;
+ gp_inject_fault 
+-----------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+1&: select count(*) from t1;
+select gp_wait_until_triggered_fault('executor_pre_tuple_processed', 1, dbid) from gp_segment_configuration where role = 'p' and content > -1;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+
+select segment, mem.* from gp_toolkit.gp_resgroup_status, json_object_keys(memory_usage) as segment, json_to_record(memory_usage -> segment) mem (used int) where rsgname = 'rg1';
+ segment | used 
+---------+------
+ -1      | 12
+ 0       | 12
+ 1       | 12
+ 2       | 12
+(4 rows)
+
+select gp_inject_fault('executor_pre_tuple_processed', 'reset', dbid) from gp_segment_configuration where role = 'p' and content > -1;
+ gp_inject_fault 
+-----------------
+ Success:
+ Success:
+ Success:
+(3 rows)
+1<:  <... completed>
+ count 
+-------
+ 3
+(1 row)
+1q: ...
+
+reset role;
+RESET
+drop table t1;
+DROP
+drop role test1;
+DROP
+drop role test2;
+DROP
+drop resource group rg1;
+DROP
+drop resource group rg2;
+DROP
+
+alter resource group default_group set memory_limit 30;
+ALTER
+alter resource group default_group set memory_spill_ratio 10;
+ALTER
+
+alter resource group admin_group set memory_limit 10;
+ALTER
+alter resource group admin_group set memory_spill_ratio 10;
+ALTER
+
+drop function repeatPalloc(int, int);
+DROP
+drop function hold_memory_by_percent_on_qe(float);
+DROP
+drop function resGroupPalloc(float);
+DROP
diff --git a/src/test/regress/regress_gp.c b/src/test/regress/regress_gp.c
index 988a9a0d2252..638126609e5d 100644
--- a/src/test/regress/regress_gp.c
+++ b/src/test/regress/regress_gp.c
@@ -54,6 +54,7 @@
 #include "utils/memutils.h"
 #include "utils/resource_manager.h"
 #include "utils/timestamp.h"
+#include "utils/vmem_tracker.h"
 
 /* table_functions test */
 extern Datum multiset_example(PG_FUNCTION_ARGS);
@@ -632,6 +633,7 @@ PG_FUNCTION_INFO_V1(resGroupPalloc);
 Datum
 resGroupPalloc(PG_FUNCTION_ARGS)
 {
+	static int32 startUpMbRemains = -1;
 	float		ratio = PG_GETARG_FLOAT8(0);
 	int			memLimit, slotQuota, sharedQuota;
 	int			size;
@@ -641,8 +643,28 @@ resGroupPalloc(PG_FUNCTION_ARGS)
 	if (!IsResGroupEnabled())
 		PG_RETURN_INT32(0);
 
+	if (startUpMbRemains == -1)
+	{
+		startUpMbRemains =
+			(VmemTracker_GetStartupChunks())
+			<< (VmemTracker_GetChunkSizeInBits() - BITS_IN_MB);
+	}
+
 	ResGroupGetMemInfo(&memLimit, &slotQuota, &sharedQuota);
 	size = ceilf(memLimit * ratio);
+
+	/*
+	 * At startup, the backend process is already consuming some amount of
+	 * memory. In order not to complicate the logic of the tests, we take
+	 * this memory into account when allocating memory for the tests.
+	 */
+	if (startUpMbRemains >= size)
+	{
+		startUpMbRemains -= size;
+		PG_RETURN_INT32(0);
+	}
+	size -= startUpMbRemains;
+	startUpMbRemains = 0;
+
 	count = size / 512;
 	for (i = 0; i < count; i++)
 		MemoryContextAlloc(TopMemoryContext, 512 * 1024 * 1024);
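To close, a self-contained simulation of resGroupPalloc's startup compensation above (plain C, no GPDB dependencies; the 12 MB startup figure is an assumed example value, not a constant):

    #include <stdio.h>

    /* Simulates the compensation logic: early requests are satisfied out of
     * the memory the backend already consumed at startup; only the remainder
     * would actually be allocated. */
    static int startup_mb_remains = -1;

    static int
    alloc_for_test(int size_mb)
    {
        if (startup_mb_remains == -1)
            startup_mb_remains = 12;    /* assumed per-backend startup MB */

        if (startup_mb_remains >= size_mb)
        {
            startup_mb_remains -= size_mb;
            return 0;                   /* fully covered by startup memory */
        }
        size_mb -= startup_mb_remains;
        startup_mb_remains = 0;
        return size_mb;                 /* MB that would really be allocated */
    }

    int
    main(void)
    {
        printf("%d\n", alloc_for_test(8));  /* 0: covered by startup memory */
        printf("%d\n", alloc_for_test(8));  /* 4: last 4 MB of startup memory used */
        printf("%d\n", alloc_for_test(8));  /* 8: startup memory exhausted */
        return 0;
    }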