Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADBDEV-6156 Count startup memory of each process when using resource groups #1023

Open
wants to merge 26 commits into
base: adb-6.x-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
635f0da
Count startup memory of each process when using resource groups
Aug 23, 2024
f075158
Try to fix diff caused by gpstop
Oct 21, 2024
f8bd708
Merge branch 'adb-6.x-dev' into ADBDEV-6156
Oct 23, 2024
da24513
Use the actual number of startup chunks instead of a hardcoded value
Oct 24, 2024
c1acd81
Merge branch 'adb-6.x-dev' into ADBDEV-6156
Oct 24, 2024
fb14e74
Add resGroupPallocIgnoreStartup, which copies the old resGroupPalloc
Oct 30, 2024
cdf357a
Add new test case which tests redzone
Oct 31, 2024
ec269c9
Ignore generated file
Oct 31, 2024
8c71e1c
Ignore function creation output
Oct 31, 2024
4579e61
Add a comment for resGroupPallocIgnoreStartup
Oct 31, 2024
ef84cdb
Consider startup memory only if allocation was successful
Nov 1, 2024
cf1b560
Bring out allocation logic to a dedicated function
Nov 14, 2024
562ac84
Reword comments so they would be more clear
Nov 14, 2024
94328c9
Merge branch 'adb-6.x-dev' into ADBDEV-6156
Nov 14, 2024
73582b3
Remove redundant test
Nov 14, 2024
4a9360e
Make resGroupPallocImpl static
Nov 15, 2024
475f3cb
Fix library path for test output
Nov 22, 2024
ef3a7e5
Merge branch 'adb-6.x-dev' into ADBDEV-6156
Nov 22, 2024
524afa5
new test
KnightMurloc Dec 26, 2024
021c829
fix test
KnightMurloc Dec 27, 2024
bd7ab6e
rework test
KnightMurloc Jan 14, 2025
c765b67
rework resGroupPalloc
KnightMurloc Jan 15, 2025
0c1831d
improve the test
KnightMurloc Jan 16, 2025
caa4e57
improve the test
KnightMurloc Jan 16, 2025
492e4bd
Merge branch 'adb-6.x-dev' into ADBDEV-6156
KnightMurloc Jan 16, 2025
c4f373c
reduce diff
KnightMurloc Jan 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/backend/utils/mmgr/vmem_tracker.c
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,12 @@ VmemTracker_ReleaseVmem(int64 toBeFreedRequested)
}
}

/*
 * Return the current value of startupChunks, i.e. the chunk count that
 * VmemTracker_RegisterStartupMemory() adds to the segment-wide vmem counter
 * for this process's startup memory.
 */
int32
VmemTracker_GetStartupChunks(void)
{
	return startupChunks;
}

/*
* Register the startup memory to vmem tracker.
*
Expand Down Expand Up @@ -670,6 +676,8 @@ VmemTracker_RegisterStartupMemory(int64 bytes)
pg_atomic_add_fetch_u32((pg_atomic_uint32 *) segmentVmemChunks,
startupChunks);

ResGroupProcAddStartupChunks(startupChunks);

/*
* Step 2, check if an OOM error should be raised by allocating 0 chunk.
*/
Expand All @@ -692,6 +700,8 @@ VmemTracker_UnregisterStartupMemory(void)
pg_atomic_sub_fetch_u32((pg_atomic_uint32 *) &MySessionState->sessionVmem,
startupChunks);

ResGroupProcSubStartupChunks(startupChunks);

trackedBytes -= startupBytes;
trackedVmemChunks -= startupChunks;

Expand Down
24 changes: 24 additions & 0 deletions src/backend/utils/resgroup/resgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -1594,6 +1594,30 @@ selfDetachResGroup(ResGroupData *group, ResGroupSlotData *slot)
selfUnsetGroup();
}

/*
 * Record the process's startup memory in its own resource-group memory
 * usage counter (self->memUsage) before any resource group is assigned.
 * The amount recorded here is later added to the resource group by
 * selfAttachResGroup.
 *
 * chunks: number of startup chunks to add.
 *
 * Does nothing when resource groups are not enabled.
 */
void
ResGroupProcAddStartupChunks(int32 chunks)
{
	if (!IsResGroupEnabled())
		return;

	self->memUsage += chunks;
}

/*
* Subtract startup memory at cleanup. This memory should already have been
* subtracted from a resource group via selfDetachResGroup.
* Strictly speaking, this is not needed because a running process always has
* startup memory consumption, but it is kept for symmetry.
*/
void
ResGroupProcSubStartupChunks(int32 chunks)
Copy link
Member

@andr-sokolov andr-sokolov Jan 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about removing this function and calling ResGroupProcAddStartupChunks(-startupChunks) instead of ResGroupProcSubStartupChunks(startupChunks) or removing the chunks argument and using VmemTracker_GetStartupChunks()?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed.

{
if (IsResGroupEnabled())
self->memUsage -= chunks;
}

/*
* Initialize the members of a slot
*/
Expand Down
3 changes: 3 additions & 0 deletions src/include/utils/resgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ extern Oid ResGroupGetGroupIdBySessionId(int sessionId);
extern char *getCpuSetByRole(const char *cpuset);
extern void checkCpuSetByRole(const char *cpuset);

extern void ResGroupProcAddStartupChunks(int32 chunks);
extern void ResGroupProcSubStartupChunks(int32 chunks);

/*
 * Forward the arguments to elog() only when Debug_resource_group is set;
 * otherwise expand to a no-op.
 *
 * NOTE(review): the expansion itself ends with a semicolon after
 * while(false), so "LOG_RESGROUP_DEBUG(...);" yields an extra empty
 * statement.  Using the macro as the unbraced body of an if that has an
 * else branch will therefore not compile.  Confirm that no call site omits
 * its own semicolon before dropping the trailing one here.
 */
#define LOG_RESGROUP_DEBUG(...) \
do {if (Debug_resource_group) elog(__VA_ARGS__); } while(false);

Expand Down
1 change: 1 addition & 0 deletions src/include/utils/vmem_tracker.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ extern void VmemTracker_UnregisterStartupMemory(void);
extern void VmemTracker_RequestWaiver(int64 waiver_bytes);
extern void VmemTracker_ResetWaiver(void);
extern int64 VmemTracker_Fault(int32 reason, int64 arg);
extern int32 VmemTracker_GetStartupChunks(void);

extern int32 RedZoneHandler_GetRedZoneLimitChunks(void);
extern int32 RedZoneHandler_GetRedZoneLimitMB(void);
Expand Down
60 changes: 60 additions & 0 deletions src/test/isolation2/expected/resgroup/resgroup_startup_memory.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
-- start_ignore
! gpconfig -c runaway_detector_activation_percent -v 10;
! gpstop -rai;

drop table if exists t1;
drop role if exists test;
drop resource group test_group;
-- end_ignore

create resource group test_group with (cpu_rate_limit=20, memory_limit=15, memory_shared_quota=100, memory_spill_ratio=0);
CREATE

create role test with resource group test_group;
CREATE

set role test;
SET
create table t1 (a int) distributed by (a);
CREATE
insert into t1 select a from generate_series(1, 10) a;
INSERT 10
alter table t1 set distributed randomly;
ALTER

-- Test that the starting memory is visible to the resource group.
1: set role test;
SET
1&: select count(*) from t1 where pg_sleep(1) is not null; <waiting ...>

2: select segment, mem.* from gp_toolkit.gp_resgroup_status, json_object_keys(memory_usage) as segment, json_to_record(memory_usage -> segment) mem (used int) where rsgname = 'test_group';
segment | used
---------+------
-1 | 12
0 | 12
1 | 12
2 | 12
(4 rows)
1<: <... completed>
count
-------
10
(1 row)
1q: ... <quitting>
2q: ... <quitting>

-- The runaway detector test. A query with a large number of slices should
-- be terminated due to high memory consumption.
select count(*) from t1 a1 join t1 a2 using(a) join t1 a3 using(a) join t1 a4 using(a) join t1 a5 using(a) join t1 a6 using(a) join t1 a7 using(a) join t1 a8 using(a) join t1 a9 using(a) join t1 a10 using(a);
ERROR: Canceling query because of high VMEM usage. current group id is 712716, group memory usage 133 MB, group shared memory quota is 102 MB, slot memory quota is 0 MB, global freechunks memory is 277 MB, global safe memory threshold is 277 MB (runaway_cleaner.c:197) (seg1 slice10 172.18.0.3:6003 pid=88018) (runaway_cleaner.c:197)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rewrite the test so that the memory-consumption error occurs for different reasons before and after the patch.


drop table t1;
DROP
reset role;
RESET
drop role test;
DROP
drop resource group test_group;
DROP
-- start_ignore
-- end_ignore
8 changes: 6 additions & 2 deletions src/test/isolation2/input/resgroup/resgroup_move_query.source
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
--
-- end_matchsubs

-- start_ignore
! gpstop -rai;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this line added?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reverted

-- end_ignore

CREATE OR REPLACE FUNCTION resGroupPalloc(float) RETURNS int AS
'@abs_builddir@/../regress/regress@DLSUFFIX@', 'resGroupPalloc'
LANGUAGE C READS SQL DATA;
Expand Down Expand Up @@ -135,7 +139,7 @@ SELECT num_running FROM gp_toolkit.gp_resgroup_status WHERE rsgname='rg_move_que
1&: SELECT pg_sleep(3);
2: SET ROLE role_move_query_mem_small;
2: BEGIN;
2: SELECT hold_memory_by_percent_on_qe(1,0.1);
2: SELECT hold_memory_by_percent_on_qe(1,0.2);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is 0.1 replaced with 0.2 here and below?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reverted

3&: SELECT gp_toolkit.pg_resgroup_move_query(pid, 'rg_move_query') FROM pg_stat_activity WHERE query LIKE '%hold_memory_by_percent_on_qe%' AND rsgname='rg_move_query_mem_small';
1<:
-- connection 1 finished, it will wake up connection 3
Expand All @@ -150,7 +154,7 @@ SELECT num_running FROM gp_toolkit.gp_resgroup_status WHERE rsgname='rg_move_que
1: ALTER RESOURCE GROUP rg_move_query SET memory_limit 0;
1: SET ROLE role_move_query_mem_small;
1: BEGIN;
1: SELECT hold_memory_by_percent_on_qe(1,0.1);
1: SELECT hold_memory_by_percent_on_qe(1,0.2);
2: SELECT gp_toolkit.pg_resgroup_move_query(pid, 'rg_move_query') FROM pg_stat_activity WHERE query LIKE '%hold_memory_by_percent_on_qe%' AND rsgname='rg_move_query_mem_small';
2: SELECT is_session_in_group(pid, 'rg_move_query') FROM pg_stat_activity WHERE query LIKE '%hold_memory_by_percent_on_qe%' AND state = 'idle in transaction';
1: END;
Expand Down
2 changes: 2 additions & 0 deletions src/test/isolation2/isolation2_resgroup_schedule
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,6 @@ test: resgroup/resgroup_dumpinfo
# test large group id
test: resgroup/resgroup_large_group_id

test: resgroup/resgroup_startup_memory
RekGRpth marked this conversation as resolved.
Show resolved Hide resolved

test: resgroup/disable_resgroup
22 changes: 11 additions & 11 deletions src/test/isolation2/output/resgroup/resgroup_bypass.source
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
----------------+----------+---------
rg_bypass_test | 0 | 0.0
rg_bypass_test | 1 | 12.0
rg_bypass_test | 1 | 24.0
bandetto marked this conversation as resolved.
Show resolved Hide resolved
(2 rows)
61: SELECT * FROM eat_memory_on_qd_large;
ERROR: Out of memory
Expand Down Expand Up @@ -202,8 +202,8 @@ BEGIN
SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
----------------+----------+---------
rg_bypass_test | 0 | 4.0
rg_bypass_test | 1 | 0.0
rg_bypass_test | 0 | 16.0
rg_bypass_test | 1 | 12.0
(2 rows)
61: SELECT * FROM eat_memory_on_one_slice;
count
Expand All @@ -213,8 +213,8 @@ SELECT * FROM memory_result;
SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
----------------+----------+---------
rg_bypass_test | 0 | 8.0
rg_bypass_test | 1 | 0.0
rg_bypass_test | 0 | 20.0
rg_bypass_test | 1 | 12.0
(2 rows)
61: SELECT * FROM eat_memory_on_one_slice;
ERROR: Out of memory (seg0 slice1 127.0.0.1:25432 pid=336)
Expand All @@ -234,7 +234,7 @@ SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
----------------+----------+---------
rg_bypass_test | 0 | 0.0
rg_bypass_test | 1 | 0.0
rg_bypass_test | 1 | 12.0
(2 rows)
61q: ... <quitting>

Expand All @@ -256,8 +256,8 @@ BEGIN
SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
----------------+----------+---------
rg_bypass_test | 0 | 4.0
rg_bypass_test | 1 | 0.0
rg_bypass_test | 0 | 16.0
rg_bypass_test | 1 | 12.0
(2 rows)
61: SELECT * FROM eat_memory_on_slices;
count
Expand All @@ -267,8 +267,8 @@ SELECT * FROM memory_result;
SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
----------------+----------+---------
rg_bypass_test | 0 | 8.0
rg_bypass_test | 1 | 0.0
rg_bypass_test | 0 | 20.0
rg_bypass_test | 1 | 12.0
(2 rows)
61: SELECT * FROM eat_memory_on_slices;
ERROR: Out of memory (seg0 slice2 127.0.0.1:25432 pid=354)
Expand All @@ -288,7 +288,7 @@ SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
----------------+----------+---------
rg_bypass_test | 0 | 0.0
rg_bypass_test | 1 | 0.0
rg_bypass_test | 1 | 12.0
(2 rows)
61q: ... <quitting>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
-----------------+----------+---------
rg1_memory_test | 0 | 0.0
rg1_memory_test | 1 | 20.0
rg1_memory_test | 1 | 30.0
rg2_memory_test | 0 | 0.0
rg2_memory_test | 1 | 0.0
(4 rows)
Expand Down Expand Up @@ -144,8 +144,8 @@ BEGIN
SELECT * FROM memory_result;
rsgname | ismaster | avg_mem
-----------------+----------+---------
rg1_memory_test | 0 | 20.0
rg1_memory_test | 1 | 0.0
rg1_memory_test | 0 | 30.0
rg1_memory_test | 1 | 10.0
rg2_memory_test | 0 | 0.0
rg2_memory_test | 1 | 0.0
(4 rows)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
--
-- end_matchsubs

-- start_ignore
! gpstop -rai;
-- end_ignore

CREATE OR REPLACE FUNCTION resGroupPalloc(float) RETURNS int AS '@abs_builddir@/../regress/regress@DLSUFFIX@', 'resGroupPalloc' LANGUAGE C READS SQL DATA;
CREATE

Expand Down Expand Up @@ -185,7 +189,7 @@ SET
SET
2: BEGIN;
BEGIN
2: SELECT hold_memory_by_percent_on_qe(1,0.1);
2: SELECT hold_memory_by_percent_on_qe(1,0.2);
hold_memory_by_percent_on_qe
------------------------------
0
Expand Down Expand Up @@ -220,7 +224,7 @@ ALTER
SET
1: BEGIN;
BEGIN
1: SELECT hold_memory_by_percent_on_qe(1,0.1);
1: SELECT hold_memory_by_percent_on_qe(1,0.2);
hold_memory_by_percent_on_qe
------------------------------
0
Expand Down
49 changes: 49 additions & 0 deletions src/test/isolation2/sql/resgroup/resgroup_startup_memory.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
-- start_ignore
! gpconfig -c runaway_detector_activation_percent -v 10;
! gpstop -rai;

drop table if exists t1;
drop role if exists test;
drop resource group test_group;
-- end_ignore

create resource group test_group with (cpu_rate_limit=20, memory_limit=15, memory_shared_quota=100, memory_spill_ratio=0);

create role test with resource group test_group;

set role test;
create table t1 (a int) distributed by (a);
insert into t1 select a from generate_series(1, 10) a;
alter table t1 set distributed randomly;

-- Test that the starting memory is visible to the resource group.
1: set role test;
1&: select count(*) from t1 where pg_sleep(1) is not null;

2: select segment, mem.* from gp_toolkit.gp_resgroup_status, json_object_keys(memory_usage)
as segment, json_to_record(memory_usage -> segment) mem (used int) where rsgname = 'test_group';
1<:
1q:
2q:

-- The runaway detector test. A query with a large number of slices should
-- be terminated due to high memory consumption.
select count(*) from t1 a1
join t1 a2 using(a)
join t1 a3 using(a)
join t1 a4 using(a)
join t1 a5 using(a)
join t1 a6 using(a)
join t1 a7 using(a)
join t1 a8 using(a)
join t1 a9 using(a)
join t1 a10 using(a);

drop table t1;
reset role;
drop role test;
drop resource group test_group;
-- start_ignore
! gpconfig -c runaway_detector_activation_percent -v 100;
! gpstop -rai;
-- end_ignore
18 changes: 18 additions & 0 deletions src/test/regress/regress_gp.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include "utils/memutils.h"
#include "utils/resource_manager.h"
#include "utils/timestamp.h"
#include "utils/vmem_tracker.h"

/* table_functions test */
extern Datum multiset_example(PG_FUNCTION_ARGS);
Expand Down Expand Up @@ -632,6 +633,7 @@ PG_FUNCTION_INFO_V1(resGroupPalloc);
Datum
resGroupPalloc(PG_FUNCTION_ARGS)
{
static int32 startUpMBToAccount = -1;
float ratio = PG_GETARG_FLOAT8(0);
int memLimit, slotQuota, sharedQuota;
int size;
Expand All @@ -641,8 +643,24 @@ resGroupPalloc(PG_FUNCTION_ARGS)
if (!IsResGroupEnabled())
PG_RETURN_INT32(0);

if (startUpMBToAccount == -1)
{
startUpMBToAccount =
(VmemTracker_GetStartupChunks())
<< (VmemTracker_GetChunkSizeInBits() - BITS_IN_MB);
}

ResGroupGetMemInfo(&memLimit, &slotQuota, &sharedQuota);
size = ceilf(memLimit * ratio);
// At startup, the backend process is already consuming some amount of
// memory. In order not to complicate the logic of the tests, we take this
// memory into account when allocating memory for tests.
if (startUpMBToAccount)
andr-sokolov marked this conversation as resolved.
Show resolved Hide resolved
{
int32 tmp = Max(0, startUpMBToAccount - size);
size = Max(0, size - startUpMBToAccount);
startUpMBToAccount = tmp;
}
count = size / 512;
for (i = 0; i < count; i++)
MemoryContextAlloc(TopMemoryContext, 512 * 1024 * 1024);
Expand Down
Loading