Skip to content

Commit

Permalink
expose metric to report reasons why full GCs were triggered (JuliaLan…
Browse files Browse the repository at this point in the history
…g#55826)

Additional GC observability tool.

This will help us to diagnose why some of our servers are triggering so
many full GCs in certain circumstances.
  • Loading branch information
d-netto authored Sep 27, 2024
1 parent 0dbb6eb commit 6e33dfb
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 2 deletions.
27 changes: 27 additions & 0 deletions base/timing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,33 @@ function gc_page_utilization_data()
return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false)
end

# Names of the reasons for which the GC counts a full sweep.
# Entry `i` (1-based) names the counter stored at C index `i - 1` of
# `jl_full_sweep_reasons`, so the order here must be kept in sync with the
# `FULL_SWEEP_REASON_*` macros in `src/gc-stock.h`.
const FULL_SWEEP_REASONS = [
    :FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL,
    :FULL_SWEEP_REASON_FORCED_FULL_SWEEP,
    :FULL_SWEEP_REASON_USER_MAX_EXCEEDED,
    :FULL_SWEEP_REASON_LARGE_PROMOTION_RATE,
]

"""
    Base.full_sweep_reasons()

Return a dictionary mapping each full sweep reason to the number of times it has occurred.

The reasons are:
- `:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL`: Full sweep was caused due to `always_full`
  being set in the GC debug environment
- `:FULL_SWEEP_REASON_FORCED_FULL_SWEEP`: Full sweep was forced by `GC.gc(true)`
- `:FULL_SWEEP_REASON_USER_MAX_EXCEEDED`: Full sweep was forced due to the system
  reaching the heap soft size limit
- `:FULL_SWEEP_REASON_LARGE_PROMOTION_RATE`: Full sweep was forced by a large promotion
  rate across GC generations

Note that the set of reasons is not guaranteed to be stable across minor versions of Julia.
"""
function full_sweep_reasons()
    # Pointer to the first element of the C-side counter table.
    counters = cglobal(:jl_full_sweep_reasons, UInt64)
    counts = Dict{Symbol, Int64}()
    for (idx, name) in enumerate(FULL_SWEEP_REASONS)
        # `unsafe_load` indexes from 1, so `idx` addresses the counter whose name
        # sits at the same position in `FULL_SWEEP_REASONS`.
        counts[name] = unsafe_load(counters, idx)
    end
    return counts
end

"""
Base.jit_total_bytes()
Expand Down
15 changes: 13 additions & 2 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ uv_sem_t gc_sweep_assists_needed;
uv_mutex_t gc_queue_observer_lock;
// Tag for sentinel nodes in bigval list
uintptr_t gc_bigval_sentinel_tag;
// Table recording number of full GCs due to each reason.
// Slots are indexed by the `FULL_SWEEP_REASON_*` constants and bumped through
// `gc_count_full_sweep_reason`; the symbol is exported so that
// `Base.full_sweep_reasons()` can read it via `cglobal`.
JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS];

// Flag that tells us whether we need to support conservative marking
// of objects.
Expand Down Expand Up @@ -3043,10 +3045,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
// we either free some space or get an OOM error.
if (gc_sweep_always_full) {
sweep_full = 1;
gc_count_full_sweep_reason(FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL);
}
if (collection == JL_GC_FULL && !prev_sweep_full) {
sweep_full = 1;
recollect = 1;
gc_count_full_sweep_reason(FULL_SWEEP_REASON_FORCED_FULL_SWEEP);
}
if (sweep_full) {
// these are the difference between the number of gc-perm bytes scanned
Expand Down Expand Up @@ -3182,10 +3186,17 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
}

double old_ratio = (double)promoted_bytes/(double)heap_size;
if (heap_size > user_max || old_ratio > 0.15)
if (heap_size > user_max) {
next_sweep_full = 1;
else
gc_count_full_sweep_reason(FULL_SWEEP_REASON_USER_MAX_EXCEEDED);
}
else if (old_ratio > 0.15) {
next_sweep_full = 1;
gc_count_full_sweep_reason(FULL_SWEEP_REASON_LARGE_PROMOTION_RATE);
}
else {
next_sweep_full = 0;
}
if (heap_size > user_max || thrashing)
under_pressure = 1;
// sweeping is over
Expand Down
14 changes: 14 additions & 0 deletions src/gc-stock.h
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,20 @@ FORCE_INLINE void gc_big_object_link(bigval_t *sentinel_node, bigval_t *node) JL
sentinel_node->next = node;
}

// Reasons for which a full sweep is counted. Each value is the index of that
// reason's counter in `jl_full_sweep_reasons`, and `FULL_SWEEP_NUM_REASONS` is
// the size of that table.
// Must be kept in sync with `base/timing.jl` (`FULL_SWEEP_REASONS` lists the
// same reasons in the same order).
// - SWEEP_ALWAYS_FULL:    counted when `gc_sweep_always_full` is set
// - FORCED_FULL_SWEEP:    counted when a `JL_GC_FULL` collection is requested
//                         and the previous sweep was not full
// - USER_MAX_EXCEEDED:    counted when `heap_size > user_max`
// - LARGE_PROMOTION_RATE: counted when the promoted/heap ratio exceeds 0.15
#define FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL (0)
#define FULL_SWEEP_REASON_FORCED_FULL_SWEEP (1)
#define FULL_SWEEP_REASON_USER_MAX_EXCEEDED (2)
#define FULL_SWEEP_REASON_LARGE_PROMOTION_RATE (3)
#define FULL_SWEEP_NUM_REASONS (4)

extern JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS];

// Record one more full sweep attributed to `reason`.
// `reason` must be one of the `FULL_SWEEP_REASON_*` indices, i.e. lie in
// [0, FULL_SWEEP_NUM_REASONS); this is checked by the assertion below.
STATIC_INLINE void gc_count_full_sweep_reason(int reason) JL_NOTSAFEPOINT
{
    assert(reason >= 0 && reason < FULL_SWEEP_NUM_REASONS);
    jl_full_sweep_reasons[reason] += 1;
}

extern uv_mutex_t gc_perm_lock;
extern uv_mutex_t gc_threads_lock;
extern uv_cond_t gc_threads_cond;
Expand Down
11 changes: 11 additions & 0 deletions test/gc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ function issue_54275_test()
@test !live_bytes_has_grown_too_much
end

# Request a full collection, then check that the forced-full-sweep counter is
# non-zero and that the reported reasons are exactly `Base.FULL_SWEEP_REASONS`.
function full_sweep_reasons_test()
    GC.gc(true)
    counts = Base.full_sweep_reasons()
    @test counts[:FULL_SWEEP_REASON_FORCED_FULL_SWEEP] >= 1
    @test keys(counts) == Set(Base.FULL_SWEEP_REASONS)
end

# !!! note:
# Since we run our tests on 32bit OS as well we confine ourselves
# to parameters that allocate about 512MB of objects. Max RSS is lower
Expand All @@ -73,6 +80,10 @@ end
@test isempty(Docs.undocumented_names(GC))
end

# Checks the per-reason full sweep counters exposed by `Base.full_sweep_reasons()`.
@testset "Full GC reasons" begin
full_sweep_reasons_test()
end

#testset doesn't work here because this needs to run in top level
#Check that we ensure objects in toplevel exprs are rooted
global dims54422 = [] # allocate the Binding
Expand Down

0 comments on commit 6e33dfb

Please sign in to comment.