From b6a8eede821591794a88a87fdca374cc19e360f2 Mon Sep 17 00:00:00 2001 From: Jake Hillion Date: Tue, 7 Oct 2025 17:18:56 +0100 Subject: [PATCH 1/2] chaos: reimplement better random support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit c576237ad039 added random support to scx_chaos with less bias (very very nearly 0). It was reverted because it broke random delays. It turns out the random implementation was fine but the callsite was wrong. Instead of adding the random delay to the current time it was setting the target time to the random delay, which was always in the past, and hence scheduling things immediately. With that fixed, this appears to work. Test plan: - CI ``` # wakeup latencies with no flags 56.166µs |▁▄▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████ 132.125µs ███▇▇▇▇▆▆▆▃| 141.834µs 63.75µs |▁▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 74.767µs ███████▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▄▃▃| 88.6µs 65.835µs |▁▆▇▇ 84.235µs █████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▄▄▃▃| 446.137µs 59.289µs |▁▅▆▆▆▆▆▇▇▇▇▇██ 117.979µs ████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▂▁| 407.666µs 64.987µs |▁▄▅▅▆▆▆▆▇▇▇▇▇██ 148.62µs █████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁| 536.936µs 52.78µs |▁▂▂▃▃▄▄▅▆▆▆▆▆▆▇▇▇▇▇▇██ 146.406µs ██████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▄▄▃▁| 415.672µs 58.819µs |▁▃▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇████ 174.184µs █████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▄▄▁| 437.196µs 62.938µs |▁▆▇ 127.197µs ██▇▇▇▇▇▆▆▆▅▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁| 2.105205ms 59.026µs |▁▂▄▄▅▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 227.526µs ██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▄▄▃▂▁| 456.578µs Benchmarking sleep_wakeup_histogram/wakeup_latency: Collecting 10 samples in estimated 5.0415 s (490 iterations) 64.466µs 
|▁▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████ 312.808µs ██████▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▅▅▄▄▁| 431.229µs 48.977µs |▁▂▄▄▅▅▆▆▆▆▆▇▇▇▇▇██ 125.064µs ██████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▄▄▄▁| 410.671µs 58.827µs |▁▃▄▅▆▆▆▆▆▇▇▇▇▇██ 127.664µs ██████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▄▄▄▁| 421.334µs 63.12µs |▁▃▄▄▅▅▆▆▆▆▇▇▇▇▇███ 135.895µs ██████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▄▁| 424.716µs 37.173µs |▁▁▂▂▃▃▄▄▅▆▆▆▆▇▇▇▇██ 132.831µs ███████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▂▂▁| 470.139µs 60.392µs |▁▂▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████ 410.584µs ██▇▇▆▆▅▅▄▄▁| 457.748µs 58.813µs |▁▃▄▅▆▆▆▆▇▇▇▇▇█ 124.729µs ███████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▄▄▁| 453.846µs 55.203µs |▁▃▄▄▅▅▆▆▆▆▇▇▇▇██ 130.237µs ██████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▄▄▄▁| 466.242µs 62.011µs |▁▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇███ 186.162µs ████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▄▄▃▃▃▂▂▂▁▁| 499.317µs 44.48µs |▁▃▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██ 151.382µs █████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▄▄▄▁| 450.172µs # wakeup latencies with --random-delay-frequency 1.0 --random-delay-min-us 100000 --random-delay-max-us 200000 67.984µs | 103.273µs ▁████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▄▄▄▃▃| 149.695558ms 69.85µs |▁▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 73.03691ms ████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▄▃| 197.940289ms 66.854µs |▁▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████ 79.471527ms ██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▄▄▄▃| 200.842494ms 72.268µs |▁▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████ 79.757623ms ██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▄▄▁| 195.335728ms 45.997µs |▁▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████ 59.430492ms ███████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▄▄▄▁| 201.135123ms Benchmarking sleep_wakeup_histogram/wakeup_latency: Collecting 10 samples in estimated 
6.1568 s (40 iterations) 72.645µs |▁▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████ 83.468765ms ████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃| 198.469905ms 77.531µs |▁▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████ 92.318916ms █████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▄▃| 194.16041ms 67.939µs |▁▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 75.667921ms ██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▄▄▃| 188.527733ms 66.176µs |▁▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 75.595375ms ██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃| 193.914143ms 67.455µs |▁▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 75.774696ms ██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃| 199.696546ms 61.654µs |▁▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████ 86.050604ms █████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▄▄▃| 186.936313ms 65.914µs |▁▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 76.277914ms ██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▄▄▄▃| 196.387308ms 91.47µs |▁▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 76.357756ms ██████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▃| 200.616152ms 79.914µs |▁▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████ 83.185347ms ████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃| 198.712172ms 77.44µs |▁▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████ 72.513261ms ████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▄▄▃| 199.976316ms ``` --- scheds/include/scx/common.bpf.h | 8 ++++ scheds/rust/scx_chaos/src/bpf/intf.h | 8 ++-- scheds/rust/scx_chaos/src/bpf/main.bpf.c | 50 ++++++++++++++++++++---- 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/scheds/include/scx/common.bpf.h b/scheds/include/scx/common.bpf.h index f5ef882aa4..9e2d9f5e9e 100644 --- a/scheds/include/scx/common.bpf.h +++ b/scheds/include/scx/common.bpf.h @@ -763,6 +763,14 @@ static inline u64 scale_by_task_weight_inverse(const struct task_struct *p, u64 return value * 100 / p->scx.weight; } +/* + * Get a random u64 from the kernel's pseudo-random generator. 
+ */ +static inline u64 get_prandom_u64() +{ + return ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32(); +} + #include "compat.bpf.h" #include "enums.bpf.h" diff --git a/scheds/rust/scx_chaos/src/bpf/intf.h b/scheds/rust/scx_chaos/src/bpf/intf.h index 67168d609a..c0676a1950 100644 --- a/scheds/rust/scx_chaos/src/bpf/intf.h +++ b/scheds/rust/scx_chaos/src/bpf/intf.h @@ -10,10 +10,12 @@ typedef unsigned long long u64; #endif enum chaos_consts { - CHAOS_DSQ_BASE_SHIFT = 16, - CHAOS_DSQ_BASE = 1 << CHAOS_DSQ_BASE_SHIFT, + CHAOS_DSQ_BASE_SHIFT = 16, + CHAOS_DSQ_BASE = 1 << CHAOS_DSQ_BASE_SHIFT, - CHAOS_NUM_PPIDS_CHECK = 1 << 20, + CHAOS_NUM_PPIDS_CHECK = 1 << 20, + + CHAOS_MAX_RAND_ATTEMPTS = 512, }; enum chaos_match { diff --git a/scheds/rust/scx_chaos/src/bpf/main.bpf.c b/scheds/rust/scx_chaos/src/bpf/main.bpf.c index fd8f6a9fa2..0039b6b137 100644 --- a/scheds/rust/scx_chaos/src/bpf/main.bpf.c +++ b/scheds/rust/scx_chaos/src/bpf/main.bpf.c @@ -95,6 +95,47 @@ struct chaos_task_ctx *lookup_create_chaos_task_ctx(struct task_struct *p) BPF_LOCAL_STORAGE_GET_F_CREATE); } +static __always_inline u64 chaos_get_prandom_u64_limit(u64 s) +{ + // Implementation of Lemire's algorithm 5 without 128-bit arithmetic. + // See https://arxiv.org/pdf/1805.10941v2 for details. + // Uses a bounded loop given this is BPF, but given the loop should + // rarely be entered this is fine. 
+ u64 x, m_low, m_high; + u64 t; + + x = get_prandom_u64(); + + // Compute 64-bit multiplication high and low parts + // m = x * s, split into m_high and m_low + m_high = ((x >> 32) * (s >> 32)) + + (((x & 0xFFFFFFFF) * (s >> 32)) >> 32) + + (((x >> 32) * (s & 0xFFFFFFFF)) >> 32); + m_low = x * s; + + if (m_low < s) { + t = ((u64)(-s)) % s; + bpf_repeat(CHAOS_MAX_RAND_ATTEMPTS) + { + if (m_low >= t) + break; + + x = get_prandom_u64(); + m_high = ((x >> 32) * (s >> 32)) + + (((x & 0xFFFFFFFF) * (s >> 32)) >> 32) + + (((x >> 32) * (s & 0xFFFFFFFF)) >> 32); + m_low = x * s; + } + } + + return m_high; +} + +static __always_inline u64 chaos_get_uniform_u64(u64 min, u64 max) +{ + return min + chaos_get_prandom_u64_limit(max - min + 1); +} + static __always_inline void chaos_stat_inc(enum chaos_stat_idx stat) { u64 *cnt_p = bpf_map_lookup_elem(&chaos_stats, &stat); @@ -270,15 +311,8 @@ __weak s32 enqueue_random_delay(struct task_struct *p __arg_trusted, struct chaos_task_ctx *taskc __arg_nonnull, u64 min_ns, u64 max_ns) { - u64 rand64 = ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32(); - - u64 vtime = bpf_ktime_get_ns() + min_ns; - if (min_ns != max_ns) { - vtime += rand64 % (max_ns - min_ns); - } - + u64 vtime = bpf_ktime_get_ns() + chaos_get_uniform_u64(min_ns, max_ns); scx_bpf_dsq_insert_vtime(p, get_cpu_delay_dsq(-1), 0, vtime, enq_flags); - return true; } From 69434ac4f7439bd668abd7492f47a61004e3f640 Mon Sep 17 00:00:00 2001 From: Jake Hillion Date: Thu, 9 Oct 2025 15:23:09 +0100 Subject: [PATCH 2/2] chaos: add futex delays trait Add futex delays to chaos. To best reproduce deadlocks and other futex issues we need to affect locking. The approach here: - Delays a waiter when a lock has contention up to futex_uncontended_delay_ns. - Swaps out the existing delayed waiter when another waiter comes along. - Delays the previous waiter by a random delay between futex_contended_delay_ns and futex_uncontended_delay_ns. 
This approach is chosen over random delays to flip futex conditions with minimal performance impact on a machine/process. If we had a futex and pair of threads that have many idle seconds after a short period of contention we would need huge random delays to affect their ordering at all, on every task that touches the futex. Instead we can limit the delays to a solo waiter at any point, and have a much smaller delay when we know the mutex is already under contention. We'll see how this works in practice. This is the most complicated chaos trait in terms of data structures by far. Currently we use a BPF hash map and a built-in DSQ to maintain the data. The hash map maps a specific futex (well, close, a tgid/uaddr pair) to an entry in a CPU's delay DSQ. The delay DSQ holds the task until its timeout, and the map stores how to find that entry in the DSQ to re-queue it with the uncontended timeout. As commented in the code, the complexity of a search in a native DSQ is hideous - it's O(n). We can change the implementation in the future while keeping the logic the same. Test plan: - Lightly tested. Futex is attached to and sees many entries. Slow futex waiters are delayed. The hand-off between an old delayed waiter and a new delayed waiter is not reliable and likely has a bug. - This change is a no-op unless you provide new command line flags. 
--- scheds/rust/scx_chaos/src/bpf/intf.h | 6 + scheds/rust/scx_chaos/src/bpf/main.bpf.c | 257 ++++++++++++++++++++++- scheds/rust/scx_chaos/src/lib.rs | 69 ++++++ scheds/rust/scx_chaos/src/stats.rs | 11 +- 4 files changed, 336 insertions(+), 7 deletions(-) diff --git a/scheds/rust/scx_chaos/src/bpf/intf.h b/scheds/rust/scx_chaos/src/bpf/intf.h index c0676a1950..b82e1750da 100644 --- a/scheds/rust/scx_chaos/src/bpf/intf.h +++ b/scheds/rust/scx_chaos/src/bpf/intf.h @@ -33,6 +33,7 @@ enum chaos_trait_kind { CHAOS_TRAIT_CPU_FREQ, CHAOS_TRAIT_DEGRADATION, CHAOS_TRAIT_KPROBE_RANDOM_DELAYS, + CHAOS_TRAIT_FUTEX_DELAYS, CHAOS_TRAIT_MAX, }; @@ -44,12 +45,17 @@ struct chaos_task_ctx { enum chaos_trait_kind pending_trait; u64 enq_flags; u64 p2dq_vtime; + + // Futex delay state + u64 futex_uaddr; }; enum chaos_stat_idx { CHAOS_STAT_TRAIT_RANDOM_DELAYS, CHAOS_STAT_TRAIT_CPU_FREQ, CHAOS_STAT_TRAIT_DEGRADATION, + CHAOS_STAT_TRAIT_FUTEX_DELAYS, + CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED, CHAOS_STAT_CHAOS_EXCLUDED, CHAOS_STAT_CHAOS_SKIPPED, CHAOS_STAT_KPROBE_RANDOM_DELAYS, diff --git a/scheds/rust/scx_chaos/src/bpf/main.bpf.c b/scheds/rust/scx_chaos/src/bpf/main.bpf.c index 0039b6b137..6e6c5b9767 100644 --- a/scheds/rust/scx_chaos/src/bpf/main.bpf.c +++ b/scheds/rust/scx_chaos/src/bpf/main.bpf.c @@ -37,6 +37,39 @@ scx_bpf_dispatch_vtime_from_dsq___compat( \ (it__iter), (p), (dsq_id), (enq_flags))) +/* + * The following defines are from 'linux/include/uapi/linux/futex.h' + */ +#define FUTEX_WAIT 0 +#define FUTEX_WAKE 1 +#define FUTEX_FD 2 +#define FUTEX_REQUEUE 3 +#define FUTEX_CMP_REQUEUE 4 +#define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 +#define FUTEX_WAIT_BITSET 9 +#define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_LOCK_PI2 13 + +#define FUTEX_PRIVATE_FLAG 128 +#define FUTEX_CLOCK_REALTIME 256 +#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME) + 
+struct tp_syscall_enter_futex { + struct trace_entry ent; + int __syscall_nr; + u32 __attribute__((btf_type_tag("user"))) * uaddr; + int op; + u32 val; + struct __kernel_timespec __attribute__((btf_type_tag("user"))) * utime; + u32 __attribute__((btf_type_tag("user"))) * uaddr2; + u32 val3; +}; + const volatile int ppid_targeting_ppid = 1; const volatile bool ppid_targeting_inclusive = false; /* include ppid_targeting_ppid in chaos */ @@ -60,6 +93,11 @@ const volatile u32 kprobe_delays_freq_frac32 = 1; const volatile u64 kprobe_delays_min_ns = 1; const volatile u64 kprobe_delays_max_ns = 2; +const volatile u64 futex_uncontended_delay_ns = 1; +const volatile u64 futex_contended_delay_min_ns = 1; +const volatile u64 futex_contended_delay_max_ns = 1; + + #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define MAX(x, y) ((x) > (y) ? (x) : (y)) @@ -89,6 +127,30 @@ struct { __type(value, u64); } chaos_stats SEC(".maps"); +struct chaos_futex_key { + u32 tgid; + u64 uaddr; +}; + +struct chaos_futex_waiter { + struct bpf_spin_lock lock; + u64 timeout_key; + u32 pid; + s32 delay_dsq_cpu_idx; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024*1024); + __type(key, struct chaos_futex_key); + __type(value, struct chaos_futex_waiter); +} chaos_futex_waiters SEC(".maps"); + +static __always_inline u64 chaos_get_prandom_u64() +{ + return ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32(); +} + struct chaos_task_ctx *lookup_create_chaos_task_ctx(struct task_struct *p) { return bpf_task_storage_get(&chaos_task_ctxs, p, NULL, @@ -166,8 +228,14 @@ choose_chaos(struct chaos_task_ctx *taskc) static __always_inline bool chaos_trait_skips_select_cpu(struct chaos_task_ctx *taskc) { - return taskc->next_trait == CHAOS_TRAIT_RANDOM_DELAYS || - taskc->next_trait == CHAOS_TRAIT_KPROBE_RANDOM_DELAYS; + switch (taskc->next_trait) { + case CHAOS_TRAIT_RANDOM_DELAYS: + case CHAOS_TRAIT_KPROBE_RANDOM_DELAYS: + case CHAOS_TRAIT_FUTEX_DELAYS: + return true; + default: + 
return false; + } } static __always_inline u64 get_cpu_delay_dsq(int cpu_idx) @@ -306,6 +374,134 @@ static __always_inline s32 calculate_chaos_match(struct task_struct *p) return ret; } +// Traverse a DSQ to find the first element with a key with hideous complexity. +// This is O(n) in DSQ members. +// +// To improve: +// - Add this as a kfunc to the kernel where it can be O(log n) +// - Use arena DSQs where we can get this behaviour in O(log n) +static __always_inline +void bpf_iter_scx_dsq_search(struct bpf_iter_scx_dsq *it, + struct task_struct **p, + u64 dsq_id, + u64 flags, + u64 key) +{ + bpf_iter_scx_dsq_new(it, dsq_id, flags); + + while((*p = bpf_iter_scx_dsq_next(it))) { + if ((*p)->scx.dsq_vtime == key) + return; + + if ((*p)->scx.dsq_vtime > key) + break; + } + + *p = NULL; +} + +static __always_inline bool update_delayed_task_vtime(s32 cpu_idx, u64 key, + u64 pid, u64 new_vtime) +{ + u64 dsq_id = get_cpu_delay_dsq(cpu_idx); + struct bpf_iter_scx_dsq it; + struct task_struct *p; + bool ret = false; + + bpf_iter_scx_dsq_search(&it, &p, dsq_id, 0, key); + if (!p) + goto out; + + while (p->pid != pid && (p = bpf_iter_scx_dsq_next(&it)) && p->scx.dsq_vtime == key) {} + if (!p || p->pid != pid) + goto out; + + ret = true; + __COMPAT_chaos_scx_bpf_dsq_move_set_vtime(&it, new_vtime); + ret = __COMPAT_chaos_scx_bpf_dsq_move_vtime(&it, p, dsq_id, 0); + +out: + bpf_iter_scx_dsq_destroy(&it); + return ret; +} + +__weak s32 enqueue_futex_delay(struct task_struct *p __arg_trusted, + u64 enq_flags, + struct chaos_task_ctx *taskc __arg_nonnull) +{ + s64 ret; + struct chaos_futex_key key; + struct chaos_futex_waiter *entry; + struct chaos_futex_waiter val; + u64 vtime, now; + s32 cpu; + + key.tgid = p->tgid; + key.uaddr = taskc->futex_uaddr; + + // First ensure an entry exists but in a largely empty state. We need the + // spinlock to correctly interlock with the delay DSQ. 
+ val.pid = -1; + + ret = bpf_map_update_elem(&chaos_futex_waiters, &key, &val, BPF_NOEXIST); + if (ret && ret != -EEXIST) { + scx_bpf_error("failed to create chaos_futex_waiter in runnable_futex_delays"); + return false; + } + + // Get the real element. This might be an empty element that we inserted + // or it might be an element filled with another PID. It doesn't matter + // whether we inserted the element or somebody else did, this races. + entry = (struct chaos_futex_waiter*)bpf_map_lookup_elem(&chaos_futex_waiters, &key); + if (!entry) { + scx_bpf_error("failed to lookup chaos_futex_waiter in runnable_futex_delays"); + return false; + } + + // enqueue ourselves before entering the spinlock. critical sections + // can't call kfuncs. + now = bpf_ktime_get_ns(); + cpu = bpf_get_smp_processor_id(); + + chaos_stat_inc(CHAOS_STAT_TRAIT_FUTEX_DELAYS); + scx_bpf_dsq_insert_vtime(p, get_cpu_delay_dsq(cpu), 0, now + futex_uncontended_delay_ns, enq_flags); + + // critical sections can't call kfuncs which makes this very complicated. + // we must have already enqueued ourselves, and we must then insert + // ourselves in the hashmap. when we take a task out of the lock we + // should attempt to re-queue it after. the task will not hit this path + // again until it has been re-queued, thus this isn't racy - either we + // will re-queue it, or it will run naturally when its delay expires. + // This might mean it doesn't get quite enough delay, but no invariants + // are broken. 
+ bpf_spin_lock(&entry->lock); + + val.pid = entry->pid; + val.timeout_key = entry->timeout_key; + val.delay_dsq_cpu_idx = entry->delay_dsq_cpu_idx; + + // enqueue ourselves and prepare the metadata for the next one to come along + entry->pid = p->pid; + entry->timeout_key = now + futex_uncontended_delay_ns; + entry->delay_dsq_cpu_idx = cpu; + + bpf_spin_unlock(&entry->lock); + + // re-queue task that has a contender behind it + if (val.pid != -1) { + vtime = now + futex_contended_delay_min_ns; + if (futex_contended_delay_min_ns != futex_contended_delay_max_ns) { + vtime += chaos_get_prandom_u64() + % (futex_contended_delay_max_ns - futex_contended_delay_min_ns); + } + + if (update_delayed_task_vtime(val.delay_dsq_cpu_idx, val.timeout_key, val.pid, vtime)) + chaos_stat_inc(CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED); + } + + return true; +} + __weak s32 enqueue_random_delay(struct task_struct *p __arg_trusted, u64 enq_flags, struct chaos_task_ctx *taskc __arg_nonnull, @@ -334,6 +530,10 @@ __weak s32 enqueue_chaotic(struct task_struct *p __arg_trusted, u64 enq_flags, random_delays_max_ns); chaos_stat_inc(CHAOS_STAT_TRAIT_RANDOM_DELAYS); break; + + case CHAOS_TRAIT_FUTEX_DELAYS: + out = enqueue_futex_delay(p, enq_flags, taskc); + break; case CHAOS_TRAIT_NONE: chaos_stat_inc(CHAOS_STAT_CHAOS_SKIPPED); out = false; @@ -345,7 +545,6 @@ __weak s32 enqueue_chaotic(struct task_struct *p __arg_trusted, u64 enq_flags, break; } - taskc->next_trait = CHAOS_TRAIT_NONE; return out; } @@ -580,10 +779,10 @@ void BPF_STRUCT_OPS(chaos_enqueue, struct task_struct *p __arg_trusted, if (promise.kind == P2DQ_ENQUEUE_PROMISE_FAILED) goto cleanup; - if ((taskc->next_trait == CHAOS_TRAIT_RANDOM_DELAYS || - taskc->next_trait == CHAOS_TRAIT_KPROBE_RANDOM_DELAYS) && - enqueue_chaotic(p, enq_flags, taskc)) + if (enqueue_chaotic(p, enq_flags, taskc)) { + taskc->next_trait = CHAOS_TRAIT_NONE; goto cleanup; + } // NOTE: this may not work for affinitized tasks because p2dq does // direct dispatch in 
some situations. @@ -696,6 +895,52 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(chaos_init_task, struct task_struct *p, return 0; } +SEC("?tracepoint/syscalls/sys_enter_futex") +int rtp_sys_enter_futex(struct tp_syscall_enter_futex *ctx) +{ + struct task_struct *p; + struct chaos_task_ctx *taskc; + int futex_op; + s32 ret; + + // should be detached from userspace but if it is attached then no-op + if (!futex_uncontended_delay_ns && !futex_contended_delay_min_ns && + !futex_contended_delay_max_ns) + return 0; + + p = (struct task_struct *)bpf_get_current_task_btf(); + taskc = lookup_create_chaos_task_ctx(p); + if (!taskc) + return 0; + + if (!(taskc->match & CHAOS_MATCH_COMPLETE)) { + ret = calculate_chaos_match(p); + if (ret) { + scx_bpf_error("failed to match task"); + return 0; + } + } + + if (taskc->match & CHAOS_MATCH_EXCLUDED) + return 0; + + futex_op = ctx->op & FUTEX_CMD_MASK; + + if (futex_op != FUTEX_WAIT && futex_op != FUTEX_WAIT_BITSET && + futex_op != FUTEX_WAIT_REQUEUE_PI) + return 0; + + // The task is either about to wait because it hit FUTEX_WAIT on the slow + // path or hit the fast path. The fast path is irrelevant for our purposes + // as we have no scheduler input there, so it's safe to delay our work + // until a struct_ops .runnable callback comes along. + taskc->pending_trait = CHAOS_TRAIT_FUTEX_DELAYS; + taskc->futex_uaddr = (u64)ctx->uaddr; + + return 0; +} + + SEC("kprobe/generic") int generic(struct pt_regs *ctx) { diff --git a/scheds/rust/scx_chaos/src/lib.rs b/scheds/rust/scx_chaos/src/lib.rs index 53edf88f34..851702a4a0 100644 --- a/scheds/rust/scx_chaos/src/lib.rs +++ b/scheds/rust/scx_chaos/src/lib.rs @@ -109,6 +109,11 @@ pub enum Trait { frequency: f64, degradation_frac7: u64, }, + FutexDelays { + uncontended_us: u64, + contended_min_us: u64, + contended_max_us: u64, + }, } impl Trait { @@ -117,6 +122,7 @@ impl Trait { Self::RandomDelays { .. } => bpf_intf::chaos_trait_kind_CHAOS_TRAIT_RANDOM_DELAYS, Self::CpuFreq { .. 
} => bpf_intf::chaos_trait_kind_CHAOS_TRAIT_CPU_FREQ, Self::PerfDegradation { .. } => bpf_intf::chaos_trait_kind_CHAOS_TRAIT_DEGRADATION, + Self::FutexDelays { .. } => bpf_intf::chaos_trait_kind_CHAOS_TRAIT_FUTEX_DELAYS, } } @@ -125,6 +131,7 @@ impl Trait { Self::RandomDelays { frequency, .. } => *frequency, Self::CpuFreq { frequency, .. } => *frequency, Self::PerfDegradation { frequency, .. } => *frequency, + Self::FutexDelays { .. } => 0_f64, // triggered an alternative way } } } @@ -195,6 +202,10 @@ impl Scheduler { trait_cpu_freq: stats[bpf_intf::chaos_stat_idx_CHAOS_STAT_TRAIT_CPU_FREQ as usize], trait_degradation: stats [bpf_intf::chaos_stat_idx_CHAOS_STAT_TRAIT_DEGRADATION as usize], + trait_futex_delays: stats + [bpf_intf::chaos_stat_idx_CHAOS_STAT_TRAIT_FUTEX_DELAYS as usize], + trait_futex_delays_contended: stats + [bpf_intf::chaos_stat_idx_CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED as usize], chaos_excluded: stats[bpf_intf::chaos_stat_idx_CHAOS_STAT_CHAOS_EXCLUDED as usize], chaos_skipped: stats[bpf_intf::chaos_stat_idx_CHAOS_STAT_CHAOS_SKIPPED as usize], kprobe_random_delays: stats @@ -315,6 +326,23 @@ impl Builder<'_> { } open_skel.struct_ops.chaos_mut().flags |= *compat::SCX_OPS_KEEP_BUILTIN_IDLE; + // Enable futex tracepoint conditionally if futex delays are enabled + if self + .traits + .iter() + .any(|x| matches!(x, Trait::FutexDelays { .. })) + & !compat::cond_tracepoint_enable( + "syscalls:sys_enter_futex", + &open_skel.progs.rtp_sys_enter_futex, + )? 
+ { + bail!("couldn't attach to sys_enter_futex and futex delays were enabled"); + } + + // TODO: figure out how to abstract waking a CPU in enqueue properly, but for now disable + // this codepath + // rodata.select_idle_in_enqueue = false; // Field no longer exists + match self.requires_ppid { None => { rodata.ppid_targeting_ppid = -1; @@ -395,6 +423,15 @@ impl Builder<'_> { rodata.degradation_freq_frac32 = (frequency * 2_f64.powf(32_f64)) as u32; rodata.degradation_frac7 = *degradation_frac7; } + Trait::FutexDelays { + uncontended_us, + contended_min_us, + contended_max_us, + } => { + rodata.futex_uncontended_delay_ns = uncontended_us * 1000; + rodata.futex_contended_delay_min_ns = contended_min_us * 1000; + rodata.futex_contended_delay_max_ns = contended_max_us * 1000; + } } } @@ -516,6 +553,22 @@ pub struct KprobeArgs { pub kprobe_random_delay_max_us: Option, } +/// Adds delays to futex operations to induce contention +#[derive(Debug, Parser)] +pub struct FutexDelayArgs { + /// Time to hold mutex with no contention. + #[clap(long, requires = "futex_contended_delay_max_us")] + pub futex_uncontended_delay_us: Option, + + /// Minimum time to hold mutex after contention starts. + #[clap(long, requires = "futex_uncontended_delay_us")] + pub futex_contended_delay_min_us: Option, + + /// Maximum time to hold mutex after contention starts. + #[clap(long, requires = "futex_contended_delay_min_us")] + pub futex_contended_delay_max_us: Option, +} + /// scx_chaos: A general purpose sched_ext scheduler designed to amplify race conditions /// /// WARNING: This scheduler is a very early alpha, and hasn't been production tested yet. 
The CLI @@ -576,6 +629,9 @@ pub struct Args { #[command(flatten, next_help_heading = "Kprobe Random Delays")] pub kprobe_random_delays: KprobeArgs, + #[command(flatten, next_help_heading = "Futex Delays")] + pub futex_delay: FutexDelayArgs, + #[command(flatten, next_help_heading = "General Scheduling")] pub p2dq: P2dqOpts, @@ -647,6 +703,19 @@ impl<'a> Iterator for BuilderIterator<'a> { }); }; + if let FutexDelayArgs { + futex_uncontended_delay_us: Some(uncontended_us), + futex_contended_delay_min_us: Some(contended_min_us), + futex_contended_delay_max_us: Some(contended_max_us), + } = self.args.futex_delay + { + traits.push(Trait::FutexDelays { + uncontended_us, + contended_min_us, + contended_max_us, + }) + }; + let requires_ppid = if self.args.ppid_targeting { if let Some(p) = self.args.pid { Some(RequiresPpid::IncludeParent(Pid::from_raw(p))) diff --git a/scheds/rust/scx_chaos/src/stats.rs b/scheds/rust/scx_chaos/src/stats.rs index 5173f22ceb..c4759d7597 100644 --- a/scheds/rust/scx_chaos/src/stats.rs +++ b/scheds/rust/scx_chaos/src/stats.rs @@ -21,6 +21,10 @@ pub struct Metrics { pub trait_cpu_freq: u64, #[stat(desc = "Number of times performance degradation chaos trait was applied")] pub trait_degradation: u64, + #[stat(desc = "Number of futex wait syscalls delayed")] + pub trait_futex_delays: u64, + #[stat(desc = "Number of futex wait syscalls delayed until replaced")] + pub trait_futex_delays_contended: u64, #[stat(desc = "Number of times chaos was excluded due to task matching")] pub chaos_excluded: u64, #[stat(desc = "Number of times chaos was skipped (TRAIT_NONE selected)")] @@ -35,7 +39,7 @@ impl Metrics { fn format(&self, w: &mut W) -> Result<()> { writeln!( w, - "chaos traits: random_delays/cpu_freq/degradation {}/{}/{}\n\tchaos excluded/skipped {}/{}\n\tkprobe_random_delays {}\n\ttimer kicks: {}", + "chaos traits: random_delays/cpu_freq/degradation {}/{}/{}\n\tchaos excluded/skipped {}/{}\n\tkprobe_random_delays {}\n\ttimer kicks: {}\n\tfutex: 
contended/total: {}/{}", self.trait_random_delays, self.trait_cpu_freq, self.trait_degradation, @@ -43,6 +47,8 @@ impl Metrics { self.chaos_skipped, self.kprobe_random_delays, self.timer_kicks, + self.trait_futex_delays_contended, + self.trait_futex_delays, )?; Ok(()) } @@ -52,6 +58,9 @@ impl Metrics { trait_random_delays: self.trait_random_delays - rhs.trait_random_delays, trait_cpu_freq: self.trait_cpu_freq - rhs.trait_cpu_freq, trait_degradation: self.trait_degradation - rhs.trait_degradation, + trait_futex_delays: self.trait_futex_delays - rhs.trait_futex_delays, + trait_futex_delays_contended: self.trait_futex_delays_contended + - rhs.trait_futex_delays_contended, chaos_excluded: self.chaos_excluded - rhs.chaos_excluded, chaos_skipped: self.chaos_skipped - rhs.chaos_skipped, kprobe_random_delays: self.kprobe_random_delays - rhs.kprobe_random_delays,