
Commit 57f60dc

scx_layered: Add per-layer LLC stickiness control
Add a new configuration option `llc_sticky_runs` that controls how many times a task must run on its current LLC before being allowed to migrate to a different LLC. This provides finer-grained control over LLC locality and complements the existing `xllc_mig_min_us` option.

The implementation tracks the number of consecutive runs on the current LLC in the task context (`taskc->llc_runs`). When the scheduler attempts to find an idle CPU and no idle CPUs are available on the current LLC, it checks whether the task has run fewer times than the configured threshold. If so, cross-LLC migration is prevented, keeping the task sticky to its current LLC to preserve cache locality.

Key changes:
- Add `llc_runs` counter to `task_ctx` to track consecutive runs on the current LLC
- Add `llc_sticky_runs` configuration field to the `layer` struct and to `LayerCommon` in config.rs
- Increment `llc_runs` in `layered_running()` when stickiness is enabled
- Reset `llc_runs` to 0 in `maybe_update_task_llc()` on LLC migration
- Add a stickiness check in `pick_idle_cpu()` to prevent cross-LLC migration when the task hasn't run enough times on its current LLC
- Add an `LSTAT_LLC_STICKY_SKIP` statistic to track prevented migrations
- Initialize `llc_runs` to 0 in `layered_init_task()`

The feature is disabled by default (`llc_sticky_runs = 0`) and can be configured per layer. Example configuration:

    {
      "name": "batch",
      "kind": {
        "Confined": {
          "llc_sticky_runs": 20,
          "xllc_mig_min_us": 1000.0
        }
      }
    }

This keeps tasks on their current LLC for 20 runs before allowing cross-LLC migration, providing stronger LLC affinity for workloads that benefit from cache locality.

The new statistic appears in the output as:

    xllc_mig/skip/sticky_skip={xllc_migration%}/{xllc_skip%}/{llc_sticky_skip%}

Signed-off-by: Daniel Hodges <[email protected]>
1 parent 9d2bf93 commit 57f60dc
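To make the mechanism described in the commit message concrete, here is a minimal user-space model of the run-counter lifecycle. This is plain Rust, not the BPF implementation; `TaskCtx`, `Layer`, and the three functions are illustrative stand-ins that mirror the BPF fields and hooks shown in the diffs below.

// Illustrative model only; the real logic lives in main.bpf.c below.
#[derive(Default)]
struct TaskCtx {
    llc_id: u32,   // mirrors taskc->llc_id
    llc_runs: u32, // mirrors taskc->llc_runs
}

struct Layer {
    llc_sticky_runs: u32, // 0 disables stickiness
}

// Counterpart of layered_running(): count runs on the current LLC.
fn on_running(taskc: &mut TaskCtx, layer: &Layer) {
    if layer.llc_sticky_runs > 0 {
        taskc.llc_runs += 1;
    }
}

// Counterpart of maybe_update_task_llc(): a migration restarts the counter.
fn on_llc_change(taskc: &mut TaskCtx, new_llc_id: u32) {
    taskc.llc_id = new_llc_id;
    taskc.llc_runs = 0;
}

// Counterpart of the pick_idle_cpu() check: refuse cross-LLC picks until
// the task has met the threshold.
fn may_migrate_across_llc(taskc: &TaskCtx, layer: &Layer) -> bool {
    layer.llc_sticky_runs == 0 || taskc.llc_runs >= layer.llc_sticky_runs
}

fn main() {
    let layer = Layer { llc_sticky_runs: 3 };
    let mut taskc = TaskCtx::default();

    on_running(&mut taskc, &layer);
    on_running(&mut taskc, &layer);
    assert!(!may_migrate_across_llc(&taskc, &layer)); // 2 runs: still sticky

    on_running(&mut taskc, &layer);
    assert!(may_migrate_across_llc(&taskc, &layer)); // 3 runs: may migrate

    on_llc_change(&mut taskc, 1); // moved to LLC 1: counter starts over
    assert_eq!(taskc.llc_id, 1);
    assert!(!may_migrate_across_llc(&taskc, &layer));
}

The real scheduler performs these steps in layered_running(), maybe_update_task_llc(), and pick_idle_cpu(), as the main.bpf.c diff below shows.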

File tree

5 files changed: +84 -3 lines changed

scheds/rust/scx_layered/src/bpf/intf.h

Lines changed: 2 additions & 0 deletions
@@ -148,6 +148,7 @@ enum layer_stat_id {
         LSTAT_XNUMA_MIGRATION,
         LSTAT_XLLC_MIGRATION,
         LSTAT_XLLC_MIGRATION_SKIP,
+        LSTAT_LLC_STICKY_SKIP,
         LSTAT_XLAYER_WAKE,
         LSTAT_XLAYER_REWAKE,
         LSTAT_LLC_DRAIN_TRY,
@@ -348,6 +349,7 @@ struct layer {
         u64 disallow_open_after_ns;
         u64 disallow_preempt_after_ns;
         u64 xllc_mig_min_ns;
+        u32 llc_sticky_runs;

         int kind;
         bool preempt;

scheds/rust/scx_layered/src/bpf/main.bpf.c

Lines changed: 68 additions & 2 deletions
@@ -611,6 +611,7 @@ struct task_ctx {
         u64 runtime_avg;
         u64 dsq_id;
         u32 llc_id;
+        u32 llc_runs;

         /* for llcc->queue_runtime */
         u32 qrt_layer_id;
@@ -1281,6 +1282,14 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
         if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask, layer)) >= 0)
                 goto out_put;

+        /* Check if task is sticky to current LLC */
+        if (layer->llc_sticky_runs > 0 &&
+            taskc->llc_runs < layer->llc_sticky_runs) {
+                lstat_inc(LSTAT_LLC_STICKY_SKIP, layer, cpuc);
+                cpu = -1;
+                goto out_put;
+        }
+
         if (!(prev_llcc = lookup_llc_ctx(prev_cpuc->llc_id)) ||
             prev_llcc->queued_runtime[layer_id] < layer->xllc_mig_min_ns) {
                 lstat_inc(LSTAT_XLLC_MIGRATION_SKIP, layer, cpuc);
@@ -1365,6 +1374,7 @@ bool maybe_update_task_llc(struct task_struct *p, struct task_ctx *taskc, s32 ne
         p->scx.dsq_vtime = new_llcc->vtime_now[layer_id] + vtime_delta;

         taskc->llc_id = new_llc_id;
+        taskc->llc_runs = 0;
         return true;
 }

@@ -2174,6 +2184,7 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
         u32 nid = llc_node_id(llcc->id);
         bool xllc_mig_skipped = false;
         bool skip_remote_node;
+        u64 dsq_id;
         u32 u;

         if (!(layer = lookup_layer(layer_id)))
@@ -2196,6 +2207,8 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
                 return false;
         }

+        dsq_id = layer_dsq_id(layer_id, *llc_idp);
+
         if (u > 0) {
                 struct llc_ctx *remote_llcc;

@@ -2211,9 +2224,42 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
                                 xllc_mig_skipped = true;
                                 continue;
                         }
+
+                        /*
+                         * For remote LLC DSQs with LLC stickiness enabled, use
+                         * DSQ iterator to validate tasks can migrate before
+                         * dispatching directly.
+                         */
+                        if (layer->llc_sticky_runs > 0 && bpf_ksym_exists(scx_bpf_dsq_move)) {
+                                struct task_struct *p;
+                                bool dispatched = false;
+
+                                bpf_for_each(scx_dsq, p, dsq_id, 0) {
+                                        struct task_ctx *taskc;
+
+                                        if (!(taskc = lookup_task_ctx(p)))
+                                                continue;
+
+                                        if (taskc->llc_runs < layer->llc_sticky_runs) {
+                                                lstat_inc(LSTAT_LLC_STICKY_SKIP, layer, cpuc);
+                                                continue;
+                                        }
+
+                                        if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
+                                                             SCX_DSQ_LOCAL, 0)) {
+                                                dispatched = true;
+                                                break;
+                                        }
+                                }
+
+                                if (dispatched)
+                                        return true;
+
+                                continue;
+                        }
                 }

-                if (scx_bpf_dsq_move_to_local(layer_dsq_id(layer_id, *llc_idp)))
+                if (scx_bpf_dsq_move_to_local(dsq_id))
                         return true;
         }

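The try_consume_layer() change above can be read as: instead of unconditionally pulling the head of a remote LLC's DSQ, walk the queue and only move a task whose run counter has reached the layer threshold. A minimal sketch of that selection rule (plain Rust, not BPF; the slice of llc_runs values is a stand-in for the DSQ iterator):

// Stand-in for the bpf_for_each(scx_dsq, ...) walk: `queued_runs` holds the
// llc_runs value of each task queued on the remote LLC's DSQ, in queue order.
fn first_migratable(queued_runs: &[u32], llc_sticky_runs: u32) -> Option<usize> {
    // Tasks below the threshold are skipped (LSTAT_LLC_STICKY_SKIP in the BPF
    // code); the first task at or above it would be moved to the local DSQ.
    queued_runs.iter().position(|&runs| runs >= llc_sticky_runs)
}

fn main() {
    let remote_dsq = [1, 2, 5, 0, 9];
    assert_eq!(first_migratable(&remote_dsq, 4), Some(2)); // third task may move
    assert_eq!(first_migratable(&remote_dsq, 10), None);   // everyone still sticky
}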
@@ -2546,7 +2592,22 @@ static __noinline bool match_one(struct layer *layer, struct layer_match *match,
                         bpf_rcu_read_unlock();
                         return result;
                 }
-                pid_t nspid = get_pid_nr_ns(p_pid, pid_ns);
+
+                /* Inline get_pid_nr_ns logic to avoid RCU lock crossing
+                 * function boundary, this all depends on if it gets inlined so
+                 * we can't just do:
+                 * pid_t nspid = get_pid_nr_ns(p_pid, pid_ns);
+                 */
+                pid_t nspid = 0;
+                int level = BPF_CORE_READ(p_pid, level);
+                int ns_level = BPF_CORE_READ(pid_ns, level);
+                if (ns_level <= level) {
+                        struct upid upid;
+                        upid = BPF_CORE_READ(p_pid, numbers[ns_level]);
+                        if (upid.ns == pid_ns)
+                                nspid = upid.nr;
+                }
+
                 u64 nsid = BPF_CORE_READ(pid_ns, ns.inum);
                 bpf_rcu_read_unlock();
                 return (u32)nspid == match->pid && nsid == match->nsid;
@@ -3085,6 +3146,10 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
         if (time_before(llcc->vtime_now[layer_id], p->scx.dsq_vtime))
                 llcc->vtime_now[layer_id] = p->scx.dsq_vtime;

+        /* Increment LLC run counter if stickiness is enabled */
+        if (layer->llc_sticky_runs > 0)
+                taskc->llc_runs++;
+
         cpuc->current_preempt = layer->preempt ||
                 (is_percpu_kthread(p) && is_percpu_kthread_preempting(p));
         cpuc->used_at = now;
@@ -3439,6 +3504,7 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
         taskc->layer_id = MAX_LAYERS;
         taskc->refresh_layer = true;
         taskc->llc_id = MAX_LLCS;
+        taskc->llc_runs = 0;
         taskc->qrt_layer_id = MAX_LLCS;
         taskc->qrt_llc_id = MAX_LLCS;

scheds/rust/scx_layered/src/config.rs

Lines changed: 2 additions & 0 deletions
@@ -132,6 +132,8 @@ pub struct LayerCommon {
     pub disallow_preempt_after_us: Option<u64>,
     #[serde(default)]
     pub xllc_mig_min_us: f64,
+    #[serde(default)]
+    pub llc_sticky_runs: u32,
     #[serde(default, skip_serializing)]
     pub idle_smt: Option<bool>,
     #[serde(default)]
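Because the new field carries `#[serde(default)]`, existing layer configs that omit `llc_sticky_runs` still parse and get 0, i.e. stickiness stays disabled. A small sketch of that behavior, assuming the serde and serde_json crates and using a trimmed stand-in struct rather than the real `LayerCommon`:

use serde::Deserialize;

// Trimmed stand-in for LayerCommon, keeping only the two fields relevant here.
#[derive(Deserialize, Debug)]
struct LayerCommonSketch {
    #[serde(default)]
    xllc_mig_min_us: f64,
    #[serde(default)]
    llc_sticky_runs: u32,
}

fn main() -> Result<(), serde_json::Error> {
    // Field present: value is taken from the config.
    let sticky: LayerCommonSketch =
        serde_json::from_str(r#"{"xllc_mig_min_us": 1000.0, "llc_sticky_runs": 20}"#)?;
    assert_eq!(sticky.xllc_mig_min_us, 1000.0);
    assert_eq!(sticky.llc_sticky_runs, 20);

    // Field absent: defaults to 0, i.e. the feature stays disabled.
    let legacy: LayerCommonSketch = serde_json::from_str(r#"{}"#)?;
    assert_eq!(legacy.llc_sticky_runs, 0);
    Ok(())
}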

scheds/rust/scx_layered/src/main.rs

Lines changed: 6 additions & 0 deletions
@@ -152,6 +152,7 @@ lazy_static! {
            disallow_open_after_us: None,
            disallow_preempt_after_us: None,
            xllc_mig_min_us: 1000.0,
+           llc_sticky_runs: 0,
            growth_algo: LayerGrowthAlgo::Sticky,
            idle_resume_us: None,
            perf: 1024,
@@ -188,6 +189,7 @@ lazy_static! {
            disallow_open_after_us: None,
            disallow_preempt_after_us: None,
            xllc_mig_min_us: 0.0,
+           llc_sticky_runs: 0,
            growth_algo: LayerGrowthAlgo::Sticky,
            perf: 1024,
            idle_resume_us: None,
@@ -229,6 +231,7 @@ lazy_static! {
            disallow_open_after_us: None,
            disallow_preempt_after_us: None,
            xllc_mig_min_us: 0.0,
+           llc_sticky_runs: 2,
            growth_algo: LayerGrowthAlgo::Topo,
            perf: 1024,
            idle_resume_us: None,
@@ -268,6 +271,7 @@ lazy_static! {
            disallow_open_after_us: None,
            disallow_preempt_after_us: None,
            xllc_mig_min_us: 100.0,
+           llc_sticky_runs: 0,
            growth_algo: LayerGrowthAlgo::Linear,
            perf: 1024,
            idle_resume_us: None,
@@ -1908,6 +1912,7 @@ impl<'a> Scheduler<'a> {
                disallow_open_after_us,
                disallow_preempt_after_us,
                xllc_mig_min_us,
+               llc_sticky_runs,
                placement,
                member_expire_ms,
                ..
@@ -1944,6 +1949,7 @@ impl<'a> Scheduler<'a> {
                v => v * 1000,
            };
            layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
+           layer.llc_sticky_runs = *llc_sticky_runs;
            layer_weights.push(layer.weight.try_into().unwrap());
            layer.perf = u32::try_from(*perf)?;
            layer.node_mask = nodemask_from_nodes(nodes) as u64;

scheds/rust/scx_layered/src/stats.rs

Lines changed: 6 additions & 1 deletion
@@ -71,6 +71,7 @@ const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
 const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
 const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
 const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
+const LSTAT_LLC_STICKY_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_LLC_STICKY_SKIP as usize;
 const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
 const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
 const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
@@ -184,6 +185,8 @@ pub struct LayerStats {
     pub xllc_migration: f64,
     #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
     pub xllc_migration_skip: f64,
+    #[stat(desc = "% migration skipped across LLCs due to llc_sticky_runs")]
+    pub llc_sticky_skip: f64,
     #[stat(desc = "% wakers across layers")]
     pub xlayer_wake: f64,
     #[stat(desc = "% rewakers across layers where waker has waken the task previously")]
@@ -306,6 +309,7 @@ impl LayerStats {
            xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
            xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
            xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
+           llc_sticky_skip: lstat_pct(LSTAT_LLC_STICKY_SKIP),
            llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
            llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
            skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
@@ -378,13 +382,14 @@ impl LayerStats {

        writeln!(
            w,
-           " {:<width$} open_idle={} mig={} xnuma_mig={} xllc_mig/skip={}/{} affn_viol={}",
+           " {:<width$} open_idle={} mig={} xnuma_mig={} xllc_mig/skip/sticky_skip={}/{}/{} affn_viol={}",
            "",
            fmt_pct(self.open_idle),
            fmt_pct(self.migration),
            fmt_pct(self.xnuma_migration),
            fmt_pct(self.xllc_migration),
            fmt_pct(self.xllc_migration_skip),
+           fmt_pct(self.llc_sticky_skip),
            fmt_pct(self.affn_viol),
            width = header_width,
        )?;
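For illustration only, the extended stats line can be exercised with made-up percentages; the `fmt_pct` below is a simplified stand-in for scx_layered's helper, not its actual implementation:

// Simplified stand-in for scx_layered's fmt_pct; for illustration only.
fn fmt_pct(v: f64) -> String {
    format!("{:5.2}", v)
}

fn main() {
    // Made-up values: 1.25% cross-LLC migrations, 0.40% skipped due to
    // xllc_mig_min_us, 7.80% skipped due to llc_sticky_runs.
    let (xllc_mig, xllc_skip, sticky_skip) = (1.25_f64, 0.40_f64, 7.80_f64);
    println!(
        "xllc_mig/skip/sticky_skip={}/{}/{}",
        fmt_pct(xllc_mig),
        fmt_pct(xllc_skip),
        fmt_pct(sticky_skip)
    );
    // Prints: xllc_mig/skip/sticky_skip= 1.25/ 0.40/ 7.80
}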
