diff --git a/scheds/rust/scx_layered/src/bpf/intf.h b/scheds/rust/scx_layered/src/bpf/intf.h
index c23d7714dd..8b3ccf6ae2 100644
--- a/scheds/rust/scx_layered/src/bpf/intf.h
+++ b/scheds/rust/scx_layered/src/bpf/intf.h
@@ -148,6 +148,7 @@ enum layer_stat_id {
         LSTAT_XNUMA_MIGRATION,
         LSTAT_XLLC_MIGRATION,
         LSTAT_XLLC_MIGRATION_SKIP,
+        LSTAT_LLC_STICKY_SKIP,
         LSTAT_XLAYER_WAKE,
         LSTAT_XLAYER_REWAKE,
         LSTAT_LLC_DRAIN_TRY,
@@ -348,6 +349,7 @@ struct layer {
         u64 disallow_open_after_ns;
         u64 disallow_preempt_after_ns;
         u64 xllc_mig_min_ns;
+        u32 llc_sticky_runs;
 
         int kind;
         bool preempt;
diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 5fad6f505d..37a7a6fa16 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -611,6 +611,7 @@ struct task_ctx {
         u64 runtime_avg;
         u64 dsq_id;
         u32 llc_id;
+        u32 llc_runs;
 
         /* for llcc->queue_runtime */
         u32 qrt_layer_id;
@@ -1281,6 +1282,14 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
         if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask, layer)) >= 0)
                 goto out_put;
 
+        /* Check if task is sticky to current LLC */
+        if (layer->llc_sticky_runs > 0 &&
+            taskc->llc_runs < layer->llc_sticky_runs) {
+                lstat_inc(LSTAT_LLC_STICKY_SKIP, layer, cpuc);
+                cpu = -1;
+                goto out_put;
+        }
+
         if (!(prev_llcc = lookup_llc_ctx(prev_cpuc->llc_id)) ||
             prev_llcc->queued_runtime[layer_id] < layer->xllc_mig_min_ns) {
                 lstat_inc(LSTAT_XLLC_MIGRATION_SKIP, layer, cpuc);
@@ -1365,6 +1374,7 @@ bool maybe_update_task_llc(struct task_struct *p, struct task_ctx *taskc, s32 ne
 
         p->scx.dsq_vtime = new_llcc->vtime_now[layer_id] + vtime_delta;
         taskc->llc_id = new_llc_id;
+        taskc->llc_runs = 0;
 
         return true;
 }
@@ -2174,6 +2184,7 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
         u32 nid = llc_node_id(llcc->id);
         bool xllc_mig_skipped = false;
         bool skip_remote_node;
+        u64 dsq_id;
         u32 u;
 
         if (!(layer = lookup_layer(layer_id)))
@@ -2196,6 +2207,8 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
                         return false;
                 }
 
+                dsq_id = layer_dsq_id(layer_id, *llc_idp);
+
                 if (u > 0) {
                         struct llc_ctx *remote_llcc;
 
@@ -2211,9 +2224,42 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
                                 xllc_mig_skipped = true;
                                 continue;
                         }
+
+                        /*
+                         * For remote LLC DSQs with LLC stickiness enabled, use
+                         * DSQ iterator to validate tasks can migrate before
+                         * dispatching directly.
+                         */
+                        if (layer->llc_sticky_runs > 0 && bpf_ksym_exists(scx_bpf_dsq_move)) {
+                                struct task_struct *p;
+                                bool dispatched = false;
+
+                                bpf_for_each(scx_dsq, p, dsq_id, 0) {
+                                        struct task_ctx *taskc;
+
+                                        if (!(taskc = lookup_task_ctx(p)))
+                                                continue;
+
+                                        if (taskc->llc_runs < layer->llc_sticky_runs) {
+                                                lstat_inc(LSTAT_LLC_STICKY_SKIP, layer, cpuc);
+                                                continue;
+                                        }
+
+                                        if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
+                                                             SCX_DSQ_LOCAL, 0)) {
+                                                dispatched = true;
+                                                break;
+                                        }
+                                }
+
+                                if (dispatched)
+                                        return true;
+
+                                continue;
+                        }
                 }
 
-                if (scx_bpf_dsq_move_to_local(layer_dsq_id(layer_id, *llc_idp)))
+                if (scx_bpf_dsq_move_to_local(dsq_id))
                         return true;
         }
@@ -2546,7 +2592,22 @@ static __noinline bool match_one(struct layer *layer, struct layer_match *match,
                         bpf_rcu_read_unlock();
                         return result;
                 }
-                pid_t nspid = get_pid_nr_ns(p_pid, pid_ns);
+
+                /* Inline the get_pid_nr_ns() logic so the RCU lock doesn't
+                 * cross a function boundary. Whether the call gets inlined is
+                 * up to the compiler, so we can't simply do:
+                 * pid_t nspid = get_pid_nr_ns(p_pid, pid_ns);
+                 */
+                pid_t nspid = 0;
+                int level = BPF_CORE_READ(p_pid, level);
+                int ns_level = BPF_CORE_READ(pid_ns, level);
+                if (ns_level <= level) {
+                        struct upid upid;
+                        upid = BPF_CORE_READ(p_pid, numbers[ns_level]);
+                        if (upid.ns == pid_ns)
+                                nspid = upid.nr;
+                }
+
                 u64 nsid = BPF_CORE_READ(pid_ns, ns.inum);
                 bpf_rcu_read_unlock();
                 return (u32)nspid == match->pid && nsid == match->nsid;
@@ -3085,6 +3146,10 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
         if (time_before(llcc->vtime_now[layer_id], p->scx.dsq_vtime))
                 llcc->vtime_now[layer_id] = p->scx.dsq_vtime;
 
+        /* Increment LLC run counter if stickiness is enabled */
+        if (layer->llc_sticky_runs > 0)
+                taskc->llc_runs++;
+
         cpuc->current_preempt = layer->preempt ||
                 (is_percpu_kthread(p) && is_percpu_kthread_preempting(p));
         cpuc->used_at = now;
@@ -3439,6 +3504,7 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
         taskc->layer_id = MAX_LAYERS;
         taskc->refresh_layer = true;
         taskc->llc_id = MAX_LLCS;
+        taskc->llc_runs = 0;
         taskc->qrt_layer_id = MAX_LLCS;
         taskc->qrt_llc_id = MAX_LLCS;
 
diff --git a/scheds/rust/scx_layered/src/config.rs b/scheds/rust/scx_layered/src/config.rs
index ea36be82b5..ba3d089758 100644
--- a/scheds/rust/scx_layered/src/config.rs
+++ b/scheds/rust/scx_layered/src/config.rs
@@ -132,6 +132,8 @@ pub struct LayerCommon {
     pub disallow_preempt_after_us: Option,
     #[serde(default)]
     pub xllc_mig_min_us: f64,
+    #[serde(default)]
+    pub llc_sticky_runs: u32,
     #[serde(default, skip_serializing)]
     pub idle_smt: Option,
     #[serde(default)]
diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs
index c619775719..67adf5d940 100644
--- a/scheds/rust/scx_layered/src/main.rs
+++ b/scheds/rust/scx_layered/src/main.rs
@@ -152,6 +152,7 @@ lazy_static! {
             disallow_open_after_us: None,
             disallow_preempt_after_us: None,
             xllc_mig_min_us: 1000.0,
+            llc_sticky_runs: 0,
             growth_algo: LayerGrowthAlgo::Sticky,
             idle_resume_us: None,
             perf: 1024,
@@ -188,6 +189,7 @@ lazy_static! {
             disallow_open_after_us: None,
             disallow_preempt_after_us: None,
             xllc_mig_min_us: 0.0,
+            llc_sticky_runs: 0,
             growth_algo: LayerGrowthAlgo::Sticky,
             perf: 1024,
             idle_resume_us: None,
@@ -229,6 +231,7 @@ lazy_static! {
             disallow_open_after_us: None,
             disallow_preempt_after_us: None,
             xllc_mig_min_us: 0.0,
+            llc_sticky_runs: 2,
             growth_algo: LayerGrowthAlgo::Topo,
             perf: 1024,
             idle_resume_us: None,
@@ -268,6 +271,7 @@ lazy_static! {
             disallow_open_after_us: None,
             disallow_preempt_after_us: None,
             xllc_mig_min_us: 100.0,
+            llc_sticky_runs: 0,
             growth_algo: LayerGrowthAlgo::Linear,
             perf: 1024,
             idle_resume_us: None,
@@ -1908,6 +1912,7 @@ impl<'a> Scheduler<'a> {
             disallow_open_after_us,
             disallow_preempt_after_us,
             xllc_mig_min_us,
+            llc_sticky_runs,
             placement,
             member_expire_ms,
             ..
@@ -1944,6 +1949,7 @@ impl<'a> Scheduler<'a> {
                 v => v * 1000,
             };
             layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
+            layer.llc_sticky_runs = *llc_sticky_runs;
             layer_weights.push(layer.weight.try_into().unwrap());
             layer.perf = u32::try_from(*perf)?;
             layer.node_mask = nodemask_from_nodes(nodes) as u64;
diff --git a/scheds/rust/scx_layered/src/stats.rs b/scheds/rust/scx_layered/src/stats.rs
index b12a9b23da..a168f1e25c 100644
--- a/scheds/rust/scx_layered/src/stats.rs
+++ b/scheds/rust/scx_layered/src/stats.rs
@@ -71,6 +71,7 @@ const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
 const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
 const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
 const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
+const LSTAT_LLC_STICKY_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_LLC_STICKY_SKIP as usize;
 const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
 const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
 const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
@@ -184,6 +185,8 @@ pub struct LayerStats {
     pub xllc_migration: f64,
     #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
     pub xllc_migration_skip: f64,
+    #[stat(desc = "% migration skipped across LLCs due to llc_sticky_runs")]
+    pub llc_sticky_skip: f64,
     #[stat(desc = "% wakers across layers")]
     pub xlayer_wake: f64,
     #[stat(desc = "% rewakers across layers where waker has waken the task previously")]
@@ -306,6 +309,7 @@ impl LayerStats {
             xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
             xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
             xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
+            llc_sticky_skip: lstat_pct(LSTAT_LLC_STICKY_SKIP),
             llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
             llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
             skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
@@ -378,13 +382,14 @@ impl LayerStats {
         writeln!(
             w,
-            " {:
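
For a quick read of the change: the stickiness gate added in pick_idle_cpu() and enforced again by the DSQ iterator in try_consume_layer() boils down to a single predicate. Below is a minimal sketch of that predicate as a hypothetical helper (not part of the patch), assuming only the llc_sticky_runs and llc_runs fields introduced above:

/*
 * Hypothetical helper, not in the patch: a task may leave its current LLC
 * only once it has run there at least layer->llc_sticky_runs times;
 * llc_sticky_runs == 0 disables the gate entirely.
 */
static __always_inline bool llc_migration_allowed(const struct layer *layer,
                                                  const struct task_ctx *taskc)
{
        return layer->llc_sticky_runs == 0 ||
               taskc->llc_runs >= layer->llc_sticky_runs;
}

Of the built-in layer templates touched in main.rs, only the LayerGrowthAlgo::Topo one opts in (llc_sticky_runs: 2); the others keep the gate disabled with llc_sticky_runs: 0.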