
Commit 57f60dc

scx_layered: Add per-layer LLC stickiness control
Add a new configuration option `llc_sticky_runs` that controls how many times a task must run on its current LLC before being allowed to migrate to a different LLC. This provides finer-grained control over LLC locality and complements the existing `xllc_mig_min_us` option.

The implementation tracks the number of consecutive runs on the current LLC in the task context (`taskc->llc_runs`). When the scheduler attempts to find an idle CPU and no idle CPUs are available on the current LLC, it checks whether the task has run fewer times than the configured threshold. If so, cross-LLC migration is prevented, keeping the task sticky to its current LLC to preserve cache locality.

Key changes:
- Add `llc_runs` counter to `task_ctx` to track consecutive runs on the current LLC
- Add `llc_sticky_runs` configuration field to the `layer` struct and to `LayerCommon` in config.rs
- Increment `llc_runs` in `layered_running()` when stickiness is enabled
- Reset `llc_runs` to 0 in `maybe_update_task_llc()` on LLC migration
- Add a stickiness check in `pick_idle_cpu()` to prevent cross-LLC migration when the task hasn't run enough times on its current LLC
- Add an `LSTAT_LLC_STICKY_SKIP` statistic to track prevented migrations
- Initialize `llc_runs` to 0 in `layered_init_task()`

The feature is disabled by default (`llc_sticky_runs = 0`) and can be configured per layer. Example configuration:

    {
      "name": "batch",
      "kind": {
        "Confined": {
          "llc_sticky_runs": 20,
          "xllc_mig_min_us": 1000.0
        }
      }
    }

This keeps tasks on their current LLC for 20 runs before allowing cross-LLC migration, providing stronger LLC affinity for workloads that benefit from cache locality.

The new statistic appears in the output as:

    xllc_mig/skip/sticky_skip={xllc_migration%}/{xllc_skip%}/{llc_sticky_skip%}

Signed-off-by: Daniel Hodges <[email protected]>
1 parent 9d2bf93 commit 57f60dc
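To make the mechanism described in the commit message concrete, here is a minimal user-space model of the run-counter lifecycle. This is plain Rust, not the BPF implementation; `TaskCtx`, `Layer`, and the three functions are illustrative stand-ins that mirror the BPF fields and hooks shown in the diffs below.

// Illustrative model only; the real logic lives in main.bpf.c below.
#[derive(Default)]
struct TaskCtx {
    llc_id: u32,   // mirrors taskc->llc_id
    llc_runs: u32, // mirrors taskc->llc_runs
}

struct Layer {
    llc_sticky_runs: u32, // 0 disables stickiness
}

// Counterpart of layered_running(): count runs on the current LLC.
fn on_running(taskc: &mut TaskCtx, layer: &Layer) {
    if layer.llc_sticky_runs > 0 {
        taskc.llc_runs += 1;
    }
}

// Counterpart of maybe_update_task_llc(): a migration restarts the counter.
fn on_llc_change(taskc: &mut TaskCtx, new_llc_id: u32) {
    taskc.llc_id = new_llc_id;
    taskc.llc_runs = 0;
}

// Counterpart of the pick_idle_cpu() check: refuse cross-LLC picks until
// the task has met the threshold.
fn may_migrate_across_llc(taskc: &TaskCtx, layer: &Layer) -> bool {
    layer.llc_sticky_runs == 0 || taskc.llc_runs >= layer.llc_sticky_runs
}

fn main() {
    let layer = Layer { llc_sticky_runs: 3 };
    let mut taskc = TaskCtx::default();

    on_running(&mut taskc, &layer);
    on_running(&mut taskc, &layer);
    assert!(!may_migrate_across_llc(&taskc, &layer)); // 2 runs: still sticky

    on_running(&mut taskc, &layer);
    assert!(may_migrate_across_llc(&taskc, &layer)); // 3 runs: may migrate

    on_llc_change(&mut taskc, 1); // moved to LLC 1: counter starts over
    assert_eq!(taskc.llc_id, 1);
    assert!(!may_migrate_across_llc(&taskc, &layer));
}

The real scheduler performs these steps in layered_running(), maybe_update_task_llc(), and pick_idle_cpu(), as the main.bpf.c diff below shows.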

File tree

5 files changed: +84 -3 lines changed

scheds/rust/scx_layered/src/bpf/intf.h

Lines changed: 2 additions & 0 deletions
@@ -148,6 +148,7 @@ enum layer_stat_id {
         LSTAT_XNUMA_MIGRATION,
         LSTAT_XLLC_MIGRATION,
         LSTAT_XLLC_MIGRATION_SKIP,
+        LSTAT_LLC_STICKY_SKIP,
         LSTAT_XLAYER_WAKE,
         LSTAT_XLAYER_REWAKE,
         LSTAT_LLC_DRAIN_TRY,
@@ -348,6 +349,7 @@ struct layer {
         u64 disallow_open_after_ns;
         u64 disallow_preempt_after_ns;
         u64 xllc_mig_min_ns;
+        u32 llc_sticky_runs;

         int kind;
         bool preempt;

scheds/rust/scx_layered/src/bpf/main.bpf.c

Lines changed: 68 additions & 2 deletions
@@ -611,6 +611,7 @@ struct task_ctx {
         u64 runtime_avg;
         u64 dsq_id;
         u32 llc_id;
+        u32 llc_runs;

         /* for llcc->queue_runtime */
         u32 qrt_layer_id;
@@ -1281,6 +1282,14 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
         if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask, layer)) >= 0)
                 goto out_put;

+        /* Check if task is sticky to current LLC */
+        if (layer->llc_sticky_runs > 0 &&
+            taskc->llc_runs < layer->llc_sticky_runs) {
+                lstat_inc(LSTAT_LLC_STICKY_SKIP, layer, cpuc);
+                cpu = -1;
+                goto out_put;
+        }
+
         if (!(prev_llcc = lookup_llc_ctx(prev_cpuc->llc_id)) ||
             prev_llcc->queued_runtime[layer_id] < layer->xllc_mig_min_ns) {
                 lstat_inc(LSTAT_XLLC_MIGRATION_SKIP, layer, cpuc);
@@ -1365,6 +1374,7 @@ bool maybe_update_task_llc(struct task_struct *p, struct task_ctx *taskc, s32 ne
         p->scx.dsq_vtime = new_llcc->vtime_now[layer_id] + vtime_delta;

         taskc->llc_id = new_llc_id;
+        taskc->llc_runs = 0;
         return true;
 }

@@ -2174,6 +2184,7 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
         u32 nid = llc_node_id(llcc->id);
         bool xllc_mig_skipped = false;
         bool skip_remote_node;
+        u64 dsq_id;
         u32 u;

         if (!(layer = lookup_layer(layer_id)))
@@ -2196,6 +2207,8 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
                 return false;
         }

+        dsq_id = layer_dsq_id(layer_id, *llc_idp);
+
         if (u > 0) {
                 struct llc_ctx *remote_llcc;

@@ -2211,9 +2224,42 @@ static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc
                                 xllc_mig_skipped = true;
                                 continue;
                         }
+
+                        /*
+                         * For remote LLC DSQs with LLC stickiness enabled, use
+                         * DSQ iterator to validate tasks can migrate before
+                         * dispatching directly.
+                         */
+                        if (layer->llc_sticky_runs > 0 && bpf_ksym_exists(scx_bpf_dsq_move)) {
+                                struct task_struct *p;
+                                bool dispatched = false;
+
+                                bpf_for_each(scx_dsq, p, dsq_id, 0) {
+                                        struct task_ctx *taskc;
+
+                                        if (!(taskc = lookup_task_ctx(p)))
+                                                continue;
+
+                                        if (taskc->llc_runs < layer->llc_sticky_runs) {
+                                                lstat_inc(LSTAT_LLC_STICKY_SKIP, layer, cpuc);
+                                                continue;
+                                        }
+
+                                        if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
+                                                             SCX_DSQ_LOCAL, 0)) {
+                                                dispatched = true;
+                                                break;
+                                        }
+                                }
+
+                                if (dispatched)
+                                        return true;
+
+                                continue;
+                        }
                 }

-                if (scx_bpf_dsq_move_to_local(layer_dsq_id(layer_id, *llc_idp)))
+                if (scx_bpf_dsq_move_to_local(dsq_id))
                         return true;
         }

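The try_consume_layer() change above can be read as: instead of unconditionally pulling the head of a remote LLC's DSQ, walk the queue and only move a task whose run counter has reached the layer threshold. A minimal sketch of that selection rule (plain Rust, not BPF; the slice of llc_runs values is a stand-in for the DSQ iterator):

// Stand-in for the bpf_for_each(scx_dsq, ...) walk: `queued_runs` holds the
// llc_runs value of each task queued on the remote LLC's DSQ, in queue order.
fn first_migratable(queued_runs: &[u32], llc_sticky_runs: u32) -> Option<usize> {
    // Tasks below the threshold are skipped (LSTAT_LLC_STICKY_SKIP in the BPF
    // code); the first task at or above it would be moved to the local DSQ.
    queued_runs.iter().position(|&runs| runs >= llc_sticky_runs)
}

fn main() {
    let remote_dsq = [1, 2, 5, 0, 9];
    assert_eq!(first_migratable(&remote_dsq, 4), Some(2)); // third task may move
    assert_eq!(first_migratable(&remote_dsq, 10), None);   // everyone still sticky
}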
@@ -2546,7 +2592,22 @@ static __noinline bool match_one(struct layer *layer, struct layer_match *match,
                         bpf_rcu_read_unlock();
                         return result;
                 }
-                pid_t nspid = get_pid_nr_ns(p_pid, pid_ns);
+
+                /* Inline get_pid_nr_ns logic to avoid RCU lock crossing
+                 * function boundary, this all depends on if it gets inlined so
+                 * we can't just do:
+                 * pid_t nspid = get_pid_nr_ns(p_pid, pid_ns);
+                 */
+                pid_t nspid = 0;
+                int level = BPF_CORE_READ(p_pid, level);
+                int ns_level = BPF_CORE_READ(pid_ns, level);
+                if (ns_level <= level) {
+                        struct upid upid;
+                        upid = BPF_CORE_READ(p_pid, numbers[ns_level]);
+                        if (upid.ns == pid_ns)
+                                nspid = upid.nr;
+                }
+
                 u64 nsid = BPF_CORE_READ(pid_ns, ns.inum);
                 bpf_rcu_read_unlock();
                 return (u32)nspid == match->pid && nsid == match->nsid;
@@ -3085,6 +3146,10 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
         if (time_before(llcc->vtime_now[layer_id], p->scx.dsq_vtime))
                 llcc->vtime_now[layer_id] = p->scx.dsq_vtime;

+        /* Increment LLC run counter if stickiness is enabled */
+        if (layer->llc_sticky_runs > 0)
+                taskc->llc_runs++;
+
         cpuc->current_preempt = layer->preempt ||
                 (is_percpu_kthread(p) && is_percpu_kthread_preempting(p));
         cpuc->used_at = now;
@@ -3439,6 +3504,7 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
         taskc->layer_id = MAX_LAYERS;
         taskc->refresh_layer = true;
         taskc->llc_id = MAX_LLCS;
+        taskc->llc_runs = 0;
         taskc->qrt_layer_id = MAX_LLCS;
         taskc->qrt_llc_id = MAX_LLCS;

scheds/rust/scx_layered/src/config.rs

Lines changed: 2 additions & 0 deletions
@@ -132,6 +132,8 @@ pub struct LayerCommon {
     pub disallow_preempt_after_us: Option<u64>,
     #[serde(default)]
     pub xllc_mig_min_us: f64,
+    #[serde(default)]
+    pub llc_sticky_runs: u32,
     #[serde(default, skip_serializing)]
     pub idle_smt: Option<bool>,
     #[serde(default)]
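Because the new field carries `#[serde(default)]`, existing layer configs that omit `llc_sticky_runs` still parse and get 0, i.e. stickiness stays disabled. A small sketch of that behavior, assuming the serde and serde_json crates and using a trimmed stand-in struct rather than the real `LayerCommon`:

use serde::Deserialize;

// Trimmed stand-in for LayerCommon, keeping only the two fields relevant here.
#[derive(Deserialize, Debug)]
struct LayerCommonSketch {
    #[serde(default)]
    xllc_mig_min_us: f64,
    #[serde(default)]
    llc_sticky_runs: u32,
}

fn main() -> Result<(), serde_json::Error> {
    // Field present: value is taken from the config.
    let sticky: LayerCommonSketch =
        serde_json::from_str(r#"{"xllc_mig_min_us": 1000.0, "llc_sticky_runs": 20}"#)?;
    assert_eq!(sticky.xllc_mig_min_us, 1000.0);
    assert_eq!(sticky.llc_sticky_runs, 20);

    // Field absent: defaults to 0, i.e. the feature stays disabled.
    let legacy: LayerCommonSketch = serde_json::from_str(r#"{}"#)?;
    assert_eq!(legacy.llc_sticky_runs, 0);
    Ok(())
}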

scheds/rust/scx_layered/src/main.rs

Lines changed: 6 additions & 0 deletions
@@ -152,6 +152,7 @@ lazy_static! {
            disallow_open_after_us: None,
            disallow_preempt_after_us: None,
            xllc_mig_min_us: 1000.0,
+           llc_sticky_runs: 0,
            growth_algo: LayerGrowthAlgo::Sticky,
            idle_resume_us: None,
            perf: 1024,
@@ -188,6 +189,7 @@ lazy_static! {
            disallow_open_after_us: None,
            disallow_preempt_after_us: None,
            xllc_mig_min_us: 0.0,
+           llc_sticky_runs: 0,
            growth_algo: LayerGrowthAlgo::Sticky,
            perf: 1024,
            idle_resume_us: None,
@@ -229,6 +231,7 @@ lazy_static! {
            disallow_open_after_us: None,
            disallow_preempt_after_us: None,
            xllc_mig_min_us: 0.0,
+           llc_sticky_runs: 2,
            growth_algo: LayerGrowthAlgo::Topo,
            perf: 1024,
            idle_resume_us: None,
@@ -268,6 +271,7 @@ lazy_static! {
            disallow_open_after_us: None,
            disallow_preempt_after_us: None,
            xllc_mig_min_us: 100.0,
+           llc_sticky_runs: 0,
            growth_algo: LayerGrowthAlgo::Linear,
            perf: 1024,
            idle_resume_us: None,
@@ -1908,6 +1912,7 @@ impl<'a> Scheduler<'a> {
                disallow_open_after_us,
                disallow_preempt_after_us,
                xllc_mig_min_us,
+               llc_sticky_runs,
                placement,
                member_expire_ms,
                ..
@@ -1944,6 +1949,7 @@ impl<'a> Scheduler<'a> {
                v => v * 1000,
            };
            layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
+           layer.llc_sticky_runs = *llc_sticky_runs;
            layer_weights.push(layer.weight.try_into().unwrap());
            layer.perf = u32::try_from(*perf)?;
            layer.node_mask = nodemask_from_nodes(nodes) as u64;

scheds/rust/scx_layered/src/stats.rs

Lines changed: 6 additions & 1 deletion
@@ -71,6 +71,7 @@ const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
 const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
 const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
 const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
+const LSTAT_LLC_STICKY_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_LLC_STICKY_SKIP as usize;
 const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
 const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
 const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
@@ -184,6 +185,8 @@ pub struct LayerStats {
     pub xllc_migration: f64,
     #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
     pub xllc_migration_skip: f64,
+    #[stat(desc = "% migration skipped across LLCs due to llc_sticky_runs")]
+    pub llc_sticky_skip: f64,
     #[stat(desc = "% wakers across layers")]
     pub xlayer_wake: f64,
     #[stat(desc = "% rewakers across layers where waker has waken the task previously")]
@@ -306,6 +309,7 @@ impl LayerStats {
            xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
            xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
            xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
+           llc_sticky_skip: lstat_pct(LSTAT_LLC_STICKY_SKIP),
            llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
            llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
            skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
@@ -378,13 +382,14 @@ impl LayerStats {

        writeln!(
            w,
-           " {:<width$} open_idle={} mig={} xnuma_mig={} xllc_mig/skip={}/{} affn_viol={}",
+           " {:<width$} open_idle={} mig={} xnuma_mig={} xllc_mig/skip/sticky_skip={}/{}/{} affn_viol={}",
            "",
            fmt_pct(self.open_idle),
            fmt_pct(self.migration),
            fmt_pct(self.xnuma_migration),
            fmt_pct(self.xllc_migration),
            fmt_pct(self.xllc_migration_skip),
+           fmt_pct(self.llc_sticky_skip),
            fmt_pct(self.affn_viol),
            width = header_width,
        )?;
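For illustration only, the extended stats line can be exercised with made-up percentages; the `fmt_pct` below is a simplified stand-in for scx_layered's helper, not its actual implementation:

// Simplified stand-in for scx_layered's fmt_pct; for illustration only.
fn fmt_pct(v: f64) -> String {
    format!("{:5.2}", v)
}

fn main() {
    // Made-up values: 1.25% cross-LLC migrations, 0.40% skipped due to
    // xllc_mig_min_us, 7.80% skipped due to llc_sticky_runs.
    let (xllc_mig, xllc_skip, sticky_skip) = (1.25_f64, 0.40_f64, 7.80_f64);
    println!(
        "xllc_mig/skip/sticky_skip={}/{}/{}",
        fmt_pct(xllc_mig),
        fmt_pct(xllc_skip),
        fmt_pct(sticky_skip)
    );
    // Prints: xllc_mig/skip/sticky_skip= 1.25/ 0.40/ 7.80
}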
