Commit 733c7ed

Merge pull request #2939 from sched-ext/bpfland-pref-idle-scan
scx_bpfland: Add --preferred-idle-scan option to prioritize faster CPUs
2 parents 7b79575 + 35c14c1 commit 733c7ed

File tree: 2 files changed (+182 −2 lines)

scheds/rust/scx_bpfland/src/bpf/main.bpf.c

Lines changed: 159 additions & 1 deletion
@@ -6,6 +6,12 @@
 #include <scx/percpu.bpf.h>
 #include "intf.h"
 
+/*
+ * Maximum amount of CPUs supported by the scheduler when flat or preferred
+ * idle CPU scan is enabled.
+ */
+#define MAX_CPUS 1024
+
 /*
  * Maximum rate of task wakeups/sec (tasks with a higher rate are capped to
  * this value).
@@ -75,6 +81,21 @@ const volatile bool local_pcpu = true;
  */
 volatile s64 cpufreq_perf_lvl;
 
+/*
+ * Enable preferred cores prioritization.
+ */
+const volatile bool preferred_idle_scan;
+
+/*
+ * CPUs sorted by their capacity in descendent order.
+ */
+const volatile u64 preferred_cpus[MAX_CPUS];
+
+/*
+ * Cache CPU capacity values.
+ */
+const volatile u64 cpu_capacity[MAX_CPUS];
+
 /*
  * Scheduling statistics.
  */
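To make the relationship between the two new rodata arrays concrete: preferred_cpus[] is indexed by rank (best capacity first) and holds CPU ids, while cpu_capacity[] is indexed directly by CPU id. The minimal user-space sketch below is illustrative only; the 4-CPU hybrid topology and the *_ex names are assumptions, not part of the patch:

/* Sketch (not from the patch): hypothetical topology with two big cores
 * (capacity 1024) and two little cores (capacity 512). */
#include <stdio.h>

#define NR_CPUS_EXAMPLE 4

static const unsigned long cpu_capacity_ex[NR_CPUS_EXAMPLE]   = { 1024, 1024, 512, 512 };
static const unsigned long preferred_cpus_ex[NR_CPUS_EXAMPLE] = { 0, 1, 2, 3 };

int main(void)
{
	/* Walk CPUs best-capacity-first, as pick_idle_cpu_pref_smt() does below. */
	for (int i = 0; i < NR_CPUS_EXAMPLE; i++) {
		unsigned long cpu = preferred_cpus_ex[i];

		printf("rank %d -> CPU %lu (capacity %lu)\n",
		       i, cpu, cpu_capacity_ex[cpu]);
	}
	return 0;
}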
@@ -252,6 +273,33 @@ static inline const struct cpumask *get_idle_cpumask(s32 cpu)
 	return __COMPAT_scx_bpf_get_idle_cpumask_node(__COMPAT_scx_bpf_cpu_node(cpu));
 }
 
+/*
+ * Return the cpumask of fully idle SMT cores within the NUMA node that
+ * contains @cpu.
+ *
+ * If NUMA support is disabled, @cpu is ignored.
+ */
+static inline const struct cpumask *get_idle_smtmask(s32 cpu)
+{
+	if (!numa_enabled)
+		return scx_bpf_get_idle_smtmask();
+
+	return __COMPAT_scx_bpf_get_idle_smtmask_node(__COMPAT_scx_bpf_cpu_node(cpu));
+}
+
+/*
+ * Return true if @cpu is valid, otherwise trigger an error and return
+ * false.
+ */
+static inline bool is_cpu_valid(s32 cpu)
+{
+	if (cpu < 0 || cpu >= MAX_CPUS) {
+		scx_bpf_error("invalid CPU id: %d", cpu);
+		return false;
+	}
+	return true;
+}
+
 /*
  * Return true if @this_cpu and @that_cpu are in the same LLC, false
  * otherwise.
@@ -261,6 +309,9 @@ static inline bool cpus_share_cache(s32 this_cpu, s32 that_cpu)
 	if (this_cpu == that_cpu)
 		return true;
 
+	if (!is_cpu_valid(this_cpu) || !is_cpu_valid(that_cpu))
+		return false;
+
 	return cpu_llc_id(this_cpu) == cpu_llc_id(that_cpu);
 }
 
@@ -272,7 +323,10 @@ static inline bool is_cpu_faster(s32 this_cpu, s32 that_cpu)
 	if (this_cpu == that_cpu)
 		return false;
 
-	return cpu_priority(this_cpu) > cpu_priority(that_cpu);
+	if (!is_cpu_valid(this_cpu) || !is_cpu_valid(that_cpu))
+		return false;
+
+	return cpu_capacity[this_cpu] > cpu_capacity[that_cpu];
 }
 
 /*
@@ -329,6 +383,102 @@ static inline bool is_wakeup(u64 wake_flags)
 	return wake_flags & SCX_WAKE_TTWU;
 }
 
+/*
+ * Try to pick the best idle CPU based on the @preferred_cpus ranking.
+ * Return a full-idle SMT core if @do_idle_smt is true, or any idle CPU if
+ * @do_idle_smt is false.
+ */
+static s32 pick_idle_cpu_pref_smt(struct task_struct *p, s32 prev_cpu, bool is_prev_allowed,
+				  const struct cpumask *primary, const struct cpumask *smt)
+{
+	u64 max_cpus = MIN(nr_cpu_ids, MAX_CPUS);
+	int i;
+
+	if (is_prev_allowed &&
+	    (!primary || bpf_cpumask_test_cpu(prev_cpu, primary)) &&
+	    (!smt || bpf_cpumask_test_cpu(prev_cpu, smt)) &&
+	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+		return prev_cpu;
+
+	bpf_for(i, 0, max_cpus) {
+		s32 cpu = preferred_cpus[i];
+
+		if ((cpu == prev_cpu) || !bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+			continue;
+
+		if ((!primary || bpf_cpumask_test_cpu(cpu, primary)) &&
+		    (!smt || bpf_cpumask_test_cpu(cpu, smt)) &&
+		    scx_bpf_test_and_clear_cpu_idle(cpu))
+			return cpu;
+	}
+
+	return -EBUSY;
+}
+
+/*
+ * Return the optimal idle CPU for task @p or -EBUSY if no idle CPU is
+ * found.
+ */
+static s32 pick_idle_cpu_scan(struct task_struct *p, s32 prev_cpu)
+{
+	const struct cpumask *smt, *primary;
+	bool is_prev_allowed = bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr);
+	s32 cpu;
+
+	primary = !primary_all ? cast_mask(primary_cpumask) : NULL;
+	smt = smt_enabled ? get_idle_smtmask(prev_cpu) : NULL;
+
+	/*
+	 * If the task can't migrate, there's no point looking for other
+	 * CPUs.
+	 */
+	if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) {
+		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+			cpu = prev_cpu;
+			goto out;
+		}
+	}
+
+	if (!primary_all) {
+		if (smt_enabled) {
+			/*
+			 * Try to pick a full-idle core in the primary
+			 * domain.
+			 */
+			cpu = pick_idle_cpu_pref_smt(p, prev_cpu, is_prev_allowed, primary, smt);
+			if (cpu >= 0)
+				goto out;
+		}
+
+		/*
+		 * Try to pick any idle CPU in the primary domain.
+		 */
+		cpu = pick_idle_cpu_pref_smt(p, prev_cpu, is_prev_allowed, primary, NULL);
+		if (cpu >= 0)
+			goto out;
+	}
+
+	if (smt_enabled) {
+		/*
+		 * Try to pick any full-idle core in the system.
+		 */
+		cpu = pick_idle_cpu_pref_smt(p, prev_cpu, is_prev_allowed, NULL, smt);
+		if (cpu >= 0)
+			goto out;
+	}
+
+	/*
+	 * Try to pick any idle CPU in the system.
+	 */
+	cpu = pick_idle_cpu_pref_smt(p, prev_cpu, is_prev_allowed, NULL, NULL);
+
+out:
+	if (smt)
+		scx_bpf_put_cpumask(smt);
+
+	return cpu;
+}
+
 /*
  * Pick an optimal idle CPU for task @p (as close as possible to
  * @prev_cpu).
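To summarize the cascade implemented by pick_idle_cpu_scan() above: every pass walks preferred_cpus[] best-capacity-first and claims the first idle CPU that also satisfies the pass's constraints, falling back from "full-idle core in the primary domain" to "any idle CPU anywhere". The user-space model below is a sketch under assumptions: the stubbed predicates and the example topology are hypothetical, and the real code tests BPF cpumasks and skips the primary/SMT passes when a restricted primary domain or SMT is not in effect.

/* Minimal user-space model of the pick_idle_cpu_scan() fallback order
 * (not part of the patch). */
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the cpumask tests and the idle-state claim. */
static bool in_primary(int cpu) { return cpu < 2; }        /* primary domain = {0, 1} */
static bool core_fully_idle(int cpu) { return cpu == 3; }  /* only CPU 3's core is fully idle */
static bool cpu_is_idle(int cpu) { return cpu == 1 || cpu == 3; }

/* One scan pass over the capacity-ranked CPU list. */
static int scan(bool want_primary, bool want_smt)
{
	static const int preferred_cpus_ex[] = { 0, 1, 2, 3 };

	for (int i = 0; i < 4; i++) {
		int cpu = preferred_cpus_ex[i];

		if (want_primary && !in_primary(cpu))
			continue;
		if (want_smt && !core_fully_idle(cpu))
			continue;
		if (cpu_is_idle(cpu))
			return cpu;
	}
	return -1;
}

int main(void)
{
	int cpu;

	/* Same cascade as pick_idle_cpu_scan(): full-idle core in the primary
	 * domain, any idle CPU in the primary domain, full-idle core anywhere,
	 * any idle CPU anywhere. */
	if ((cpu = scan(true, true)) < 0 &&
	    (cpu = scan(true, false)) < 0 &&
	    (cpu = scan(false, true)) < 0)
		cpu = scan(false, false);

	printf("picked CPU %d\n", cpu); /* -> CPU 1: idle and inside the primary domain */
	return 0;
}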
@@ -341,6 +491,14 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, s32 this_cpu,
 	const struct cpumask *primary = cast_mask(primary_cpumask);
 	s32 cpu;
 
+	/*
+	 * Use lightweight idle CPU scanning when flat or preferred idle
+	 * scan is enabled, unless the system is busy, in which case the
+	 * cpumask-based scanning is more efficient.
+	 */
+	if (preferred_idle_scan)
+		return pick_idle_cpu_scan(p, prev_cpu);
+
 	/*
	 * Clear the wake sync bit if synchronous wakeups are disabled.
	 */

scheds/rust/scx_bpfland/src/main.rs

Lines changed: 23 additions & 1 deletion
@@ -11,7 +11,7 @@ pub mod bpf_intf;
 pub use bpf_intf::*;
 
 mod stats;
-use std::ffi::c_int;
+use std::ffi::{c_int, c_ulong};
 use std::fmt::Write;
 use std::mem::MaybeUninit;
 use std::sync::atomic::AtomicBool;
@@ -197,6 +197,13 @@ struct Opts {
     #[clap(short = 'm', long, default_value = "auto")]
     primary_domain: String,
 
+    /// Enable preferred idle CPU scanning.
+    ///
+    /// With this option enabled, the scheduler will prioritize assigning tasks to higher-ranked
+    /// cores before considering lower-ranked ones.
+    #[clap(short = 'P', long, action = clap::ArgAction::SetTrue)]
+    preferred_idle_scan: bool,
+
     /// Disable SMT awareness.
     #[clap(long, action = clap::ArgAction::SetTrue)]
     disable_smt: bool,
@@ -333,6 +340,21 @@ impl<'a> Scheduler<'a> {
         rodata.throttle_ns = opts.throttle_us * 1000;
         rodata.primary_all = domain.weight() == *NR_CPU_IDS;
 
+        // Generate the list of available CPUs sorted by capacity in descending order.
+        let mut cpus: Vec<_> = topo.all_cpus.values().collect();
+        cpus.sort_by_key(|cpu| std::cmp::Reverse(cpu.cpu_capacity));
+        for (i, cpu) in cpus.iter().enumerate() {
+            rodata.cpu_capacity[cpu.id] = cpu.cpu_capacity as c_ulong;
+            rodata.preferred_cpus[i] = cpu.id as u64;
+        }
+        if opts.preferred_idle_scan {
+            info!(
+                "Preferred CPUs: {:?}",
+                &rodata.preferred_cpus[0..cpus.len()]
+            );
+        }
+        rodata.preferred_idle_scan = opts.preferred_idle_scan;
+
         // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
         // (it's never a good idea to throttle per-CPU kthreads).
         rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;
