Skip to content

Commit c90883b

Browse files
authored
Merge pull request #1683 from multics69/lavd-pelt2
scx_lavd: Calculate scaled and invariant CPU utilization.
2 parents 8ade00e + a13d6e5 commit c90883b

File tree

8 files changed

+112
-19
lines changed

8 files changed

+112
-19
lines changed

scheds/rust/scx_lavd/src/bpf/intf.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ enum {
6464
*/
6565
struct sys_stat {
6666
u64 last_update_clk;
67-
u64 util; /* average of the CPU utilization */
67+
u64 avg_util; /* average of the CPU utilization */
68+
u64 avg_sc_util; /* average of the scaled CPU utilization,
69+
which is capacity and frequency invariant */
6870

6971
u64 avg_svc_time; /* average service time per task */
7072
u64 nr_queued_task;
@@ -112,7 +114,7 @@ struct task_ctx {
112114
u64 run_freq; /* scheduling frequency in a second */
113115
u64 wait_freq; /* waiting frequency in a second */
114116
u64 wake_freq; /* waking-up frequency in a second */
115-
u64 svc_time; /* total CPU time consumed for this task */
117+
u64 svc_time; /* total CPU time consumed for this task scaled by task's weight */
116118
u64 dsq_id; /* DSQ id where a task run for statistics */
117119

118120
/*
@@ -143,6 +145,7 @@ struct task_ctx_x {
143145
u16 static_prio; /* nice priority */
144146
u32 cpu_id; /* where a task ran */
145147
u64 cpu_util; /* cpu utilization in [0..100] */
148+
u64 cpu_sutil; /* scaled cpu utilization in [0..100] */
146149
u32 thr_perf_cri; /* performance criticality threshold */
147150
u32 avg_lat_cri; /* average latency criticality */
148151
u32 nr_active; /* number of active cores */

scheds/rust/scx_lavd/src/bpf/introspec.bpf.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ int submit_task_ctx(struct task_struct *p, struct task_ctx *taskc, u32 cpu_id)
3737
__builtin_memcpy_inline(m->taskc_x.comm, p->comm, TASK_COMM_LEN);
3838
m->taskc_x.static_prio = get_nice_prio(p);
3939
m->taskc_x.cpu_util = s2p(cpuc->avg_util);
40+
m->taskc_x.cpu_sutil = s2p(cpuc->avg_sc_util);
4041
m->taskc_x.cpu_id = cpu_id;
4142
m->taskc_x.avg_lat_cri = sys_stat.avg_lat_cri;
4243
m->taskc_x.thr_perf_cri = sys_stat.thr_perf_cri;

scheds/rust/scx_lavd/src/bpf/lavd.bpf.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,13 +96,17 @@ struct cpu_ctx {
9696
*/
9797
volatile u32 avg_util; /* average of the CPU utilization */
9898
volatile u32 cur_util; /* CPU utilization of the current interval */
99+
volatile u32 avg_sc_util; /* average of the scaled CPU utilization, which is capacity and frequency invariant. */
100+
volatile u32 cur_sc_util; /* the scaled CPU utilization of the current interval, which is capacity and frequency invariant. */
99101
volatile u64 idle_total; /* total idle time so far */
100102
volatile u64 idle_start_clk; /* when the CPU becomes idle */
101103

102104
/*
103105
* Information used to keep track of load
104106
*/
105-
volatile u64 tot_svc_time; /* total service time on a CPU */
107+
volatile u64 tot_svc_time; /* total service time on a CPU scaled by tasks' weights */
108+
volatile u64 tot_sc_time; /* total scaled CPU time, which is capacity and frequency invariant. */
109+
volatile u64 cpu_release_clk; /* when the CPU is taken by higher-priority scheduler class */
106110

107111
/*
108112
* Information used to keep track of latency criticality

scheds/rust/scx_lavd/src/bpf/main.bpf.c

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,12 @@ static void update_stat_for_stopping(struct task_struct *p,
673673
if (READ_ONCE(cur_svc_time) < taskc->svc_time)
674674
WRITE_ONCE(cur_svc_time, taskc->svc_time);
675675

676+
/*
677+
* Increase total scaled CPU time of this CPU,
678+
* which is capacity and frequency invariant.
679+
*/
680+
cpuc->tot_sc_time += scale_cap_freq(task_runtime, cpuc->cpu_id);
681+
676682
/*
677683
* Reset task's lock and futex boost count
678684
* for a lock holder to be boosted only once.
@@ -1317,20 +1323,52 @@ void BPF_STRUCT_OPS(lavd_set_cpumask, struct task_struct *p,
13171323
set_on_core_type(taskc, cpumask);
13181324
}
13191325

1320-
void BPF_STRUCT_OPS(lavd_cpu_release, s32 cpu,
1321-
struct scx_cpu_release_args *args)
1326+
void BPF_STRUCT_OPS(lavd_cpu_acquire, s32 cpu,
1327+
struct scx_cpu_acquire_args *args)
13221328
{
13231329
struct cpu_ctx *cpuc;
1330+
u64 dur, scaled_dur;
1331+
1332+
cpuc = get_cpu_ctx_id(cpu);
1333+
if (!cpuc) {
1334+
scx_bpf_error("Failed to lookup cpu_ctx %d", cpu);
1335+
return;
1336+
}
13241337

13251338
/*
1326-
* When a CPU is released to serve higher priority scheduler class,
1327-
* reset the CPU's preemption information so it cannot be a victim.
1339+
* When regaining control of a CPU under the higher priority scheduler
1340+
* class, measure how much time the higher priority scheduler class
1341+
* used -- i.e., [lavd_cpu_release, lavd_cpu_acquire]. This will be
1342+
* used to calculate capacity-invariant and frequency-invariant CPU
1343+
* utilization.
1344+
*/
1345+
dur = time_delta(scx_bpf_now(), cpuc->cpu_release_clk);
1346+
scaled_dur = scale_cap_freq(dur, cpu);
1347+
cpuc->tot_sc_time += scaled_dur;
1348+
1349+
/*
1350+
* The higher-priority scheduler class could change the CPU frequency,
1351+
* so let's keep track of the frequency when we gain the CPU control.
1352+
* This helps to make the frequency update decision.
13281353
*/
1354+
cpuc->cpuperf_cur = scx_bpf_cpuperf_cur(cpu);
1355+
}
1356+
1357+
void BPF_STRUCT_OPS(lavd_cpu_release, s32 cpu,
1358+
struct scx_cpu_release_args *args)
1359+
{
1360+
struct cpu_ctx *cpuc;
1361+
13291362
cpuc = get_cpu_ctx_id(cpu);
13301363
if (!cpuc) {
13311364
scx_bpf_error("Failed to lookup cpu_ctx %d", cpu);
13321365
return;
13331366
}
1367+
1368+
/*
1369+
* When a CPU is released to serve higher priority scheduler class,
1370+
* reset the CPU's preemption information so it cannot be a victim.
1371+
*/
13341372
reset_cpu_preemption_info(cpuc, true);
13351373

13361374
/*
@@ -1343,6 +1381,13 @@ void BPF_STRUCT_OPS(lavd_cpu_release, s32 cpu,
13431381
* the target properly after regaining the control.
13441382
*/
13451383
reset_cpuperf_target(cpuc);
1384+
1385+
/*
1386+
* Keep track of when the higher-priority scheduler class takes
1387+
* the CPUto calculate capacity-invariant and frequency-invariant
1388+
* CPU utilization.
1389+
*/
1390+
cpuc->cpu_release_clk = scx_bpf_now();
13461391
}
13471392

13481393
void BPF_STRUCT_OPS(lavd_enable, struct task_struct *p)
@@ -1566,6 +1611,7 @@ static s32 init_per_cpu_ctx(u64 now)
15661611
cpuc->stopping_tm_est_ns = SCX_SLICE_INF;
15671612
cpuc->online_clk = now;
15681613
cpuc->offline_clk = now;
1614+
cpuc->cpu_release_clk = now;
15691615
cpuc->is_online = bpf_cpumask_test_cpu(cpu, online_cpumask);
15701616
cpuc->capacity = get_cpuperf_cap(cpu);
15711617
cpuc->cpdom_poll_pos = cpu % LAVD_CPDOM_MAX_NR;
@@ -1745,6 +1791,7 @@ SCX_OPS_DEFINE(lavd_ops,
17451791
.cpu_offline = (void *)lavd_cpu_offline,
17461792
.update_idle = (void *)lavd_update_idle,
17471793
.set_cpumask = (void *)lavd_set_cpumask,
1794+
.cpu_acquire = (void *)lavd_cpu_acquire,
17481795
.cpu_release = (void *)lavd_cpu_release,
17491796
.enable = (void *)lavd_enable,
17501797
.init_task = (void *)lavd_init_task,

scheds/rust/scx_lavd/src/bpf/power.bpf.c

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ static u64 calc_nr_active_cpus(void)
6060
/*
6161
* nr_active = ceil(nr_cpus_onln * cpu_util * per_core_max_util)
6262
*/
63-
nr_active = ((nr_cpus_onln * sys_stat.util) << LAVD_SHIFT) + p2s(50);
63+
nr_active = ((nr_cpus_onln * sys_stat.avg_util) << LAVD_SHIFT) + p2s(50);
6464
nr_active /= (LAVD_CC_PER_CORE_MAX_CTUIL << LAVD_SHIFT);
6565

6666
/*
@@ -299,23 +299,23 @@ static int do_autopilot(void)
299299
* performance is not required. We run the scheduler in powersave mode
300300
* to save energy consumption.
301301
*/
302-
if (sys_stat.util <= LAVD_AP_LOW_UTIL)
303-
return do_set_power_profile(LAVD_PM_POWERSAVE, sys_stat.util);
302+
if (sys_stat.avg_util <= LAVD_AP_LOW_UTIL)
303+
return do_set_power_profile(LAVD_PM_POWERSAVE, sys_stat.avg_util);
304304

305305
/*
306306
* If the CPU utilization is moderate (say > 5%, <= 30%), we run the
307307
* scheduler in balanced mode. Actually, balanced mode can save energy
308308
* consumption only under moderate CPU load.
309309
*/
310-
if (sys_stat.util <= LAVD_AP_HIGH_UTIL)
311-
return do_set_power_profile(LAVD_PM_BALANCED, sys_stat.util);
310+
if (sys_stat.avg_util <= LAVD_AP_HIGH_UTIL)
311+
return do_set_power_profile(LAVD_PM_BALANCED, sys_stat.avg_util);
312312

313313
/*
314314
* If the CPU utilization is high enough (say > 30%), we run the
315315
* scheduler in performance mode. The system indeed needs performance
316316
* and there is little energy benefit even under balanced mode anyway.
317317
*/
318-
return do_set_power_profile(LAVD_PM_PERFORMANCE, sys_stat.util);
318+
return do_set_power_profile(LAVD_PM_PERFORMANCE, sys_stat.avg_util);
319319
}
320320

321321
static void update_thr_perf_cri(void)
@@ -521,6 +521,21 @@ static u16 get_cputurbo_cap(void)
521521
return turbo_cap;
522522
}
523523

524+
static u64 scale_cap_freq(u64 dur, s32 cpu)
525+
{
526+
u64 cap, freq, scaled_dur;
527+
528+
/*
529+
* Scale the duration by CPU capacity and frequency to calculate a
530+
* capacity-invariant and frequency-invariant time duration.
531+
*/
532+
cap = get_cpuperf_cap(cpu);
533+
freq = scx_bpf_cpuperf_cur(cpu);
534+
scaled_dur = (dur * cap * freq) / (LAVD_SCALE * LAVD_SCALE);
535+
536+
return scaled_dur;
537+
}
538+
524539
static void init_autopilot_low_util(void)
525540
{
526541
if (nr_cpus_big < nr_cpus_onln) {

scheds/rust/scx_lavd/src/bpf/sys_stat.bpf.c

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ struct sys_stat_ctx {
2929
u64 idle_total;
3030
u64 compute_total;
3131
u64 tot_svc_time;
32+
u64 tot_sc_time;
3233
u64 nr_queued_task;
3334
s32 max_lat_cri;
3435
s32 avg_lat_cri;
@@ -47,6 +48,7 @@ struct sys_stat_ctx {
4748
u64 sum_perf_cri;
4849
u32 thr_perf_cri;
4950
u32 cur_util;
51+
u32 cur_sc_util;
5052
u32 nr_violation;
5153
};
5254

@@ -134,6 +136,20 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
134136
c->tot_svc_time += cpuc->tot_svc_time;
135137
cpuc->tot_svc_time = 0;
136138

139+
/*
140+
* Update scaled CPU utilization,
141+
* which is capacity and frequency invariant.
142+
*/
143+
cpuc->cur_sc_util = (cpuc->tot_sc_time << LAVD_SHIFT) / c->duration;
144+
cpuc->avg_sc_util = calc_avg(cpuc->avg_sc_util, cpuc->cur_sc_util);
145+
146+
/*
147+
* Accumulate cpus' scaled loads,
148+
* which is capacity and frequency invariant.
149+
*/
150+
c->tot_sc_time += cpuc->tot_sc_time;
151+
cpuc->tot_sc_time = 0;
152+
137153
/*
138154
* Accumulate statistics.
139155
*/
@@ -208,7 +224,7 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
208224
if (c->duration > cpuc->idle_total)
209225
compute = c->duration - cpuc->idle_total;
210226

211-
cpuc->cur_util = (compute * LAVD_SCALE) / c->duration;
227+
cpuc->cur_util = (compute << LAVD_SHIFT) / c->duration;
212228
cpuc->avg_util = calc_avg(cpuc->avg_util, cpuc->cur_util);
213229

214230
if (cpuc->turbo_core) {
@@ -247,7 +263,8 @@ static void calc_sys_stat(struct sys_stat_ctx *c)
247263
c->compute_total = c->duration_total - c->idle_total;
248264
else
249265
c->compute_total = 0;
250-
c->cur_util = (c->compute_total * LAVD_SCALE) / c->duration_total;
266+
c->cur_util = (c->compute_total << LAVD_SHIFT) / c->duration_total;
267+
c->cur_sc_util = (c->tot_sc_time << LAVD_SHIFT) / c->duration_total;
251268

252269
if (c->nr_sched == 0) {
253270
/*
@@ -272,7 +289,8 @@ static void calc_sys_stat(struct sys_stat_ctx *c)
272289
/*
273290
* Update the CPU utilization to the next version.
274291
*/
275-
sys_stat.util = calc_avg(sys_stat.util, c->cur_util);
292+
sys_stat.avg_util = calc_avg(sys_stat.avg_util, c->cur_util);
293+
sys_stat.avg_sc_util = calc_avg(sys_stat.avg_sc_util, c->cur_sc_util);
276294
sys_stat.max_lat_cri = calc_avg32(sys_stat.max_lat_cri, c->max_lat_cri);
277295
sys_stat.avg_lat_cri = calc_avg32(sys_stat.avg_lat_cri, c->avg_lat_cri);
278296
sys_stat.thr_lat_cri = sys_stat.max_lat_cri - ((sys_stat.max_lat_cri -

scheds/rust/scx_lavd/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,7 @@ impl<'a> Scheduler<'a> {
733733
thr_perf_cri: tx.thr_perf_cri,
734734
cpuperf_cur: tx.cpuperf_cur,
735735
cpu_util: tx.cpu_util,
736+
cpu_sutil: tx.cpu_sutil,
736737
nr_active: tx.nr_active,
737738
}) {
738739
Ok(()) | Err(TrySendError::Full(_)) => 0,

scheds/rust/scx_lavd/src/stats.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,10 @@ pub struct SchedSample {
157157
pub thr_perf_cri: u32,
158158
#[stat(desc = "Target performance level of this CPU")]
159159
pub cpuperf_cur: u32,
160-
#[stat(desc = "CPU utilization of this particular CPU")]
160+
#[stat(desc = "CPU utilization of this CPU")]
161161
pub cpu_util: u64,
162+
#[stat(desc = "Scaled CPU utilization of this CPU")]
163+
pub cpu_sutil: u64,
162164
#[stat(desc = "Number of active CPUs when core compaction is enabled")]
163165
pub nr_active: u32,
164166
}
@@ -167,7 +169,7 @@ impl SchedSample {
167169
pub fn format_header<W: Write>(w: &mut W) -> Result<()> {
168170
writeln!(
169171
w,
170-
"\x1b[93m| {:6} | {:7} | {:17} | {:5} | {:4} | {:8} | {:8} | {:7} | {:8} | {:7} | {:9} | {:9} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:6} |\x1b[0m",
172+
"\x1b[93m| {:6} | {:7} | {:17} | {:5} | {:4} | {:8} | {:8} | {:7} | {:8} | {:7} | {:9} | {:9} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:6} |\x1b[0m",
171173
"MSEQ",
172174
"PID",
173175
"COMM",
@@ -186,6 +188,7 @@ impl SchedSample {
186188
"THR_PC",
187189
"CPUFREQ",
188190
"CPU_UTIL",
191+
"CPU_SUTIL",
189192
"NR_ACT",
190193
)?;
191194
Ok(())
@@ -198,7 +201,7 @@ impl SchedSample {
198201

199202
writeln!(
200203
w,
201-
"| {:6} | {:7} | {:17} | {:5} | {:4} | {:8} | {:8} | {:7} | {:8} | {:7} | {:9} | {:9} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:6} |",
204+
"| {:6} | {:7} | {:17} | {:5} | {:4} | {:8} | {:8} | {:7} | {:8} | {:7} | {:9} | {:9} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:6} |",
202205
self.mseq,
203206
self.pid,
204207
self.comm,
@@ -217,6 +220,7 @@ impl SchedSample {
217220
self.thr_perf_cri,
218221
self.cpuperf_cur,
219222
self.cpu_util,
223+
self.cpu_sutil,
220224
self.nr_active,
221225
)?;
222226
Ok(())

0 commit comments

Comments
 (0)