From 63a5241bfa6423d3e8ea907620c23a95cadbc4fb Mon Sep 17 00:00:00 2001 From: Yusheng Zheng Date: Sun, 20 Oct 2024 07:19:08 +0000 Subject: [PATCH] add 45 --- .github/workflows/test-libbpf.yml | 4 + README.md | 1 + src/44-scx-simple/.gitignore | 1 + src/44-scx-simple/README.md | 69 +- src/44-scx-simple/README.zh.md | 431 ++++++++ src/45-scx-nest/.config | 2 + src/45-scx-nest/.gitignore | 3 + src/45-scx-nest/Makefile | 144 +++ src/45-scx-nest/README.md | 927 ++++++++++++++++++ src/45-scx-nest/README.zh.md | 903 +++++++++++++++++ .../include/bpf-compat/gnu/stubs.h | 11 + src/45-scx-nest/include/scx/common.bpf.h | 427 ++++++++ src/45-scx-nest/include/scx/common.h | 75 ++ src/45-scx-nest/include/scx/compat.bpf.h | 47 + src/45-scx-nest/include/scx/compat.h | 186 ++++ src/45-scx-nest/include/scx/user_exit_info.h | 115 +++ src/45-scx-nest/scx_nest.bpf.c | 654 ++++++++++++ src/45-scx-nest/scx_nest.c | 236 +++++ src/45-scx-nest/scx_nest.h | 18 + src/45-scx-nest/scx_nest_stats_table.h | 20 + src/SUMMARY.md | 1 + src/SUMMARY.zh.md | 4 +- src/scripts/generate_toc.py | 6 +- 23 files changed, 4255 insertions(+), 30 deletions(-) create mode 100644 src/45-scx-nest/.config create mode 100644 src/45-scx-nest/.gitignore create mode 100644 src/45-scx-nest/Makefile create mode 100644 src/45-scx-nest/README.md create mode 100644 src/45-scx-nest/README.zh.md create mode 100644 src/45-scx-nest/include/bpf-compat/gnu/stubs.h create mode 100644 src/45-scx-nest/include/scx/common.bpf.h create mode 100644 src/45-scx-nest/include/scx/common.h create mode 100644 src/45-scx-nest/include/scx/compat.bpf.h create mode 100644 src/45-scx-nest/include/scx/compat.h create mode 100644 src/45-scx-nest/include/scx/user_exit_info.h create mode 100644 src/45-scx-nest/scx_nest.bpf.c create mode 100644 src/45-scx-nest/scx_nest.c create mode 100644 src/45-scx-nest/scx_nest.h create mode 100644 src/45-scx-nest/scx_nest_stats_table.h diff --git a/.github/workflows/test-libbpf.yml b/.github/workflows/test-libbpf.yml index 0c738df1..e567b3e6 100644 --- a/.github/workflows/test-libbpf.yml +++ b/.github/workflows/test-libbpf.yml @@ -81,3 +81,7 @@ jobs: - name: test 43 kfuncs run: | make -C src/43-kfuncs + + - name: test 44 + run: | + make -C src/43-kfuncs diff --git a/README.md b/README.md index 6ba23982..2fa13152 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,7 @@ Security: Scheduler: - [lesson 44-scx-simple](src/44-scx-simple/README.md) Introduction to the BPF Scheduler +- [lesson 45-scx-nest](src/45-scx-nest/README.md) Implementing the `scx_nest` Scheduler Other: diff --git a/src/44-scx-simple/.gitignore b/src/44-scx-simple/.gitignore index 4b4489c2..683466d8 100644 --- a/src/44-scx-simple/.gitignore +++ b/src/44-scx-simple/.gitignore @@ -1 +1,2 @@ scx_simple +.output diff --git a/src/44-scx-simple/README.md b/src/44-scx-simple/README.md index 4147a9cc..64624b3e 100644 --- a/src/44-scx-simple/README.md +++ b/src/44-scx-simple/README.md @@ -23,18 +23,11 @@ The **scx_simple** scheduler is a straightforward example of a sched_ext schedul 1. **Global Weighted Virtual Time (vtime) Mode:** Prioritizes tasks based on their virtual time, allowing for fair scheduling across different workloads. 2. **FIFO (First-In-First-Out) Mode:** Simple queue-based scheduling where tasks are executed in the order they arrive. -### Use Case and Suitability - scx_simple is particularly effective on single-socket CPUs with a uniform L3 cache topology. 
While the global FIFO mode can handle many workloads efficiently, it's essential to note that saturating threads might overshadow less active ones. Therefore, scx_simple is best suited for environments where a straightforward scheduling policy meets the performance and fairness requirements. -### Production Readiness - -While scx_simple is minimalistic, it can be deployed in production settings under the right conditions: - -- **Hardware Constraints:** Best suited for systems with single-socket CPUs and uniform cache architectures. -- **Workload Characteristics:** Ideal for workloads that don't require intricate scheduling policies and can benefit from simple FIFO or weighted vtime scheduling. +While scx_simple is minimalistic, it can be deployed in production settings under the right conditions. It is best suited for systems with single-socket CPUs and uniform cache architectures. Additionally, it is ideal for workloads that don't require intricate scheduling policies and can benefit from simple FIFO or weighted vtime scheduling. -## Diving into the Code: Kernel and User-Space Analysis +## Into the Code: Kernel and User-Space Analysis Let's explore how scx_simple is implemented both in the kernel and user-space. We'll start by presenting the complete code snippets and then break down their functionalities. @@ -280,6 +273,8 @@ restart: } ``` +The complete code can be found in + #### User-Space Breakdown The user-space component is responsible for interacting with the BPF scheduler, managing its lifecycle, and monitoring its performance. Here's a snapshot of its responsibilities: @@ -315,22 +310,43 @@ Virtual time is a mechanism to ensure fairness in scheduling by tracking how muc ### Scheduling Cycle -Understanding the scheduling cycle is crucial for modifying or extending scx_simple: - -1. **Task Wakeup:** - - `ops.select_cpu()` is invoked to select an optimal CPU for the waking task. - - If the selected CPU is idle, the task is dispatched immediately to the local DSQ. - -2. **Task Enqueueing:** - - `ops.enqueue()` decides whether to dispatch the task to the global DSQ, a local DSQ, or a custom DSQ based on the scheduling mode. - -3. **Task Dispatching:** - - When a CPU is ready to schedule, it first checks its local DSQ, then the global DSQ, and finally invokes `ops.dispatch()` if needed. - -4. **Task Execution:** - - The CPU executes the selected task, updating its virtual time and ensuring fair scheduling. - -This cycle ensures that tasks are scheduled efficiently while maintaining fairness and responsiveness. +Understanding the scheduling cycle is crucial for modifying or extending scx_simple. The following steps detail how a waking task is scheduled and executed: + +1. **Task Wakeup and CPU Selection:** + - When a task wakes up, the first operation invoked is `ops.select_cpu()`.This function serves two purposes: + - **CPU Selection Optimization Hint:** Provides a suggested CPU for the task to run on. While this is an optimization hint and not binding, matching the CPU the task eventually runs on can yield performance gains. + - **Waking Up Idle CPUs:** If the selected CPU is idle, `ops.select_cpu()` can wake it up, preparing it to execute tasks. + - Note: The scheduler core will ignore invalid CPU selections, such as CPUs outside the allowed CPU mask of the task. + +2. **Immediate Dispatch from `ops.select_cpu()`:** + - A task can be immediately dispatched to a Dispatch Queue (DSQ) directly from `ops.select_cpu()` by calling `scx_bpf_dispatch()`. 
+ - If dispatched to `SCX_DSQ_LOCAL`, the task will be placed in the local DSQ of the CPU returned by `ops.select_cpu()`. + - Dispatching directly from `ops.select_cpu()` causes the `ops.enqueue()` callback to be skipped, potentially reducing scheduling latency. + +3. **Task Enqueueing (`ops.enqueue()`):** + - If the task was not dispatched in the previous step, `ops.enqueue()` is invoked. + - `ops.enqueue()` can make several decisions: + - **Immediate Dispatch:** Dispatch the task to either the global DSQ (`SCX_DSQ_GLOBAL`), a local DSQ (`SCX_DSQ_LOCAL`), or a custom DSQ by calling `scx_bpf_dispatch()`. + - **Queue on BPF Side:** Queue the task within the BPF program for custom scheduling logic. + +4. **CPU Scheduling Readiness:** + - When a CPU is ready to schedule, it follows this order: + - **Local DSQ Check:** The CPU first checks its local DSQ for tasks. + - **Global DSQ Check:** If the local DSQ is empty, it checks the global DSQ. + - **Invoke `ops.dispatch()`:** If no tasks are found, `ops.dispatch()` is invoked to populate the local DSQ. + - Within `ops.dispatch()`, the following functions can be used: + - `scx_bpf_dispatch()`: Schedules tasks to any DSQ (local, global, or custom). Note that this function currently cannot be called with BPF locks held. + - `scx_bpf_consume()`: Transfers a task from a specified non-local DSQ to the dispatching DSQ. This function cannot be called with any BPF locks held and will flush pending dispatched tasks before attempting to consume the specified DSQ. + +5. **Task Execution Decision:** + - After `ops.dispatch()` returns, if there are tasks in the local DSQ, the CPU runs the first one. + - If the local DSQ is still empty, the CPU performs the following steps: + - **Consume Global DSQ:** Attempts to consume a task from the global DSQ using `scx_bpf_consume()`. If successful, the task is executed. + - **Retry Dispatch:** If `ops.dispatch()` has dispatched any tasks, the CPU retries checking the local DSQ. + - **Execute Previous Task:** If the previous task is an SCX task and still runnable, the CPU continues executing it (see `SCX_OPS_ENQ_LAST`). + - **Idle State:** If no tasks are available, the CPU goes idle. + +This scheduling cycle ensures that tasks are scheduled efficiently while maintaining fairness and responsiveness. By understanding each step, developers can modify or extend scx_simple to implement custom scheduling behaviors that meet specific requirements. ## Compiling and Running scx_simple @@ -412,7 +428,7 @@ In this tutorial, we've introduced the **sched_ext** scheduler class and walked By mastering scx_simple, you're well-equipped to design and implement more sophisticated scheduling policies tailored to your specific requirements. Whether you're optimizing for performance, fairness, or specific workload characteristics, sched_ext and eBPF offer the flexibility and power to achieve your goals. -> Ready to take your eBPF skills to the next level? Dive deeper into our tutorials and explore more examples by visiting our [tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our [website](https://eunomia.dev/tutorials/). +> Ready to take your eBPF skills to the next level? Dive deeper into our tutorials and explore more examples by visiting our [tutorial repository https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our [website https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/). 
## References @@ -423,3 +439,4 @@ By mastering scx_simple, you're well-equipped to design and implement more sophi - **libbpf Documentation:** [https://github.com/libbpf/libbpf](https://github.com/libbpf/libbpf) Feel free to explore these resources to expand your understanding and continue your journey into advanced eBPF programming! + diff --git a/src/44-scx-simple/README.zh.md b/src/44-scx-simple/README.zh.md index e69de29b..bff51b9a 100644 --- a/src/44-scx-simple/README.zh.md +++ b/src/44-scx-simple/README.zh.md @@ -0,0 +1,431 @@ +# eBPF 教程:BPF 调度器入门 + +欢迎来到我们深入探讨 eBPF 世界的教程,本教程将重点介绍 BPF 调度器!如果你希望将 eBPF 知识扩展到基础之外,你来对地方了。在本教程中,我们将探索 **scx_simple 调度器**,这是 Linux 内核版本 `6.12` 中引入的 sched_ext 调度类的一个最小示例。我们将带你了解其架构,如何利用 BPF 程序定义调度行为,并指导你编译和运行示例。到最后,你将对如何使用 eBPF 创建和管理高级调度策略有一个坚实的理解。 + +## 理解可扩展的 BPF 调度器 + +本教程的核心是 **sched_ext** 调度类。与传统调度器不同,sched_ext 允许通过一组 BPF 程序动态定义其行为,使其高度灵活和可定制。这意味着你可以在 sched_ext 之上实现任何调度算法,量身定制以满足你的特定需求。 + +### sched_ext 的关键特性 + +- **灵活的调度算法:** 通过编写 BPF 程序实现任何调度策略。 +- **动态 CPU 分组:** BPF 调度器可以根据需要分组 CPU,无需在唤醒时将任务绑定到特定 CPU。 +- **运行时控制:** 可在不重启的情况下即时启用或禁用 BPF 调度器。 +- **系统完整性:** 即使 BPF 调度器遇到错误,系统也会优雅地回退到默认调度行为。 +- **调试支持:** 通过 `sched_ext_dump` 跟踪点和 SysRq 键序列提供全面的调试信息。 + +凭借这些特性,sched_ext 为实验和部署高级调度策略提供了坚实的基础。 + +## 介绍 scx_simple:一个最小的 sched_ext 调度器 + +**scx_simple** 调度器是 Linux 工具中 sched_ext 调度器的一个简明示例。它设计简单易懂,并为更复杂的调度策略提供了基础。scx_simple 可以在两种模式下运行: + +1. **全局加权虚拟时间 (vtime) 模式:** 根据任务的虚拟时间优先级排序,实现不同工作负载之间的公平调度。 +2. **FIFO(先进先出)模式:** 基于简单队列的调度,任务按照到达顺序执行。 + +### 用例和适用性 + +scx_simple 在具有单插槽 CPU 和统一 L3 缓存拓扑的系统上尤其有效。虽然全局 FIFO 模式可以高效处理许多工作负载,但需要注意的是,饱和线程可能会压倒较不活跃的线程。因此,scx_simple 最适合在简单的调度策略能够满足性能和公平性要求的环境中使用。 + +### 生产就绪性 + +尽管 scx_simple 功能简洁,但在合适的条件下可以部署到生产环境中: + +- **硬件约束:** 最适用于具有单插槽 CPU 和统一缓存架构的系统。 +- **工作负载特性:** 适用于不需要复杂调度策略且可以受益于简单 FIFO 或加权 vtime 调度的工作负载。 + +## 代码深入:内核和用户空间分析 + +让我们深入探讨 scx_simple 在内核和用户空间中的实现。我们将首先展示完整的代码片段,然后分解其功能。 + +### 内核端实现 + +```c +#include + +char _license[] SEC("license") = "GPL"; + +const volatile bool fifo_sched; + +static u64 vtime_now; +UEI_DEFINE(uei); + +/* + * 内置 DSQ 如 SCX_DSQ_GLOBAL 不能用作优先级队列 + * (意味着,不能用 scx_bpf_dispatch_vtime() 分派)。因此,我们 + * 创建一个 ID 为 0 的单独 DSQ 来分派和消费。如果 scx_simple + * 只支持全局 FIFO 调度,那么我们可以直接使用 SCX_DSQ_GLOBAL。 + */ +#define SHARED_DSQ 0 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc(0); /* 统计本地队列 */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + stat_inc(1); /* 统计全局队列 */ + + if (fifo_sched) { + scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + + /* + * 限制空闲任务可积累的预算量为一个切片。 + */ + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + vtime = vtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, + enq_flags); + } +} + +void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SHARED_DSQ); +} + +void BPF_STRUCT_OPS(simple_running, 
struct task_struct *p) +{ + if (fifo_sched) + return; + + /* + * 全局 vtime 随着任务开始执行而总是向前推进。测试和更新可以 + * 从多个 CPU 并发执行,因此存在竞争。如果有错误,应当被 + * 限制并且是临时的。让我们接受它。 + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* + * 按照权重和费用的倒数缩放执行时间。 + * + * 注意,默认的让出实现通过将 @p->scx.slice 设置为零来让出, + * 以下操作将会将让出的任务视为已消耗所有切片。如果这对 + * 让出任务的惩罚过大,请通过显式时间戳来确定执行时间, + * 而不是依赖于 @p->scx.slice。 + */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) +{ + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(simple_ops, + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .dispatch = (void *)simple_dispatch, + .running = (void *)simple_running, + .stopping = (void *)simple_stopping, + .enable = (void *)simple_enable, + .init = (void *)simple_init, + .exit = (void *)simple_exit, + .name = "simple"); +``` + +#### 内核端分解 + +scx_simple 的内核端实现定义了如何选择、入队、分派和管理任务。以下是高层次的概述: + +1. **初始化和许可:** + - 调度器的许可证为 GPL。 + - 全局变量 `fifo_sched` 决定调度模式(FIFO 或加权 vtime)。 + +2. **分派队列(DSQ)管理:** + - 创建一个共享的 DSQ(`SHARED_DSQ`,ID 为 0)用于任务分派。 + - 使用 `stats` 映射跟踪本地和全局队列中的任务数量。 + +3. **CPU 选择 (`simple_select_cpu`):** + - 为唤醒任务选择 CPU。 + - 如果选择的 CPU 处于空闲状态,任务将立即分派到本地 DSQ。 + +4. **任务入队 (`simple_enqueue`):** + - 根据 `fifo_sched` 标志,将任务分派到共享 DSQ 的 FIFO 模式或基于虚拟时间的优先级队列。 + - 虚拟时间 (`vtime`) 通过考虑任务执行时间和权重,确保公平调度。 + +5. **任务分派 (`simple_dispatch`):** + - 从共享 DSQ 消费任务并将其分配给 CPU。 + +6. **运行和停止任务 (`simple_running` & `simple_stopping`):** + - 管理任务的虚拟时间进度,确保调度决策的公平和平衡。 + +7. 
**启用和退出:** + - 处理调度器的启用,并记录退出信息以便调试。 + +这种模块化结构使得 scx_simple 既简单又有效,提供了一个清晰的示例,展示如何使用 eBPF 实现自定义调度策略。 + +### 用户空间实现 + +```c +static void read_stats(struct scx_simple *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_simple *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(simple_ops, scx_simple); + + while ((opt = getopt(argc, argv, "fvh")) != -1) { + switch (opt) { + case 'f': + skel->rodata->fifo_sched = true; + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); + link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu global=%llu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_simple__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} +``` + +#### 用户空间分解 + +用户空间组件负责与 BPF 调度器交互,管理其生命周期,并监控其性能。`read_stats` 函数通过读取 BPF 映射中的本地和全局队列任务数量来收集统计数据,并跨所有 CPU 聚合这些统计数据以进行报告。 + +在 `main` 函数中,程序初始化 libbpf,处理信号中断,并打开 scx_simple BPF 骨架。它处理命令行选项以切换 FIFO 调度和详细模式,加载 BPF 程序,并将其附加到调度器。监控循环每秒连续读取并打印调度统计数据,提供调度器行为的实时洞察。终止时,程序通过销毁 BPF 链接并根据退出代码处理潜在的重启来清理资源。 + +这个用户空间程序提供了一个简洁的接口,用于监控和控制 scx_simple 调度器,使得更容易实时理解其行为。 + +## 关键概念深入 + +为了充分理解 scx_simple 的运行机制,让我们探讨一些基础概念和机制: + +### 分派队列(DSQs) + +DSQs 是 sched_ext 运行的核心,充当任务在被分派到 CPU 之前的缓冲区。它们可以根据虚拟时间作为 FIFO 队列或优先级队列运行。 + +- **本地 DSQs (`SCX_DSQ_LOCAL`):** 每个 CPU 都有自己的本地 DSQ,确保任务可以高效地分派和消费,而不会发生争用。 +- **全局 DSQ (`SCX_DSQ_GLOBAL`):** 一个共享队列,来自所有 CPU 的任务可以被排队,当本地队列为空时提供回退。 +- **自定义 DSQs:** 开发者可以使用 `scx_bpf_create_dsq()` 创建额外的 DSQs,以满足更专业的调度需求。 + +### 虚拟时间(vtime) + +虚拟时间是一种确保调度公平性的机制,通过跟踪任务相对于其权重消耗了多少时间来实现。在 scx_simple 的加权 vtime 模式下,权重较高的任务消耗虚拟时间的速度较慢,允许权重较低的任务更频繁地运行。这种方法基于预定义的权重平衡任务执行,确保没有单个任务垄断 CPU 资源。 + +### 调度周期 + +理解调度周期对于修改或扩展 scx_simple 至关重要。以下步骤详细说明了唤醒任务的调度和执行过程: + +1. **任务唤醒和 CPU 选择:** + - 当一个任务被唤醒时,首先调用 `ops.select_cpu()`。 + - 该函数有两个目的: + - **CPU 选择优化提示:** 提供建议的 CPU 供任务运行。虽然这是一个优化提示而非绑定,但如果 `ops.select_cpu()` 返回的 CPU 与任务最终运行的 CPU 匹配,可以带来性能提升。 + - **唤醒空闲 CPU:** 如果选择的 CPU 处于空闲状态,`ops.select_cpu()` 可以唤醒它,为执行任务做好准备。 + - 注意:如果 CPU 选择无效(例如,超出任务允许的 CPU 掩码),调度器核心将忽略该选择。 + +2. **从 `ops.select_cpu()` 立即分派:** + - 任务可以通过调用 `scx_bpf_dispatch()` 直接从 `ops.select_cpu()` 分派到分派队列(DSQ)。 + - 如果分派到 `SCX_DSQ_LOCAL`,任务将被放入 `ops.select_cpu()` 返回的 CPU 的本地 DSQ。 + - 直接从 `ops.select_cpu()` 分派将导致跳过 `ops.enqueue()` 回调,可能减少调度延迟。 + +3. **任务入队 (`ops.enqueue()`):** + - 如果任务未在上一步被分派,`ops.enqueue()` 将被调用。 + - `ops.enqueue()` 可以做出以下几种决定: + - **立即分派:** 通过调用 `scx_bpf_dispatch()` 将任务分派到全局 DSQ(`SCX_DSQ_GLOBAL`)、本地 DSQ(`SCX_DSQ_LOCAL`)或自定义 DSQ。 + - **在 BPF 端排队:** 在 BPF 程序中排队任务,以便进行自定义调度逻辑。 + +4. 
**CPU 调度准备:** + - 当 CPU 准备好调度时,它按照以下顺序进行: + - **检查本地 DSQ:** CPU 首先检查其本地 DSQ 是否有任务。 + - **检查全局 DSQ:** 如果本地 DSQ 为空,则检查全局 DSQ。 + - **调用 `ops.dispatch()`:** 如果仍然没有找到任务,调用 `ops.dispatch()` 来填充本地 DSQ。 + - 在 `ops.dispatch()` 内,可以使用以下函数: + - `scx_bpf_dispatch()`:将任务调度到任何 DSQ(本地、全局或自定义)。注意,该函数目前不能在持有 BPF 锁时调用。 + - `scx_bpf_consume()`:将任务从指定的非本地 DSQ 转移到分派 DSQ。该函数不能在持有任何 BPF 锁时调用,并且会在尝试消费指定 DSQ 之前刷新待分派的任务。 + +5. **任务执行决策:** + - `ops.dispatch()` 返回后,如果本地 DSQ 中有任务,CPU 将运行第一个任务。 + - 如果本地 DSQ 仍为空,CPU 将执行以下步骤: + - **消费全局 DSQ:** 尝试使用 `scx_bpf_consume()` 从全局 DSQ 消费任务。如果成功,执行该任务。 + - **重试分派:** 如果 `ops.dispatch()` 已经分派了任何任务,CPU 将重试检查本地 DSQ。 + - **执行前一个任务:** 如果前一个任务是 SCX 任务且仍然可运行,CPU 将继续执行它(参见 `SCX_OPS_ENQ_LAST`)。 + - **进入空闲状态:** 如果没有可用任务,CPU 将进入空闲状态。 + +这种调度周期确保任务高效调度,同时保持公平性和响应性。通过理解每一步,开发者可以修改或扩展 scx_simple,以实现满足特定需求的自定义调度行为。 + +## 编译和运行 scx_simple + +要运行 scx_simple,需要设置必要的工具链并正确配置内核。以下是编译和执行示例调度器的方法。 + +### 工具链依赖 + +在编译 scx_simple 之前,请确保已安装以下工具: + +1. **clang >= 16.0.0** + 编译 BPF 程序所需。虽然 GCC 正在开发 BPF 支持,但它缺乏某些必要功能,如 BTF 类型标签。 + +2. **pahole >= 1.25** + 用于从 DWARF 生成 BTF,对于 BPF 程序中的类型信息至关重要。 + +3. **rust >= 1.70.0** + 如果你正在使用基于 Rust 的调度器,请确保拥有适当的 Rust 工具链版本。 + +此外,还需要 `make` 等工具来构建示例。 + +### 内核配置 + +要启用和使用 sched_ext,请确保设置了以下内核配置选项: + +```plaintext +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y +``` + +这些配置启用了 BPF 调度所需的功能,并确保 sched_ext 正常运行。 + +### 构建 scx_simple + +导航到内核的 `tools/sched_ext/` 目录并运行: + +```bash +make +``` + +此命令将编译 scx_simple 调度器及其依赖项。 + +### 运行 scx_simple + +编译完成后,可以执行用户空间程序来加载和监控调度器: + +```bash +./scx_simple -f +``` + +`-f` 标志启用 FIFO 调度模式。你还可以使用 `-v` 进行详细输出,或使用 `-h` 获取帮助。当程序运行时,它将每秒显示本地和全局队列中的任务数量: + +```plaintext +local=123 global=456 +local=124 global=457 +... 
+``` + +### 在 sched_ext 和 CFS 之间切换 + +sched_ext 与默认的完全公平调度器(CFS)并行运行。你可以通过加载或卸载 scx_simple 程序动态切换 sched_ext 和 CFS。 + +- **启用 sched_ext:** 使用 scx_simple 加载 BPF 调度器。 +- **禁用 sched_ext:** 终止 scx_simple 程序,将所有任务恢复到 CFS。 + +此外,使用 SysRq 键序列如 `SysRq-S` 可以帮助管理调度器的状态,并使用 `SysRq-D` 触发调试转储。 + +## 总结与下一步 + +在本教程中,我们介绍了 **sched_ext** 调度类,并通过一个最小示例 **scx_simple** 展示了如何使用 eBPF 程序定义自定义调度行为。我们涵盖了架构、关键概念如 DSQs 和虚拟时间,并提供了编译和运行调度器的分步说明。 + +掌握 scx_simple 后,你将具备设计和实现更复杂调度策略的能力,以满足特定需求。无论你是优化性能、公平性,还是针对特定工作负载特性,sched_ext 和 eBPF 都提供了实现目标所需的灵活性和强大功能。 + +> 准备好将你的 eBPF 技能提升到新的水平了吗?深入探索我们的教程并通过访问我们的 [教程仓库 https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或 [网站 https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/) 探索更多示例。 + +## 参考资料 + +- **sched_ext 仓库:** [https://github.com/sched-ext/scx](https://github.com/sched-ext/scx) +- **Linux 内核文档:** [Scheduler Ext Documentation](https://www.kernel.org/doc/html/next/scheduler/sched-ext.html) +- **内核源代码树:** [Linux Kernel sched_ext Tools](https://github.com/torvalds/linux/tree/master/tools/sched_ext) +- **eBPF 官方文档:** [https://ebpf.io/docs/](https://ebpf.io/docs/) +- **libbpf 文档:** [https://github.com/libbpf/libbpf](https://github.com/libbpf/libbpf) + diff --git a/src/45-scx-nest/.config b/src/45-scx-nest/.config new file mode 100644 index 00000000..29189131 --- /dev/null +++ b/src/45-scx-nest/.config @@ -0,0 +1,2 @@ +level=Depth +type=Scheduler diff --git a/src/45-scx-nest/.gitignore b/src/45-scx-nest/.gitignore new file mode 100644 index 00000000..fafeaf8a --- /dev/null +++ b/src/45-scx-nest/.gitignore @@ -0,0 +1,3 @@ +scx_simple +scx_nest +.output \ No newline at end of file diff --git a/src/45-scx-nest/Makefile b/src/45-scx-nest/Makefile new file mode 100644 index 00000000..4b4f5744 --- /dev/null +++ b/src/45-scx-nest/Makefile @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang +LIBBPF_SRC := $(abspath ../third_party/libbpf/src) +BPFTOOL_SRC := $(abspath ../third_party/bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool +LIBBLAZESYM_SRC := $(abspath ../third_party/blazesym/) +LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a) +LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h) +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../third_party/vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../third_party/libbpf/include/uapi -Iinclude/ -I$(dir $(VMLINUX)) +CFLAGS := -g -Wall +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = scx_nest + +CARGO ?= $(shell which cargo) +ifeq ($(strip $(CARGO)),) +BZS_APPS := +else +BZS_APPS := +APPS += $(BZS_APPS) +# Required by libblazesym +ALL_LDFLAGS += -lrt -ldl -lpthread -lm +endif + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. 
+# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' \ + "$(1)" \ + "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \ + "$(if $(3), $(3))"; + MAKEFLAGS += --no-print-directory +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +.PHONY: all +all: $(APPS) + +.PHONY: clean +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) $(APPS) + +$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +# Build libbpf +$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf + $(call msg,LIB,$@) + $(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \ + OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \ + INCLUDEDIR= LIBDIR= UAPIDIR= \ + install + +# Build bpftool +$(BPFTOOL): | $(BPFTOOL_OUTPUT) + $(call msg,BPFTOOL,$@) + $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap + + +$(LIBBLAZESYM_SRC)/target/release/libblazesym.a:: + $(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release + +$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT) + $(call msg,LIB, $@) + $(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@ + +$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT) + $(call msg,LIB,$@) + $(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@ + +# Build BPF code +$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL) + $(call msg,BPF,$@) + $(Q)$(CLANG) -mlittle-endian -g -O2 -mcpu=v3 -target bpf -D__TARGET_ARCH_$(ARCH) \ + $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \ + -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + $(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + +# Generate BPF skeletons +$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< name $(APPS) > $@ + +# Build user-space code +$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h + +$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) + $(call msg,CC,$@) + $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + +$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER) + +$(BZS_APPS): $(LIBBLAZESYM_OBJ) + +# Build application binary +$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@ + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/src/45-scx-nest/README.md b/src/45-scx-nest/README.md new file mode 100644 index 00000000..7f6c11e7 --- /dev/null +++ b/src/45-scx-nest/README.md @@ -0,0 +1,927 @@ +# eBPF Tutorial by Example: Implementing the `scx_nest` Scheduler + +In the ever-evolving landscape of system performance optimization, the ability to customize and extend kernel behavior is invaluable. 
One of the most powerful tools for achieving this is eBPF (extended Berkeley Packet Filter). In this tutorial, we'll explore the implementation of the `scx_nest` scheduler, an advanced eBPF program that leverages the `sched_ext` scheduler class introduced in Linux kernel version `6.12`. By the end of this guide, you'll understand how to build a sophisticated scheduler that dynamically adjusts task placement based on CPU core frequencies and utilization. + +## Introduction to `sched_ext` + +The `sched_ext` scheduler class marks a significant advancement in Linux kernel scheduling capabilities. Unlike traditional schedulers, `sched_ext` allows its behavior to be defined dynamically through a set of BPF (Berkeley Packet Filter) programs. This flexibility enables developers to implement custom scheduling algorithms tailored to specific workloads and system requirements. + +## Understanding the `scx_nest` Scheduler + +### Overview + +The `scx_nest` scheduler is inspired by the Inria Paris paper titled "[OS Scheduling with Nest: Keeping Tasks Close Together on Warm Cores](https://hal.inria.fr/hal-03612592/file/paper.pdf)." Developed by Meta Platforms, Inc., `scx_nest` focuses on encouraging task placement on CPU cores that are likely to run at higher frequencies based on recent usage patterns. This approach aims to optimize performance by ensuring that tasks execute on the most efficient cores available. + +The scheduler operates as a global weighted virtual time (vtime) scheduler, similar to the Completely Fair Scheduler (CFS), while utilizing the Nest algorithm to select idle cores during task wakeup. This dual strategy ensures that tasks are not only fairly distributed but also placed on cores that can execute them most effectively. + +`scx_nest` is designed to optimize workloads with relatively low CPU utilization that can benefit from running on a subset of cores. By concentrating tasks on fewer cores, the scheduler helps maintain high frequencies on those cores, enhancing performance. However, for workloads that perform better when distributed across many cores to avoid cache thrashing, `scx_nest` may not be the ideal choice. Evaluating the suitability of `scx_nest` for a specific workload often requires experimentation. + +Given its design, `scx_nest` is suitable for production environments, provided the hardware constraints are met. It performs optimally on single CCX (Core Complex) or single-socket hosts with a uniform L3 cache topology. While preemption is not implemented in the current version, the shared scheduling queue across all CPUs ensures that tasks at the front of the queue are executed promptly, provided there are enough CPUs available. + +## High-Level Code Analysis + +The `scx_nest` scheduler's implementation is intricate, involving various data structures, maps, and functions that work in harmony to manage task placement and CPU core utilization. The complete source code is available in the [eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) repository. Below, we'll dissect the core components of the scheduler, explaining each part in detail. + +### Core Data Structures and Maps + +#### Task Context (`task_ctx`) + +Each task in the system has an associated context that maintains scheduling-related information. This context is crucial for making informed scheduling decisions based on the task's history and current state. 
+ +```c +/* Per-task scheduling context */ +struct task_ctx { + /* + * A temporary cpumask for calculating a task's primary and reserve + * mask. + */ + struct bpf_cpumask __kptr *tmp_mask; + + /* + * The number of times that a task observes that its previous core is + * not idle. If this occurs r_impatient times in a row, a core is + * attempted to be retrieved from either the reserve nest, or the + * fallback nest. + */ + u32 prev_misses; + + /* + * A core that the task is "attached" to, meaning the last core that it + * executed on at least twice in a row, and the core that it first + * tries to migrate to on wakeup. The task only migrates to the + * attached core if it is idle and in the primary nest. + */ + s32 attached_core; + + /* + * The last core that the task executed on. This is used to determine + * if the task should attach to the core that it will execute on next. + */ + s32 prev_cpu; +}; +``` + +The `task_ctx` structure holds a temporary CPU mask (`tmp_mask`) used for calculating the task's primary and reserve CPU sets. The `prev_misses` counter tracks how often the task's preferred core was not idle, influencing decisions to migrate the task to different cores. The `attached_core` indicates the core the task is currently bound to, ensuring it runs on a high-frequency core when possible. Lastly, `prev_cpu` records the last core the task executed on, aiding in maintaining task-core affinity. + +#### Per-CPU Context (`pcpu_ctx`) + +Each CPU has an associated context that manages timers and compaction state. This context helps in determining when a core should be demoted from the primary nest due to inactivity. + +```c +struct pcpu_ctx { + /* The timer used to compact the core from the primary nest. */ + struct bpf_timer timer; + + /* Whether the current core has been scheduled for compaction. */ + bool scheduled_compaction; +}; +``` + +The `pcpu_ctx` structure contains a `bpf_timer` used to schedule compaction events and a boolean flag `scheduled_compaction` indicating whether a compaction has been scheduled for the core. + +#### Maps + +Several BPF maps are utilized to store contexts and manage timers: + +```c +/* Task storage map */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Per-CPU contexts */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1024); + __type(key, s32); + __type(value, struct pcpu_ctx); +} pcpu_ctxs SEC(".maps"); + +/* Statistics timer */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct stats_timer); +} stats_timer SEC(".maps"); +``` + +- **`task_ctx_stor`:** This map stores the scheduling context for each task, enabling the scheduler to access and modify task-specific information. +- **`pcpu_ctxs`:** An array map that holds the per-CPU contexts, allowing the scheduler to manage timers and compaction states for each CPU. +- **`stats_timer`:** A single-entry array map used to manage a central timer for collecting scheduling statistics. + +Additionally, the scheduler maintains masks for primary, reserved, other, and idle CPUs, as well as a statistics map to track various scheduler metrics. 
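+
+The functions below also reference several globals that sit alongside these maps: the `primary_cpumask` and `reserve_cpumask` nest masks, the `nr_reserved` counter, user-tunable parameters such as `slice_ns`, `r_max`, `r_impatient`, `p_remove_ns`, and `sampling_cadence_ns`, and the `stats` array behind `stat_inc()`. As a rough, hedged sketch of what such declarations look like (the shapes and sizes here are illustrative placeholders; the authoritative definitions are in `scx_nest.bpf.c`, which also places the cpumask kptrs through helper macros from `common.bpf.h`):
+
+```c
+/* Illustrative sketch only -- see scx_nest.bpf.c for the real declarations. */
+
+/* Tunables filled in by user space before the scheduler is loaded. */
+const volatile u64 slice_ns;            /* default time slice */
+const volatile u64 p_remove_ns;         /* how long an unused core may linger in the primary nest */
+const volatile u64 sampling_cadence_ns; /* period of the stats timer */
+const volatile u32 r_max;               /* maximum cores kept in the reserve nest */
+const volatile u32 r_impatient;         /* misses tolerated before a task goes "impatient" */
+
+/* Shared nest state referenced by the callbacks below. */
+struct bpf_cpumask __kptr *primary_cpumask; /* warm, high-frequency cores */
+struct bpf_cpumask __kptr *reserve_cpumask; /* spill-over cores kept warm */
+static u32 nr_reserved;                     /* current size of the reserve nest */
+static u64 vtime_now;                       /* global virtual time */
+
+/* Per-CPU counters behind stat_inc(); one u64 slot per NEST_STAT(...) event. */
+struct {
+    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+    __uint(key_size, sizeof(u32));
+    __uint(value_size, sizeof(u64));
+    __uint(max_entries, 64); /* placeholder size */
+} stats SEC(".maps");
+```
+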
+ +### Core Functions + +#### `stat_inc` + +A helper function to increment scheduler statistics: + +```c +static __always_inline void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} +``` + +This function looks up a counter in the `stats` map and increments it if the counter exists. It's used throughout the scheduler to track various events and states. + +#### `vtime_before` + +A utility function to compare virtual times: + +```c +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} +``` + +This function determines if virtual time `a` is before `b`, facilitating time-based scheduling decisions. + +#### `try_make_core_reserved` + +Attempts to promote a core to the reserved nest: + +```c +static __always_inline void +try_make_core_reserved(s32 cpu, struct bpf_cpumask * reserved, bool promotion) +{ + s32 tmp_nr_reserved; + + /* + * This check is racy, but that's OK. If we incorrectly fail to promote + * a core to reserve, it's because another context added or removed a + * core from reserved in this small window. It will balance out over + * subsequent wakeups. + */ + tmp_nr_reserved = nr_reserved; + if (tmp_nr_reserved < r_max) { + /* + * It's possible that we could exceed r_max for a time here, + * but that should balance out as more cores are either demoted + * or fail to be promoted into the reserve nest. + */ + __sync_fetch_and_add(&nr_reserved, 1); + bpf_cpumask_set_cpu(cpu, reserved); + if (promotion) + stat_inc(NEST_STAT(PROMOTED_TO_RESERVED)); + else + stat_inc(NEST_STAT(DEMOTED_TO_RESERVED)); + } else { + bpf_cpumask_clear_cpu(cpu, reserved); + stat_inc(NEST_STAT(RESERVED_AT_CAPACITY)); + } +} +``` + +The `try_make_core_reserved` function attempts to add a CPU core to the reserved mask. It first checks if the number of reserved cores (`nr_reserved`) is below the maximum allowed (`r_max`). If so, it increments the `nr_reserved` counter and adds the core to the reserved mask. Depending on whether the core is being promoted or demoted, it increments the corresponding statistic. If the reserved capacity is full, it clears the core from the reserved mask and updates the relevant statistic. + +#### `update_attached` + +Updates the task's attached core based on recent execution: + +```c +static void update_attached(struct task_ctx *tctx, s32 prev_cpu, s32 new_cpu) +{ + if (tctx->prev_cpu == new_cpu) + tctx->attached_core = new_cpu; + tctx->prev_cpu = prev_cpu; +} +``` + +This function updates the `attached_core` for a task. If the task has executed on the same core consecutively, it attaches the task to that core. It then updates the `prev_cpu` to reflect the latest core the task ran on. + +#### `compact_primary_core` + +Handles the compaction of a primary core by demoting it to the reserve nest: + +```c +static int compact_primary_core(void *map, int *key, struct bpf_timer *timer) +{ + struct bpf_cpumask *primary, *reserve; + s32 cpu = bpf_get_smp_processor_id(); + struct pcpu_ctx *pcpu_ctx; + + stat_inc(NEST_STAT(CALLBACK_COMPACTED)); + + /* + * If we made it to this callback, it means that the timer callback was + * never cancelled, and so the core needs to be demoted from the + * primary nest. 
+ */ + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); + if (!pcpu_ctx) { + scx_bpf_error("Couldn't lookup pcpu ctx"); + return 0; + } + bpf_rcu_read_lock(); + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + scx_bpf_error("Couldn't find primary or reserve"); + bpf_rcu_read_unlock(); + return 0; + } + + bpf_cpumask_clear_cpu(cpu, primary); + try_make_core_reserved(cpu, reserve, false); + bpf_rcu_read_unlock(); + pcpu_ctx->scheduled_compaction = false; + return 0; +} +``` + +When the compaction timer expires, `compact_primary_core` is invoked. It demotes the current CPU core from the primary nest to the reserve nest by clearing it from the primary mask and attempting to add it to the reserve mask using `try_make_core_reserved`. This ensures that inactive cores are efficiently managed, maintaining a balance between performance and resource utilization. + +#### `nest_select_cpu` + +Determines the appropriate CPU for a task upon waking up: + +```c +s32 BPF_STRUCT_OPS(nest_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct bpf_cpumask *p_mask, *primary, *reserve; + s32 cpu; + struct task_ctx *tctx; + struct pcpu_ctx *pcpu_ctx; + bool direct_to_primary = false, reset_impatient = true; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) + return -ENOENT; + + bpf_rcu_read_lock(); + p_mask = tctx->tmp_mask; + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!p_mask || !primary || !reserve) { + bpf_rcu_read_unlock(); + return -ENOENT; + } + + tctx->prev_cpu = prev_cpu; + + bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary)); + + /* First try to wake the task on its attached core. */ + if (bpf_cpumask_test_cpu(tctx->attached_core, cast_mask(p_mask)) && + scx_bpf_test_and_clear_cpu_idle(tctx->attached_core)) { + cpu = tctx->attached_core; + stat_inc(NEST_STAT(WAKEUP_ATTACHED)); + goto migrate_primary; + } + + /* + * Try to stay on the previous core if it's in the primary set, and + * there's no hypertwin. If the previous core is the core the task is + * attached to, don't bother as we already just tried that above. + */ + if (prev_cpu != tctx->attached_core && + bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + stat_inc(NEST_STAT(WAKEUP_PREV_PRIMARY)); + goto migrate_primary; + } + + if (find_fully_idle) { + /* Then try any fully idle core in primary. */ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_PRIMARY)); + goto migrate_primary; + } + } + + /* Then try _any_ idle core in primary, even if its hypertwin is active. */ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_PRIMARY)); + goto migrate_primary; + } + + if (r_impatient > 0 && ++tctx->prev_misses >= r_impatient) { + direct_to_primary = true; + tctx->prev_misses = 0; + stat_inc(NEST_STAT(TASK_IMPATIENT)); + } + + reset_impatient = false; + + /* Then try any fully idle core in reserve. */ + bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(reserve)); + if (find_fully_idle) { + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_RESERVE)); + goto promote_to_primary; + } + } + + /* Then try _any_ idle core in reserve, even if its hypertwin is active. 
*/ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_RESERVE)); + goto promote_to_primary; + } + + /* Then try _any_ idle core in the task's cpumask. */ + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) { + /* + * We found a core that (we didn't _think_) is in any nest. + * This means that we need to either promote the core to the + * reserve nest, or if we're going direct to primary due to + * r_impatient being exceeded, promote directly to primary. + * + * We have to do one final check here to see if the core is in + * the primary or reserved cpumask because we could potentially + * race with the core changing states between AND'ing the + * primary and reserve masks with p->cpus_ptr above, and + * atomically reserving it from the idle mask with + * scx_bpf_pick_idle_cpu(). This is also technically true of + * the checks above, but in all of those cases we just put the + * core directly into the primary mask so it's not really that + * big of a problem. Here, we want to make sure that we don't + * accidentally put a core into the reserve nest that was e.g. + * already in the primary nest. This is unlikely, but we check + * for it on what should be a relatively cold path regardless. + */ + stat_inc(NEST_STAT(WAKEUP_IDLE_OTHER)); + if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) + goto migrate_primary; + else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) + goto promote_to_primary; + else if (direct_to_primary) + goto promote_to_primary; + else + try_make_core_reserved(cpu, reserve, true); + bpf_rcu_read_unlock(); + return cpu; + } + + bpf_rcu_read_unlock(); + return prev_cpu; + +promote_to_primary: + stat_inc(NEST_STAT(PROMOTED_TO_PRIMARY)); +migrate_primary: + if (reset_impatient) + tctx->prev_misses = 0; + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); + if (pcpu_ctx) { + if (pcpu_ctx->scheduled_compaction) { + if (bpf_timer_cancel(&pcpu_ctx->timer) < 0) + scx_bpf_error("Failed to cancel pcpu timer"); + if (bpf_timer_set_callback(&pcpu_ctx->timer, compact_primary_core)) + scx_bpf_error("Failed to re-arm pcpu timer"); + pcpu_ctx->scheduled_compaction = false; + stat_inc(NEST_STAT(CANCELLED_COMPACTION)); + } + } else { + scx_bpf_error("Failed to lookup pcpu ctx"); + } + bpf_cpumask_set_cpu(cpu, primary); + /* + * Check to see whether the CPU is in the reserved nest. This can + * happen if the core is compacted concurrently with us trying to place + * the currently-waking task onto it. Similarly, this is the expected + * state of the core if we found the core in the reserve nest and are + * promoting it. + * + * We don't have to worry about racing with any other waking task here + * because we've atomically reserved the core with (some variant of) + * scx_bpf_pick_idle_cpu(). + */ + if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) { + __sync_sub_and_fetch(&nr_reserved, 1); + bpf_cpumask_clear_cpu(cpu, reserve); + } + bpf_rcu_read_unlock(); + update_attached(tctx, prev_cpu, cpu); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); + return cpu; +} +``` + +The `nest_select_cpu` function is the heart of the `scx_nest` scheduler. When a task wakes up, this function determines the most suitable CPU core for its execution. The function follows a series of checks to ensure that tasks are placed on high-frequency, idle cores, promoting efficiency and performance. + +Initially, it retrieves the task's context from the `task_ctx_stor` map. 
It then locks the read-copy-update (RCU) lock to safely access the primary and reserve CPU masks. The scheduler first attempts to place the task on its attached core, ensuring core affinity. If the attached core is not idle, it tries the previous core. Depending on various conditions, including the task's impatience (`r_impatient`) and the availability of idle cores in the primary and reserve nests, the scheduler decides whether to migrate the task, promote a core to the primary nest, or demote a core to the reserve nest. + +Throughout the process, the scheduler updates relevant statistics to provide insights into its operations. The use of RCU locks ensures that the scheduler's decisions are made safely without interfering with other concurrent operations. + +#### `nest_enqueue` + +Handles the enqueuing of tasks into the scheduling queue: + +```c +void BPF_STRUCT_OPS(nest_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *tctx; + u64 vtime = p->scx.dsq_vtime; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Unable to find task ctx"); + return; + } + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, vtime_now - slice_ns)) + vtime = vtime_now - slice_ns; + + scx_bpf_dispatch_vtime(p, FALLBACK_DSQ_ID, slice_ns, vtime, enq_flags); +} +``` + +The `nest_enqueue` function manages the queuing of tasks, adjusting their virtual time (`vtime`) to ensure fairness and prevent tasks from accumulating excessive execution budget while idling. If a task's `vtime` falls below a certain threshold, it's adjusted to maintain balance within the scheduler. + +#### `nest_dispatch` + +Manages the dispatching of tasks to CPU cores: + +```c +void BPF_STRUCT_OPS(nest_dispatch, s32 cpu, struct task_struct *prev) +{ + struct pcpu_ctx *pcpu_ctx; + struct bpf_cpumask *primary, *reserve; + s32 key = cpu; + bool in_primary; + + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + scx_bpf_error("No primary or reserve cpumask"); + return; + } + + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); + if (!pcpu_ctx) { + scx_bpf_error("Failed to lookup pcpu ctx"); + return; + } + + if (!scx_bpf_consume(FALLBACK_DSQ_ID)) { + in_primary = bpf_cpumask_test_cpu(cpu, cast_mask(primary)); + + if (prev && (prev->scx.flags & SCX_TASK_QUEUED) && in_primary) { + scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, slice_ns, 0); + return; + } + + stat_inc(NEST_STAT(NOT_CONSUMED)); + if (in_primary) { + /* + * Immediately demote a primary core if the previous + * task on it is dying + * + * Note that we elect to not compact the "first" CPU in + * the mask so as to encourage at least one core to + * remain in the nest. It would be better to check for + * whether there is only one core remaining in the + * nest, but BPF doesn't yet have a kfunc for querying + * cpumask weight. + */ + if ((prev && prev->__state == TASK_DEAD) && + (cpu != bpf_cpumask_first(cast_mask(primary)))) { + stat_inc(NEST_STAT(EAGERLY_COMPACTED)); + bpf_cpumask_clear_cpu(cpu, primary); + try_make_core_reserved(cpu, reserve, false); + } else { + pcpu_ctx->scheduled_compaction = true; + /* + * The core isn't being used anymore. Set a + * timer to remove the core from the nest in + * p_remove if it's still unused by that point. 
+ */ + bpf_timer_start(&pcpu_ctx->timer, p_remove_ns, + BPF_F_TIMER_CPU_PIN); + stat_inc(NEST_STAT(SCHEDULED_COMPACTION)); + } + } + return; + } + stat_inc(NEST_STAT(CONSUMED)); +} +``` + +The `nest_dispatch` function is responsible for dispatching tasks to CPU cores. It first checks if there's a task available in the fallback dispatch queue (`FALLBACK_DSQ_ID`). If no task is consumed, it evaluates whether the previous task on the CPU is dead. If so, and the CPU is not the first in the primary mask, the scheduler demotes the core to the reserve nest. Otherwise, it schedules a compaction timer to potentially demote the core after a specified duration (`p_remove_ns`). If a task is successfully consumed from the fallback queue, it increments the corresponding statistic. + +#### `nest_running` + +Updates the global virtual time when a task starts running: + +```c +void BPF_STRUCT_OPS(nest_running, struct task_struct *p) +{ + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} +``` + +The `nest_running` function ensures that the global virtual time (`vtime_now`) progresses forward as tasks start executing. This mechanism helps maintain fairness and temporal consistency across the scheduler's operations. + +#### `nest_stopping` + +Handles the stopping of a task, adjusting its virtual time: + +```c +void BPF_STRUCT_OPS(nest_stopping, struct task_struct *p, bool runnable) +{ + /* scale the execution time by the inverse of the weight and charge */ + p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight; +} +``` + +When a task stops running, `nest_stopping` adjusts its virtual time based on its execution slice and weight. This adjustment ensures that tasks are fairly accounted for in the scheduler's virtual time calculations, maintaining balance and preventing any single task from monopolizing CPU resources. + +#### `nest_init_task` + +Initializes a new task's context: + +```c +s32 BPF_STRUCT_OPS(nest_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct task_ctx *tctx; + struct bpf_cpumask *cpumask; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!tctx) + return -ENOMEM; + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + cpumask = bpf_kptr_xchg(&tctx->tmp_mask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + tctx->attached_core = -1; + tctx->prev_cpu = -1; + + return 0; +} +``` + +The `nest_init_task` function initializes the scheduling context for a new task. It ensures that the task's context is available by retrieving it from the `task_ctx_stor` map, creating a new `bpf_cpumask` for temporary calculations, and setting initial values for `attached_core` and `prev_cpu`. + +#### `nest_enable` + +Enables scheduling for a task by setting its virtual time: + +```c +void BPF_STRUCT_OPS(nest_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} +``` + +The `nest_enable` function activates scheduling for a task by initializing its virtual time (`dsq_vtime`) to the current global virtual time (`vtime_now`). 
This ensures that the task's scheduling state is synchronized with the scheduler's virtual time. + +#### `stats_timerfn` + +Handles periodic statistics collection: + +```c +static int stats_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + s32 cpu; + struct bpf_cpumask *primary, *reserve; + const struct cpumask *idle; + stats_primary_mask = 0; + stats_reserved_mask = 0; + stats_other_mask = 0; + stats_idle_mask = 0; + long err; + + bpf_rcu_read_lock(); + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + bpf_rcu_read_unlock(); + scx_bpf_error("Failed to lookup primary or reserve"); + return 0; + } + + idle = scx_bpf_get_idle_cpumask(); + bpf_for(cpu, 0, nr_cpus) { + if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) + stats_primary_mask |= (1ULL << cpu); + else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) + stats_reserved_mask |= (1ULL << cpu); + else + stats_other_mask |= (1ULL << cpu); + + if (bpf_cpumask_test_cpu(cpu, idle)) + stats_idle_mask |= (1ULL << cpu); + } + bpf_rcu_read_unlock(); + scx_bpf_put_idle_cpumask(idle); + + err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); + if (err) + scx_bpf_error("Failed to arm stats timer"); + + return 0; +} +``` + +The `stats_timerfn` function is invoked periodically by a central timer to collect and update scheduler statistics. It captures the current state of CPU cores, categorizing them into primary, reserve, other, and idle masks. This information provides insights into how the scheduler is managing CPU resources and task placement over time. After collecting the statistics, the function re-arms the timer to ensure continuous monitoring. + +#### `nest_init` + +Initializes the `scx_nest` scheduler: + +```c +s32 BPF_STRUCT_OPS_SLEEPABLE(nest_init) +{ + struct bpf_cpumask *cpumask; + s32 cpu; + int err; + struct bpf_timer *timer; + u32 key = 0; + + err = scx_bpf_create_dsq(FALLBACK_DSQ_ID, NUMA_NO_NODE); + if (err) { + scx_bpf_error("Failed to create fallback DSQ"); + return err; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + bpf_cpumask_clear(cpumask); + cpumask = bpf_kptr_xchg(&primary_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + bpf_cpumask_clear(cpumask); + cpumask = bpf_kptr_xchg(&reserve_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + bpf_for(cpu, 0, nr_cpus) { + s32 key = cpu; + struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); + + if (!ctx) { + scx_bpf_error("Failed to lookup pcpu_ctx"); + return -ENOENT; + } + ctx->scheduled_compaction = false; + if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME)) { + scx_bpf_error("Failed to initialize pcpu timer"); + return -EINVAL; + } + err = bpf_timer_set_callback(&ctx->timer, compact_primary_core); + if (err) { + scx_bpf_error("Failed to set pcpu timer callback"); + return -EINVAL; + } + } + + timer = bpf_map_lookup_elem(&stats_timer, &key); + if (!timer) { + scx_bpf_error("Failed to lookup central timer"); + return -ESRCH; + } + bpf_timer_init(timer, &stats_timer, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, stats_timerfn); + err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); + if (err) + scx_bpf_error("Failed to arm stats timer"); + + return err; +} +``` + +The `nest_init` function sets up the `scx_nest` scheduler during system initialization. It creates a fallback dispatch queue (`FALLBACK_DSQ_ID`) and initializes the primary and reserve CPU masks. 
For each CPU, it retrieves the per-CPU context from the `pcpu_ctxs` map, initializes a timer for core compaction, and sets the callback to `compact_primary_core`. Additionally, it initializes and starts the central statistics timer (`stats_timer`) with the callback function `stats_timerfn`, ensuring that scheduler statistics are continuously monitored. + +#### `nest_exit` + +Handles cleanup when the scheduler exits: + +```c +void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} +``` + +The `nest_exit` function records exit information and performs any necessary cleanup when the scheduler is being removed or the system is shutting down. This ensures that all resources are properly released and that the system remains stable. + +#### `SCX_OPS_DEFINE` + +Defines the operations structure for the `scx_nest` scheduler: + +```c +SCX_OPS_DEFINE(nest_ops, + .select_cpu = (void *)nest_select_cpu, + .enqueue = (void *)nest_enqueue, + .dispatch = (void *)nest_dispatch, + .running = (void *)nest_running, + .stopping = (void *)nest_stopping, + .init_task = (void *)nest_init_task, + .enable = (void *)nest_enable, + .init = (void *)nest_init, + .exit = (void *)nest_exit, + .flags = 0, + .name = "nest"); +``` + +The `SCX_OPS_DEFINE` macro binds all the scheduler's functions to the `nest_ops` structure, which the `sched_ext` framework uses to interface with the scheduler. This structure ensures that the scheduler's operations are correctly mapped and invoked by the kernel during task scheduling events. + +### Initialization and Cleanup + +Proper initialization and cleanup are crucial for the scheduler's stability and performance. + +#### `nest_init` Function + +The `nest_init` function is responsible for setting up the scheduler during system initialization. Here's how it operates: + +1. **Create Fallback Dispatch Queue:** + - It calls `scx_bpf_create_dsq` to create a fallback dispatch queue (`FALLBACK_DSQ_ID`). If this fails, it logs an error and exits. + +2. **Initialize Primary and Reserve CPU Masks:** + - It creates and clears a new `bpf_cpumask` for the primary mask. + - It exchanges the newly created mask with the existing `primary_cpumask`. If an old mask exists, it releases it. + - The same process is repeated for the reserve mask. + +3. **Initialize Per-CPU Contexts:** + - For each CPU, it retrieves the per-CPU context from the `pcpu_ctxs` map. + - It initializes the `scheduled_compaction` flag to `false`. + - It initializes the timer using `bpf_timer_init` and sets the callback to `compact_primary_core` using `bpf_timer_set_callback`. + - If any of these steps fail, it logs an error and exits. + +4. **Initialize and Start Statistics Timer:** + - It retrieves the central statistics timer from the `stats_timer` map. + - It initializes the timer and sets its callback to `stats_timerfn`. + - It starts the timer with a delay of `sampling_cadence_ns - 5000` nanoseconds. + - If starting the timer fails, it logs an error. + +5. **Return:** + - The function returns the result of the timer initialization, indicating success or failure. + +This initialization process ensures that all necessary components of the scheduler are correctly set up, including CPU masks, timers, and dispatch queues. 
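+
+The per-CPU timer handling in step 3 above (together with the later arming that `nest_dispatch` performs) boils down to the standard three-call `bpf_timer` recipe. The following hedged sketch is not part of `scx_nest.bpf.c`; it simply restates that recipe in isolation, using the `pcpu_ctxs` map, the `compact_primary_core` callback, and the `p_remove_ns` tunable introduced earlier:
+
+```c
+/* Sketch: the bpf_timer recipe behind core compaction (error handling simplified). */
+static int setup_compaction_timer(s32 cpu)
+{
+    struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu);
+
+    if (!ctx)
+        return -ENOENT;
+
+    /* 1. Initialize the timer embedded in the map value (nest_init does this once per CPU). */
+    if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME))
+        return -EINVAL;
+
+    /* 2. Attach the callback that demotes the core when the timer fires. */
+    if (bpf_timer_set_callback(&ctx->timer, compact_primary_core))
+        return -EINVAL;
+
+    /* 3. Arm it with a relative timeout; nest_dispatch uses BPF_F_TIMER_CPU_PIN so the
+     *    callback runs on the very core that is being considered for compaction. */
+    return bpf_timer_start(&ctx->timer, p_remove_ns, BPF_F_TIMER_CPU_PIN);
+}
+```
+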
+ +#### `nest_exit` Function + +The `nest_exit` function handles cleanup when the scheduler is being removed or the system is shutting down: + +```c +void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} +``` + +This function records exit information through the `UEI_RECORD` macro, ensuring that any necessary cleanup actions are performed. Proper cleanup is essential to maintain system stability and prevent resource leaks. + +### Final Scheduler Definition + +The `SCX_OPS_DEFINE` macro binds all the scheduler's functions into a single structure used by the `sched_ext` framework: + +```c +SCX_OPS_DEFINE(nest_ops, + .select_cpu = (void *)nest_select_cpu, + .enqueue = (void *)nest_enqueue, + .dispatch = (void *)nest_dispatch, + .running = (void *)nest_running, + .stopping = (void *)nest_stopping, + .init_task = (void *)nest_init_task, + .enable = (void *)nest_enable, + .init = (void *)nest_init, + .exit = (void *)nest_exit, + .flags = 0, + .name = "nest"); +``` + +This structure, `nest_ops`, effectively registers the scheduler's operations with the `sched_ext` framework, ensuring that the scheduler responds appropriately to various scheduling events and system states. + +## Compilation and Execution + +To compile and run the `scx_nest` scheduler, follow these steps: + +**Compile the Code:** + +Use `make` to build the scheduler. Ensure that you have the necessary build tools and kernel headers installed. + +```bash +make +``` + +**Run the Scheduler:** + +Execute the compiled scheduler binary. Depending on your system's configuration and permissions, you might need to run this command with elevated privileges. + +```bash +./scx_nest +``` + +### Sample Output + +Upon running the scheduler, you should observe output similar to the following: + +``` +# ./scx_nest + +Wakeup stats +------------ +WAKEUP_ATTACHED=150 +WAKEUP_PREV_PRIMARY=61 +WAKEUP_FULLY_IDLE_PRIMARY=0 +WAKEUP_ANY_IDLE_PRIMARY=103 +WAKEUP_FULLY_IDLE_RESERVE=0 +WAKEUP_ANY_IDLE_RESERVE=216 +WAKEUP_IDLE_OTHER=11 + + +Nest stats +---------- +TASK_IMPATIENT=67 +PROMOTED_TO_PRIMARY=217 +PROMOTED_TO_RESERVED=8 +DEMOTED_TO_RESERVED=212 +RESERVED_AT_CAPACITY=6 +SCHEDULED_COMPACTION=525 +CANCELLED_COMPACTION=314 +EAGERLY_COMPACTED=8 +CALLBACK_COMPACTED=208 + + +Consume stats +------------- +CONSUMED=166 +NOT_CONSUMED=667 + + + +Masks +----- +PRIMARY ( 0): | -------------------------------------------------------------------------------------------------------------------------------- | +RESERVED (10): | ***-*--*--------------------------------------------------------***-*--*-------------------------------------------------------- | +OTHER (128): | ******************************************************************************************************************************** | +IDLE (16): | ********--------------------------------------------------------********-------------------------------------------------------- | + + +^CEXIT: unregistered from user space +``` + +This output provides comprehensive statistics on task wakeups, nest operations, consumption rates, and CPU mask statuses. It indicates how the scheduler is managing tasks and CPU cores, showcasing the effectiveness of the `scx_nest` algorithm in maintaining high-frequency core utilization and efficient task placement. 
+ +## Summary and Call to Action + +In this tutorial, we've delved into the implementation of the `scx_nest` scheduler, an advanced eBPF program that customizes CPU scheduling to optimize performance based on core frequency and utilization. By leveraging the `sched_ext` framework, `scx_nest` demonstrates how eBPF can dynamically define scheduling behavior, offering flexibility and control beyond traditional schedulers. + +Key takeaways include: + +- Understanding the flexibility and power of the `sched_ext` scheduler class. +- Exploring the intricate data structures and maps that underpin the `scx_nest` scheduler. +- Analyzing core functions that manage task placement, core compaction, and statistics collection. +- Learning how to compile and execute the scheduler, observing its impact through detailed statistics. + +The `scx_nest` scheduler serves as an excellent example of how advanced eBPF programming can be utilized to implement complex system functionalities in a flexible and dynamic manner. + +If you'd like to dive deeper into eBPF and explore more advanced examples, visit our tutorial repository at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or check out our website at [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/). + +## References + +The original source code for the `scx_nest` scheduler is available in the [sched-ext/scx](https://github.com/sched-ext/scx) repository. + +Additional resources that can enhance your understanding include: + +- **Linux Kernel Documentation:** [Scheduler Ext Documentation](https://www.kernel.org/doc/html/next/scheduler/sched-ext.html) +- **Kernel Source Tree:** [Linux Kernel `sched_ext` Tools](https://github.com/torvalds/linux/tree/master/tools/sched_ext) +- **eBPF Official Documentation:** [https://ebpf.io/docs/](https://ebpf.io/docs/) +- **libbpf Documentation:** [https://github.com/libbpf/libbpf](https://github.com/libbpf/libbpf) + +Feel free to explore these resources to expand your knowledge and continue your journey into advanced eBPF programming! \ No newline at end of file diff --git a/src/45-scx-nest/README.zh.md b/src/45-scx-nest/README.zh.md new file mode 100644 index 00000000..0cae7768 --- /dev/null +++ b/src/45-scx-nest/README.zh.md @@ -0,0 +1,903 @@ +# eBPF 示例教程:实现 `scx_nest` 调度器 + +在系统性能优化不断发展的领域中,自定义和扩展内核行为的能力是非常宝贵的。实现这一目标的最强大工具之一是 eBPF(扩展的 Berkeley 包过滤器)。在本教程中,我们将探讨 `scx_nest` 调度器的实现,这是一个先进的 eBPF 程序,利用了在 Linux 内核版本 `6.12` 中引入的 `sched_ext` 调度器类。在本指南结束时,您将了解如何构建一个复杂的调度器,该调度器根据 CPU 核心频率和利用率动态调整任务分配。 + +## `sched_ext` 介绍 + +`sched_ext` 调度器类标志着 Linux 内核调度能力的重大进步。与传统调度器不同,`sched_ext` 允许通过一组 BPF(Berkeley 包过滤器)程序动态定义其行为。这种灵活性使开发人员能够实现针对特定工作负载和系统需求量身定制的自定义调度算法。 + +## 理解 `scx_nest` 调度器 + +### 概述 + +`scx_nest` 调度器受 Inria Paris 论文《[OS Scheduling with Nest: Keeping Tasks Close Together on Warm Cores](https://hal.inria.fr/hal-03612592/file/paper.pdf)》的启发。由 Meta Platforms, Inc. 
开发,`scx_nest` 专注于鼓励将任务分配到基于最近使用模式可能以更高频率运行的 CPU 核心上。这种方法旨在通过确保任务在最有效的核心上执行来优化性能。 + +该调度器作为一个全局加权虚拟时间(vtime)调度器运行,类似于完全公平调度器(CFS),同时利用 Nest 算法在任务唤醒时选择空闲核心。这种双重策略确保任务不仅被公平分配,还被放置在能够最有效执行它们的核心上。 + +`scx_nest` 旨在优化 CPU 利用率相对较低且可以受益于在少数核心上运行的工作负载。通过将任务集中在较少的核心上,调度器有助于保持这些核心的高频率,从而提升性能。然而,对于那些在分布到多个核心以避免缓存抖动时表现更好的工作负载,`scx_nest` 可能并不是理想选择。评估 `scx_nest` 对特定工作负载的适用性通常需要实验。 + +鉴于其设计,`scx_nest` 适用于生产环境,前提是满足硬件限制。它在具有统一 L3 缓存拓扑的单个 CCX(核心复合体)或单插槽主机上表现最佳。虽然当前版本未实现抢占,但所有 CPU 共享的调度队列确保队列前端的任务能够及时执行,前提是有足够的 CPU 可用。 + +## 高级代码分析 + +`scx_nest` 调度器的实现复杂,涉及各种数据结构、映射和函数,它们协同工作以管理任务分配和 CPU 核心利用率。完整的源代码可在 [eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 仓库中找到。下面,我们将剖析调度器的核心组件,详细解释每个部分。 + +### 核心数据结构和映射 + +#### 任务上下文 (`task_ctx`) + +系统中的每个任务都有一个关联的上下文,用于维护与调度相关的信息。这个上下文对于基于任务的历史和当前状态做出明智的调度决策至关重要。 + +```c +/* 每个任务的调度上下文 */ +struct task_ctx { + /* + * 用于计算任务的主掩码和保留掩码的临时 cpumask。 + */ + struct bpf_cpumask __kptr *tmp_mask; + + /* + * 任务观察到其之前的核心不为空闲的次数。如果连续发生 r_impatient 次, + * 将尝试从保留 Nest 或回退 Nest 中获取一个核心。 + */ + u32 prev_misses; + + /* + * 任务“附加”的核心,意味着它至少连续在该核心上执行了两次, + * 并且在唤醒时首先尝试迁移到该核心。任务只有在附加核心空闲且 + * 在主 Nest 中时才会迁移到附加核心。 + */ + s32 attached_core; + + /* + * 任务上次执行的核心。这用于确定任务是否应该附加到下一个 + * 执行的核心。 + */ + s32 prev_cpu; +}; +``` + +`task_ctx` 结构体包含一个临时 CPU 掩码 (`tmp_mask`),用于计算任务的主 CPU 集合和保留 CPU 集合。`prev_misses` 计数器跟踪任务的首选核心不为空闲的次数,影响迁移任务到不同核心的决策。`attached_core` 指示任务当前绑定的核心,确保在可能的情况下在高频率核心上运行。最后,`prev_cpu` 记录任务上次执行的核心,有助于维护任务与核心的亲和性。 + +#### 每 CPU 上下文 (`pcpu_ctx`) + +每个 CPU 都有一个关联的上下文,用于管理定时器和压缩状态。这个上下文有助于确定何时由于不活动而将核心从主 Nest 中降级。 + +```c +struct pcpu_ctx { + /* 用于从主 Nest 中压缩核心的定时器。 */ + struct bpf_timer timer; + + /* 当前核心是否已安排进行压缩。 */ + bool scheduled_compaction; +}; +``` + +`pcpu_ctx` 结构体包含一个 `bpf_timer`,用于调度压缩事件,以及一个布尔标志 `scheduled_compaction`,指示是否已为核心安排了压缩。 + +#### 映射 + +多个 BPF 映射用于存储上下文和管理定时器: + +```c +/* 任务存储映射 */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* 每 CPU 上下文 */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1024); + __type(key, s32); + __type(value, struct pcpu_ctx); +} pcpu_ctxs SEC(".maps"); + +/* 统计定时器 */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct stats_timer); +} stats_timer SEC(".maps"); +``` + +- **`task_ctx_stor`:** 该映射存储每个任务的调度上下文,使调度器能够访问和修改特定任务的信息。 +- **`pcpu_ctxs`:** 一个数组映射,保存每个 CPU 的上下文,使调度器能够管理每个 CPU 的定时器和压缩状态。 +- **`stats_timer`:** 一个单条目的数组映射,用于管理用于收集调度统计信息的中央定时器。 + +此外,调度器维护了主 CPU 掩码、保留 CPU 掩码、其他 CPU 掩码和空闲 CPU 掩码,以及用于跟踪各种调度器指标的统计映射。 + +### 核心函数 + +#### `stat_inc` + +一个辅助函数,用于递增调度统计数据: + +```c +static __always_inline void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} +``` + +此函数在 `stats` 映射中查找一个计数器,并在计数器存在时递增它。调度器在各处使用它来跟踪各种事件和状态。 + +#### `vtime_before` + +一个用于比较虚拟时间的实用函数: + +```c +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} +``` + +此函数确定虚拟时间 `a` 是否在 `b` 之前,有助于基于时间的调度决策。 + +#### `try_make_core_reserved` + +尝试将一个核心提升为保留 Nest: + +```c +static __always_inline void +try_make_core_reserved(s32 cpu, struct bpf_cpumask * reserved, bool promotion) +{ + s32 tmp_nr_reserved; + + /* + * 此检查存在竞争,但没关系。如果我们错误地未能将核心提升到保留, + * 那是因为另一个上下文在这个小窗口中添加或移除了保留中的核心。 + * 这将在随后的唤醒中平衡。 + */ + tmp_nr_reserved = nr_reserved; + if (tmp_nr_reserved < r_max) { + /* + * 这里有可能暂时超过 r_max,但随着更多核心被降级或未能 + * 被提升到保留 Nest,应该会平衡。 + */ + 
__sync_fetch_and_add(&nr_reserved, 1); + bpf_cpumask_set_cpu(cpu, reserved); + if (promotion) + stat_inc(NEST_STAT(PROMOTED_TO_RESERVED)); + else + stat_inc(NEST_STAT(DEMOTED_TO_RESERVED)); + } else { + bpf_cpumask_clear_cpu(cpu, reserved); + stat_inc(NEST_STAT(RESERVED_AT_CAPACITY)); + } +} +``` + +`try_make_core_reserved` 函数尝试将一个 CPU 核心添加到保留掩码中。首先检查保留核心的数量 (`nr_reserved`) 是否低于允许的最大值 (`r_max`)。如果是,则递增 `nr_reserved` 计数器并将核心添加到保留掩码中。根据核心是被提升还是降级,递增相应的统计数据。如果保留容量已满,则从保留掩码中清除核心并更新相关统计数据。 + +#### `update_attached` + +根据最近的执行更新任务的附加核心: + +```c +static void update_attached(struct task_ctx *tctx, s32 prev_cpu, s32 new_cpu) +{ + if (tctx->prev_cpu == new_cpu) + tctx->attached_core = new_cpu; + tctx->prev_cpu = prev_cpu; +} +``` + +此函数更新任务的 `attached_core`。如果任务连续在同一核心上执行,它会将任务附加到该核心。然后更新 `prev_cpu` 以反映任务最近运行的核心。 + +#### `compact_primary_core` + +处理主核心的压缩,将其降级到保留 Nest: + +```c +static int compact_primary_core(void *map, int *key, struct bpf_timer *timer) +{ + struct bpf_cpumask *primary, *reserve; + s32 cpu = bpf_get_smp_processor_id(); + struct pcpu_ctx *pcpu_ctx; + + stat_inc(NEST_STAT(CALLBACK_COMPACTED)); + + /* + * 如果我们到达此回调,这意味着定时器回调从未被取消, + * 因此需要将核心从主 Nest 中降级。 + */ + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); + if (!pcpu_ctx) { + scx_bpf_error("无法查找 pcpu ctx"); + return 0; + } + bpf_rcu_read_lock(); + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + scx_bpf_error("无法找到 primary 或 reserve"); + bpf_rcu_read_unlock(); + return 0; + } + + bpf_cpumask_clear_cpu(cpu, primary); + try_make_core_reserved(cpu, reserve, false); + bpf_rcu_read_unlock(); + pcpu_ctx->scheduled_compaction = false; + return 0; +} +``` + +当压缩定时器到期时,将调用 `compact_primary_core`。它通过从主掩码中清除当前 CPU 核心并尝试将其添加到保留掩码中,将当前 CPU 核心从主 Nest 降级到保留 Nest。这确保了不活动的核心得到有效管理,保持性能和资源利用之间的平衡。 + +#### `nest_select_cpu` + +在任务唤醒时确定适当的 CPU: + +```c +s32 BPF_STRUCT_OPS(nest_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct bpf_cpumask *p_mask, *primary, *reserve; + s32 cpu; + struct task_ctx *tctx; + struct pcpu_ctx *pcpu_ctx; + bool direct_to_primary = false, reset_impatient = true; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) + return -ENOENT; + + bpf_rcu_read_lock(); + p_mask = tctx->tmp_mask; + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!p_mask || !primary || !reserve) { + bpf_rcu_read_unlock(); + return -ENOENT; + } + + tctx->prev_cpu = prev_cpu; + + bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary)); + + /* 首先尝试在附加核心上唤醒任务。 */ + if (bpf_cpumask_test_cpu(tctx->attached_core, cast_mask(p_mask)) && + scx_bpf_test_and_clear_cpu_idle(tctx->attached_core)) { + cpu = tctx->attached_core; + stat_inc(NEST_STAT(WAKEUP_ATTACHED)); + goto migrate_primary; + } + + /* + * 如果之前的核心在主集合中,并且没有 hypertwin,则尝试留在之前的核心。 + * 如果之前的核心是任务附加的核心,不需要再尝试,因为我们已经在上面尝试过了。 + */ + if (prev_cpu != tctx->attached_core && + bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + stat_inc(NEST_STAT(WAKEUP_PREV_PRIMARY)); + goto migrate_primary; + } + + if (find_fully_idle) { + /* 然后尝试在主集合中选择任何完全空闲的核心。 */ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_PRIMARY)); + goto migrate_primary; + } + } + + /* 然后尝试在主集合中选择任何空闲的核心,即使其 hypertwin 正在活动。 */ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_PRIMARY)); + goto migrate_primary; + } + + if (r_impatient > 0 && 
++tctx->prev_misses >= r_impatient) { + direct_to_primary = true; + tctx->prev_misses = 0; + stat_inc(NEST_STAT(TASK_IMPATIENT)); + } + + reset_impatient = false; + + /* 然后尝试在保留集合中选择任何完全空闲的核心。 */ + bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(reserve)); + if (find_fully_idle) { + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_RESERVE)); + goto promote_to_primary; + } + } + + /* 然后尝试在保留集合中选择任何空闲的核心,即使其 hypertwin 正在活动。 */ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_RESERVE)); + goto promote_to_primary; + } + + /* 然后尝试在任务的 cpumask 中选择任何空闲的核心。 */ + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) { + /* + * 我们找到了一个核心(我们认为它不在任何 Nest 中)。 + * 这意味着我们需要将该核心提升到保留 Nest,或者如果由于 + * 超过 r_impatient 而直接提升到主 Nest。 + * + * 我们必须在这里进行最后一次检查,看看核心是否在主掩码或保留掩码中, + * 因为我们可能与核心在将主掩码和保留掩码与 p->cpus_ptr 进行 AND + * 运算之间更改状态,并使用 scx_bpf_pick_idle_cpu() 原子性地保留它。 + * 这在上面的检查中技术上也是如此,但在那些情况下我们只是直接 + * 将核心放入主掩码中,因此问题不大。在这里,我们要确保不会 + * 意外地将已经在主掩码中的核心放入保留 Nest 中。这是不太可能的, + * 但我们在应该相对冷路径上进行了检查。 + */ + stat_inc(NEST_STAT(WAKEUP_IDLE_OTHER)); + if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) + goto migrate_primary; + else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) + goto promote_to_primary; + else if (direct_to_primary) + goto promote_to_primary; + else + try_make_core_reserved(cpu, reserve, true); + bpf_rcu_read_unlock(); + return cpu; + } + + bpf_rcu_read_unlock(); + return prev_cpu; + +promote_to_primary: + stat_inc(NEST_STAT(PROMOTED_TO_PRIMARY)); +migrate_primary: + if (reset_impatient) + tctx->prev_misses = 0; + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); + if (pcpu_ctx) { + if (pcpu_ctx->scheduled_compaction) { + if (bpf_timer_cancel(&pcpu_ctx->timer) < 0) + scx_bpf_error("取消 pcpu 定时器失败"); + if (bpf_timer_set_callback(&pcpu_ctx->timer, compact_primary_core)) + scx_bpf_error("重新设置 pcpu 定时器回调失败"); + pcpu_ctx->scheduled_compaction = false; + stat_inc(NEST_STAT(CANCELLED_COMPACTION)); + } + } else { + scx_bpf_error("查找 pcpu ctx 失败"); + } + bpf_cpumask_set_cpu(cpu, primary); + /* + * 检查 CPU 是否在保留掩码中。如果是,这可能发生在核心在我们尝试 + * 将当前唤醒任务分配到其上时被并发地压缩。同样,如果我们在 + * 由于超时直接提升到主 Nest,也会发生这种情况。 + * + * 我们不必担心与其他唤醒任务的竞争,因为我们已经通过(某种 + * 变体的)scx_bpf_pick_idle_cpu() 原子性地保留了该核心。 + */ + if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) { + __sync_sub_and_fetch(&nr_reserved, 1); + bpf_cpumask_clear_cpu(cpu, reserve); + } + bpf_rcu_read_unlock(); + update_attached(tctx, prev_cpu, cpu); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); + return cpu; +} +``` + +`nest_select_cpu` 函数是 `scx_nest` 调度器的核心。当任务唤醒时,此函数确定其执行最合适的 CPU 核心。该函数遵循一系列检查,以确保任务被放置在高频率、空闲的核心上,从而提升效率和性能。 + +最初,它从 `task_ctx_stor` 映射中检索任务的上下文。然后,它锁定读拷贝更新(RCU)锁,以安全地访问主掩码和保留掩码。调度器首先尝试将任务放置在其附加核心上,确保核心亲和性。如果附加核心不空闲,它会尝试先前的核心。根据各种条件,包括任务的急躁程度 (`r_impatient`) 和主 Nest 及保留 Nest 中空闲核心的可用性,调度器决定是否迁移任务、将核心提升到主 Nest,或将核心降级到保留 Nest。 + +在整个过程中,调度器更新相关统计数据,以提供对其操作的见解。使用 RCU 锁确保调度器的决策是在不干扰其他并发操作的情况下安全做出的。 + +#### `nest_enqueue` + +处理将任务入队到调度队列: + +```c +void BPF_STRUCT_OPS(nest_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *tctx; + u64 vtime = p->scx.dsq_vtime; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("无法找到任务上下文"); + return; + } + + /* + * 将空闲任务的预算限制为一个切片。 + */ + if (vtime_before(vtime, vtime_now - slice_ns)) + vtime = vtime_now - slice_ns; + + scx_bpf_dispatch_vtime(p, FALLBACK_DSQ_ID, slice_ns, vtime, enq_flags); +} +``` + +`nest_enqueue` 函数管理任务的入队,调整其虚拟时间 
(`vtime`) 以确保公平性并防止任务在空闲时积累过多的执行预算。如果任务的 `vtime` 低于某个阈值,它将被调整以保持调度器内部的平衡。 + +#### `nest_dispatch` + +管理将任务分派到 CPU 核心: + +```c +void BPF_STRUCT_OPS(nest_dispatch, s32 cpu, struct task_struct *prev) +{ + struct pcpu_ctx *pcpu_ctx; + struct bpf_cpumask *primary, *reserve; + s32 key = cpu; + bool in_primary; + + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + scx_bpf_error("没有主或保留 cpumask"); + return; + } + + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); + if (!pcpu_ctx) { + scx_bpf_error("查找 pcpu ctx 失败"); + return; + } + + if (!scx_bpf_consume(FALLBACK_DSQ_ID)) { + in_primary = bpf_cpumask_test_cpu(cpu, cast_mask(primary)); + + if (prev && (prev->scx.flags & SCX_TASK_QUEUED) && in_primary) { + scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, slice_ns, 0); + return; + } + + stat_inc(NEST_STAT(NOT_CONSUMED)); + if (in_primary) { + /* + * 如果主集合中的前一个任务正在死亡,立即降级主核心。 + * + * 注意,我们选择不压缩掩码中的“第一个” CPU,以鼓励至少保留一个核心在 Nest 中。 + * 最好检查是否仅剩一个核心在 Nest 中,但 BPF 目前没有用于查询 + * cpumask 权重的内核函数。 + */ + if ((prev && prev->__state == TASK_DEAD) && + (cpu != bpf_cpumask_first(cast_mask(primary)))) { + stat_inc(NEST_STAT(EAGERLY_COMPACTED)); + bpf_cpumask_clear_cpu(cpu, primary); + try_make_core_reserved(cpu, reserve, false); + } else { + pcpu_ctx->scheduled_compaction = true; + /* + * 核心不再被使用。设置定时器以在 p_remove 中移除核心 + * 如果在那时仍未使用。 + */ + bpf_timer_start(&pcpu_ctx->timer, p_remove_ns, + BPF_F_TIMER_CPU_PIN); + stat_inc(NEST_STAT(SCHEDULED_COMPACTION)); + } + } + return; + } + stat_inc(NEST_STAT(CONSUMED)); +} +``` + +`nest_dispatch` 函数负责将任务分派到 CPU 核心。它首先检查回退调度队列 (`FALLBACK_DSQ_ID`) 中是否有可用任务。如果没有任务被消耗,它会评估 CPU 上的前一个任务是否已经死亡。如果是,并且 CPU 不在主掩码中的第一个位置,调度器将核心降级到保留 Nest。否则,它会为核心安排一个压缩定时器,以便在指定时间后可能降级该核心。如果从回退队列成功消耗了一个任务,它会递增相应的统计数据。 + +#### `nest_running` + +当任务开始运行时更新全局虚拟时间: + +```c +void BPF_STRUCT_OPS(nest_running, struct task_struct *p) +{ + /* + * 全局虚拟时间在任务开始执行时总是向前推进。 + * 测试和更新可以从多个 CPU 同时执行,因此存在竞争。 + * 任何错误都应该是可控且暂时的。我们就这样处理。 + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} +``` + +`nest_running` 函数确保全局虚拟时间 (`vtime_now`) 在任务开始执行时向前推进。这一机制有助于维护调度器操作的公平性和时间一致性。 + +#### `nest_stopping` + +处理任务停止运行,调整其虚拟时间: + +```c +void BPF_STRUCT_OPS(nest_stopping, struct task_struct *p, bool runnable) +{ + /* 按权重的倒数和费用缩放执行时间 */ + p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight; +} +``` + +当任务停止运行时,`nest_stopping` 根据其执行切片和权重调整其虚拟时间。这一调整确保任务在调度器的虚拟时间计算中得到公平考虑,保持平衡并防止任何单个任务垄断 CPU 资源。 + +#### `nest_init_task` + +初始化新任务的上下文: + +```c +s32 BPF_STRUCT_OPS(nest_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct task_ctx *tctx; + struct bpf_cpumask *cpumask; + + /* + * @p 是新的。确保其 task_ctx 可用。 + * 我们可以在此函数中休眠,以下内容将自动使用 GFP_KERNEL。 + */ + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!tctx) + return -ENOMEM; + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + cpumask = bpf_kptr_xchg(&tctx->tmp_mask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + tctx->attached_core = -1; + tctx->prev_cpu = -1; + + return 0; +} +``` + +`nest_init_task` 函数为新任务初始化调度上下文。它通过从 `task_ctx_stor` 映射中检索任务的上下文来确保任务的上下文可用,创建一个新的 `bpf_cpumask` 进行临时计算,并为 `attached_core` 和 `prev_cpu` 设置初始值。 + +#### `nest_enable` + +通过设置任务的虚拟时间启用调度: + +```c +void BPF_STRUCT_OPS(nest_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} +``` + +`nest_enable` 函数通过将任务的虚拟时间 (`dsq_vtime`) 初始化为当前的全局虚拟时间 (`vtime_now`) 来激活任务的调度。这确保了任务的调度状态与调度器的虚拟时间同步。 + +#### `stats_timerfn` + +处理定期的统计信息收集: 
+ +```c +static int stats_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + s32 cpu; + struct bpf_cpumask *primary, *reserve; + const struct cpumask *idle; + stats_primary_mask = 0; + stats_reserved_mask = 0; + stats_other_mask = 0; + stats_idle_mask = 0; + long err; + + bpf_rcu_read_lock(); + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + bpf_rcu_read_unlock(); + scx_bpf_error("查找主或保留失败"); + return 0; + } + + idle = scx_bpf_get_idle_cpumask(); + bpf_for(cpu, 0, nr_cpus) { + if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) + stats_primary_mask |= (1ULL << cpu); + else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) + stats_reserved_mask |= (1ULL << cpu); + else + stats_other_mask |= (1ULL << cpu); + + if (bpf_cpumask_test_cpu(cpu, idle)) + stats_idle_mask |= (1ULL << cpu); + } + bpf_rcu_read_unlock(); + scx_bpf_put_idle_cpumask(idle); + + err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); + if (err) + scx_bpf_error("启动统计定时器失败"); + + return 0; +} +``` + +`stats_timerfn` 函数由中央定时器定期调用,用于收集和更新调度统计信息。它捕捉当前 CPU 核心的状态,将它们分类到主、保留、其他和空闲掩码中。这些信息提供了调度器如何管理 CPU 资源和任务分配的洞察。在收集统计信息后,该函数重新启动定时器以确保持续监控。 + +#### `nest_init` + +初始化 `scx_nest` 调度器: + +```c +s32 BPF_STRUCT_OPS_SLEEPABLE(nest_init) +{ + struct bpf_cpumask *cpumask; + s32 cpu; + int err; + struct bpf_timer *timer; + u32 key = 0; + + err = scx_bpf_create_dsq(FALLBACK_DSQ_ID, NUMA_NO_NODE); + if (err) { + scx_bpf_error("创建回退 DSQ 失败"); + return err; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + bpf_cpumask_clear(cpumask); + cpumask = bpf_kptr_xchg(&primary_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + bpf_cpumask_clear(cpumask); + cpumask = bpf_kptr_xchg(&reserve_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + bpf_for(cpu, 0, nr_cpus) { + s32 key = cpu; + struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); + + if (!ctx) { + scx_bpf_error("查找 pcpu_ctx 失败"); + return -ENOENT; + } + ctx->scheduled_compaction = false; + if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME)) { + scx_bpf_error("初始化 pcpu 定时器失败"); + return -EINVAL; + } + err = bpf_timer_set_callback(&ctx->timer, compact_primary_core); + if (err) { + scx_bpf_error("设置 pcpu 定时器回调失败"); + return -EINVAL; + } + } + + timer = bpf_map_lookup_elem(&stats_timer, &key); + if (!timer) { + scx_bpf_error("查找中央定时器失败"); + return -ESRCH; + } + bpf_timer_init(timer, &stats_timer, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, stats_timerfn); + err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); + if (err) + scx_bpf_error("启动统计定时器失败"); + + return err; +} +``` + +`nest_init` 函数在系统初始化期间设置 `scx_nest` 调度器。它创建了一个回退调度队列 (`FALLBACK_DSQ_ID`) 并初始化了主掩码和保留掩码。对于每个 CPU,它从 `pcpu_ctxs` 映射中检索每 CPU 上下文,初始化压缩定时器,并将回调设置为 `compact_primary_core`。此外,它初始化并启动中央统计定时器 (`stats_timer`) 及其回调函数 `stats_timerfn`,确保调度器统计信息的持续监控。 + +#### `nest_exit` + +在调度器退出时进行清理: + +```c +void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} +``` + +`nest_exit` 函数记录退出信息并在调度器被移除或系统关闭时执行任何必要的清理操作。这确保所有资源得到适当释放,系统保持稳定。 + +#### `SCX_OPS_DEFINE` + +为 `scx_nest` 调度器定义操作结构: + +```c +SCX_OPS_DEFINE(nest_ops, + .select_cpu = (void *)nest_select_cpu, + .enqueue = (void *)nest_enqueue, + .dispatch = (void *)nest_dispatch, + .running = (void *)nest_running, + .stopping = (void *)nest_stopping, + .init_task = (void *)nest_init_task, + .enable = (void *)nest_enable, + .init = (void *)nest_init, + .exit 
= (void *)nest_exit, + .flags = 0, + .name = "nest"); +``` + +`SCX_OPS_DEFINE` 宏将调度器的所有函数绑定到 `nest_ops` 结构中,`sched_ext` 框架使用该结构与调度器进行接口。这确保调度器的操作在任务调度事件期间被正确映射和调用。 + +### 初始化和清理 + +适当的初始化和清理对于调度器的稳定性和性能至关重要。 + +#### `nest_init` 函数 + +`nest_init` 函数负责在系统初始化期间设置调度器。其操作如下: + +1. **创建回退调度队列:** + - 调用 `scx_bpf_create_dsq` 创建回退调度队列 (`FALLBACK_DSQ_ID`)。如果失败,记录错误并退出。 + +2. **初始化主掩码和保留掩码:** + - 创建并清除一个新的 `bpf_cpumask` 作为主掩码。 + - 将新创建的掩码与现有的 `primary_cpumask` 交换。如果存在旧掩码,则释放它。 + - 对保留掩码重复相同的过程。 + +3. **初始化每 CPU 上下文:** + - 对于每个 CPU,从 `pcpu_ctxs` 映射中检索每 CPU 上下文。 + - 将 `scheduled_compaction` 标志初始化为 `false`。 + - 使用 `bpf_timer_init` 初始化定时器,并使用 `bpf_timer_set_callback` 将回调设置为 `compact_primary_core`。 + - 如果任何步骤失败,记录错误并退出。 + +4. **初始化并启动统计定时器:** + - 从 `stats_timer` 映射中检索中央统计定时器。 + - 初始化定时器并将其回调设置为 `stats_timerfn`。 + - 以 `sampling_cadence_ns - 5000` 纳秒的延迟启动定时器。 + - 如果启动定时器失败,记录错误。 + +5. **返回:** + - 函数返回定时器初始化的结果,指示成功或失败。 + +这一初始化过程确保调度器的所有必要组件(包括 CPU 掩码、定时器和调度队列)都已正确设置。 + +#### `nest_exit` 函数 + +`nest_exit` 函数在调度器被移除或系统关闭时处理清理工作: + +```c +void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} +``` + +此函数通过 `UEI_RECORD` 宏记录退出信息,确保执行任何必要的清理操作。这对于保持系统稳定性和防止资源泄漏至关重要。 + +### 最终调度器定义 + +`SCX_OPS_DEFINE` 宏将调度器的所有函数绑定到单一结构中,供 `sched_ext` 框架使用: + +```c +SCX_OPS_DEFINE(nest_ops, + .select_cpu = (void *)nest_select_cpu, + .enqueue = (void *)nest_enqueue, + .dispatch = (void *)nest_dispatch, + .running = (void *)nest_running, + .stopping = (void *)nest_stopping, + .init_task = (void *)nest_init_task, + .enable = (void *)nest_enable, + .init = (void *)nest_init, + .exit = (void *)nest_exit, + .flags = 0, + .name = "nest"); +``` + +此结构体 `nest_ops` 有效地将调度器的操作注册到 `sched_ext` 框架,确保调度器在各种调度事件和系统状态下做出适当响应。 + +## 编译和执行 + +要编译和运行 `scx_nest` 调度器,请按照以下步骤操作: + +**编译代码:** + +使用 `make` 构建调度器。确保已安装必要的构建工具和内核头文件。 + +```bash +make +``` + +**运行调度器:** + +执行编译后的调度器二进制文件。根据系统配置和权限,您可能需要以提升的权限运行此命令。 + +```bash +./scx_nest +``` + +### 示例输出 + +运行调度器后,您应该会看到类似以下的输出: + +``` +# ./scx_nest + +唤醒统计 +------------ +WAKEUP_ATTACHED=150 +WAKEUP_PREV_PRIMARY=61 +WAKEUP_FULLY_IDLE_PRIMARY=0 +WAKEUP_ANY_IDLE_PRIMARY=103 +WAKEUP_FULLY_IDLE_RESERVE=0 +WAKEUP_ANY_IDLE_RESERVE=216 +WAKEUP_IDLE_OTHER=11 + + +Nest 统计 +---------- +TASK_IMPATIENT=67 +PROMOTED_TO_PRIMARY=217 +PROMOTED_TO_RESERVED=8 +DEMOTED_TO_RESERVED=212 +RESERVED_AT_CAPACITY=6 +SCHEDULED_COMPACTION=525 +CANCELLED_COMPACTION=314 +EAGERLY_COMPACTED=8 +CALLBACK_COMPACTED=208 + + +消耗统计 +------------- +CONSUMED=166 +NOT_CONSUMED=667 + + + +掩码 +----- +PRIMARY ( 0): | -------------------------------------------------------------------------------------------------------------------------------- | +RESERVED (10): | ***-*--*--------------------------------------------------------***-*--*-------------------------------------------------------- | +OTHER (128): | ******************************************************************************************************************************** | +IDLE (16): | ********--------------------------------------------------------********-------------------------------------------------------- | + + +^C退出:已从用户空间注销 +``` + +此输出提供了有关任务唤醒、Nest 操作、消耗率和 CPU 掩码状态的全面统计信息。它显示了调度器如何管理任务和 CPU 核心,展示了 `scx_nest` 算法在保持高频率核心利用率和高效任务分配方面的有效性。 + +## 总结与行动呼吁 + +在本教程中,我们深入探讨了 `scx_nest` 调度器的实现,这是一个先进的 eBPF 程序,基于核心频率和利用率定制 CPU 调度以优化性能。通过利用 `sched_ext` 框架,`scx_nest` 展示了 eBPF 如何动态定义调度行为,提供超越传统调度器的灵活性和控制力。 + +主要收获包括: + +- 理解 `sched_ext` 调度器类的灵活性和强大功能。 +- 探索支撑 `scx_nest` 调度器的复杂数据结构和映射。 +- 分析管理任务分配、核心压缩和统计信息收集的核心函数。 +- 学习如何编译和执行调度器,并通过详细统计信息观察其影响。 + +`scx_nest` 
调度器是一个极好的例子,展示了如何利用先进的 eBPF 编程以灵活和动态的方式实现复杂的系统功能。 + +如果您想深入了解 eBPF 并探索更多高级示例,请访问我们的教程仓库 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或查看我们的网站 [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/)。 + +## 参考文献 + +`scx_nest` 调度器的原始源代码可在 [sched-ext/scx](https://github.com/sched-ext/scx) 仓库中找到。 + +可以增强您理解的其他资源包括: + +- **Linux 内核文档:** [Scheduler Ext 文档](https://www.kernel.org/doc/html/next/scheduler/sched-ext.html) +- **内核源树:** [Linux 内核 `sched_ext` 工具](https://github.com/torvalds/linux/tree/master/tools/sched_ext) +- **eBPF 官方文档:** [https://ebpf.io/docs/](https://ebpf.io/docs/) +- **libbpf 文档:** [https://github.com/libbpf/libbpf](https://github.com/libbpf/libbpf) + +欢迎探索这些资源,扩展您的知识,继续深入学习高级 eBPF 编程的旅程。 \ No newline at end of file diff --git a/src/45-scx-nest/include/bpf-compat/gnu/stubs.h b/src/45-scx-nest/include/bpf-compat/gnu/stubs.h new file mode 100644 index 00000000..ad7d139c --- /dev/null +++ b/src/45-scx-nest/include/bpf-compat/gnu/stubs.h @@ -0,0 +1,11 @@ +/* + * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when + * compiling BPF files although its content doesn't play any role. The file in + * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is + * defined. When compiling a BPF source, __x86_64__ isn't set and thus + * stubs-32.h is selected. However, the file is not there if the system doesn't + * have 32bit glibc devel package installed leading to a build failure. + * + * The problem is worked around by making this file available in the include + * search paths before the system one when building BPF. + */ diff --git a/src/45-scx-nest/include/scx/common.bpf.h b/src/45-scx-nest/include/scx/common.bpf.h new file mode 100644 index 00000000..225f61f9 --- /dev/null +++ b/src/45-scx-nest/include/scx/common.bpf.h @@ -0,0 +1,427 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __SCX_COMMON_BPF_H +#define __SCX_COMMON_BPF_H + +#ifdef LSP +#define __bpf__ +#include "../vmlinux/vmlinux.h" +#else +#include "vmlinux.h" +#endif + +#include +#include +#include +#include "user_exit_info.h" + +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. 
+ */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch_cancel(void) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; +void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +u32 scx_bpf_reenqueue_local(void) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; +struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; +void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; +void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; +void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; +u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; +u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; +void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; +u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; +const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; +const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; +void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; + +/* + * Use the following as @it__iter when calling + * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops. + */ +#define BPF_FOR_EACH_ITER (&___it) + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} + +/* + * Helper macro for initializing the fmt and variadic argument inputs to both + * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to + * refer to the initialized list of inputs to the bstr kfunc. + */ +#define scx_bpf_bstr_preamble(fmt, args...) 
\ + static char ___fmt[] = fmt; \ + /* \ + * Note that __param[] must have at least one \ + * element to keep the verifier happy. \ + */ \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + +/* + * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments + * instead of an array of u64. Using this macro will cause the scheduler to + * exit cleanly with the specified exit code being passed to user space. + */ +#define scx_bpf_exit(code, fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Invoking this macro will cause the scheduler to + * exit in an erroneous state, with diagnostic information being passed to the + * user. + */ +#define scx_bpf_error(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +/* + * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments + * instead of an array of u64. To be used from ops.dump() and friends. + */ +#define scx_bpf_dump(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized + * @elfsec: the data section of the BPF program in which to place the array + * @arr: the name of the array + * + * libbpf has an API for setting map value sizes. Since data sections (i.e. + * bss, data, rodata) themselves are maps, a data section can be resized. If + * a data section has an array as its last element, the BTF info for that + * array will be adjusted so that length of the array is extended to meet the + * new length of the data section. This macro annotates an array to have an + * element count of one with the assumption that this array can be resized + * within the userspace program. It also annotates the section specifier so + * this array exists in a custom sub data section which can be resized + * independently. + * + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an + * array declared with RESIZABLE_ARRAY(). + */ +#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and add that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. 
However, + * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller + * must check for %NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for %NULL and dereference + * immediately. + * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + * + * sizeof(@base) should encompass the memory area to be accessed and thus can't + * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of + * `MEMBER_VPTR(ptr, ->member)`. + */ +#define MEMBER_VPTR(base, member) (typeof((base) member) *) \ +({ \ + u64 __base = (u64)&(base); \ + u64 __addr = (u64)&((base) member) - __base; \ + _Static_assert(sizeof(base) >= sizeof((base) member), \ + "@base is smaller than @member, is @base a pointer?"); \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof((base) member))); \ + __addr; \ +}) + +/** + * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element + * @arr: array to index into + * @i: array index + * @n: number of elements in array + * + * Similar to MEMBER_VPTR() but is intended for use with arrays where the + * element count needs to be explicit. + * It can be used in cases where a global array is defined with an initial + * size but is intended to be be resized before loading the BPF program. + * Without this version of the macro, MEMBER_VPTR() will use the compile time + * size of the array to compute the max, which will result in rejection by + * the verifier. + */ +#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ +({ \ + u64 __base = (u64)arr; \ + u64 __addr = (u64)&(arr[i]) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ + __addr; \ +}) + + +/* + * BPF declarations and helpers + */ + +/* list and rbtree */ +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) +#define private(name) SEC(".data." 
#name) __hidden __attribute__((aligned(8))) + +void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; +void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; +struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; +int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) + +struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; +#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) + +/* task */ +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; + +/* cgroup */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; +struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; + +/* css iteration */ +struct bpf_iter_css; +struct cgroup_subsys_state; +extern int bpf_iter_css_new(struct bpf_iter_css *it, + struct cgroup_subsys_state *start, + unsigned int flags) __weak __ksym; +extern struct cgroup_subsys_state * +bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; +extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_full(const struct 
cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; +u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; + +/* + * Access a cpumask in read-only mode (typically to check bits). + */ +const struct cpumask *cast_mask(struct bpf_cpumask *mask) +{ + return (const struct cpumask *)mask; +} + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + + +/* + * Other helpers + */ + +/* useful compiler attributes */ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define __maybe_unused __attribute__((__unused__)) + +/* + * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They + * prevent compiler from caching, redoing or reordering reads or writes. + */ +typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; +typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; +typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; +typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; + +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; + case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; + case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; + case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; + default: + barrier(); + __builtin_memcpy((void *)res, (const void *)p, size); + barrier(); + } +} + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; + case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; + case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; + case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; + default: + barrier(); + __builtin_memcpy((void *)p, (const void *)res, size); + barrier(); + } +} + +#define READ_ONCE(x) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +/* + * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. + * @v: The value for which we're computing the base 2 logarithm. + */ +static inline u32 log2_u32(u32 v) +{ + u32 r; + u32 shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + return r; +} + +/* + * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value. + * @v: The value for which we're computing the base 2 logarithm. 
+ */ +static inline u32 log2_u64(u64 v) +{ + u32 hi = v >> 32; + if (hi) + return log2_u32(hi) + 32 + 1; + else + return log2_u32(v) + 1; +} + +#include "compat.bpf.h" + +#endif /* __SCX_COMMON_BPF_H */ diff --git a/src/45-scx-nest/include/scx/common.h b/src/45-scx-nest/include/scx/common.h new file mode 100644 index 00000000..5b0f9015 --- /dev/null +++ b/src/45-scx-nest/include/scx/common.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_H +#define __SCHED_EXT_COMMON_H + +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + +#include +#include +#include +#include +#include + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#define SCX_BUG(__fmt, ...) \ + do { \ + fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \ + if (errno) \ + fprintf(stderr, " (%s)\n", strerror(errno)); \ + else \ + fprintf(stderr, "\n"); \ + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } while (0) + +#define SCX_BUG_ON(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ + } while (0) + +/** + * RESIZE_ARRAY - Convenience macro for resizing a BPF array + * @__skel: the skeleton containing the array + * @elfsec: the data section of the BPF program in which the array exists + * @arr: the name of the array + * @n: the desired array element count + * + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two + * operations. It resizes the map which corresponds to the custom data + * section that contains the target array. As a side effect, the BTF info for + * the array is adjusted so that the array length is sized to cover the new + * data section size. The second operation is reassigning the skeleton pointer + * for that custom data section so that it points to the newly memory mapped + * region. + */ +#define RESIZE_ARRAY(__skel, elfsec, arr, n) \ + do { \ + size_t __sz; \ + bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \ + sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \ + (__skel)->elfsec##_##arr = \ + bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \ + } while (0) + +#include "user_exit_info.h" +#include "compat.h" + +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/src/45-scx-nest/include/scx/compat.bpf.h b/src/45-scx-nest/include/scx/compat.bpf.h new file mode 100644 index 00000000..e5afe9ef --- /dev/null +++ b/src/45-scx-nest/include/scx/compat.bpf.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#ifndef __SCX_COMPAT_BPF_H +#define __SCX_COMPAT_BPF_H + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + __type __ret = 0; \ + if (bpf_core_enum_value_exists(__type, __ent)) \ + __ret = __ent; \ + __ret; \ +}) + +/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ +#define __COMPAT_scx_bpf_task_cgroup(p) \ + (bpf_ksym_exists(scx_bpf_task_cgroup) ? 
\ + scx_bpf_task_cgroup((p)) : NULL) + +/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */ +#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? \ + scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0) +#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \ + scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0) +#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \ + scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) +#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \ + scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) + +/* + * Define sched_ext_ops. This may be expanded to define multiple variants for + * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). + */ +#define SCX_OPS_DEFINE(__name, ...) \ + SEC(".struct_ops.link") \ + struct sched_ext_ops __name = { \ + __VA_ARGS__, \ + }; + +#endif /* __SCX_COMPAT_BPF_H */ diff --git a/src/45-scx-nest/include/scx/compat.h b/src/45-scx-nest/include/scx/compat.h new file mode 100644 index 00000000..cc56ff9a --- /dev/null +++ b/src/45-scx-nest/include/scx/compat.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#ifndef __SCX_COMPAT_H +#define __SCX_COMPAT_H + +#include +#include +#include +#include + +struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); + +static inline void __COMPAT_load_vmlinux_btf(void) +{ + if (!__COMPAT_vmlinux_btf) { + __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); + SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); + } +} + +static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) +{ + const struct btf_type *t; + const char *n; + s32 tid; + int i; + + __COMPAT_load_vmlinux_btf(); + + tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); + if (tid < 0) + return false; + + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); + + if (btf_is_enum(t)) { + struct btf_enum *e = btf_enum(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, name)) { + *v = e[i].val; + return true; + } + } + } else if (btf_is_enum64(t)) { + struct btf_enum64 *e = btf_enum64(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, name)) { + *v = btf_enum64_value(&e[i]); + return true; + } + } + } + + return false; +} + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + u64 __val = 0; \ + __COMPAT_read_enum(__type, __ent, &__val); \ + __val; \ +}) + +static inline bool __COMPAT_has_ksym(const char *ksym) +{ + __COMPAT_load_vmlinux_btf(); + return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0; +} + +static inline bool __COMPAT_struct_has_field(const char *type, const char *field) +{ + const struct btf_type *t; + const struct btf_member *m; + const char *n; + s32 tid; + int i; + + __COMPAT_load_vmlinux_btf(); + tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); 
+ if (tid < 0) + return false; + + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); + + m = btf_members(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, field)) + return true; + } + + return false; +} + +#define SCX_OPS_SWITCH_PARTIAL \ + __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") + +static inline long scx_hotplug_seq(void) +{ + int fd; + char buf[32]; + ssize_t len; + long val; + + fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); + if (fd < 0) + return -ENOENT; + + len = read(fd, buf, sizeof(buf) - 1); + SCX_BUG_ON(len <= 0, "read failed (%ld)", len); + buf[len] = 0; + close(fd); + + val = strtoul(buf, NULL, 10); + SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); + + return val; +} + +/* + * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() + * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load + * and attach it, backward compatibility is automatically maintained where + * reasonable. + * + * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is + * the current minimum required kernel version. + */ +#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ + struct __scx_name *__skel; \ + \ + SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \ + "sched_ext_ops.dump() missing, kernel too old?"); \ + \ + __skel = __scx_name##__open(); \ + SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ + __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ + __skel; \ +}) + +#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ + UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ + SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ +}) + +/* + * New versions of bpftool now emit additional link placeholders for BPF maps, + * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps + * automatically, assumming libbpf is recent enough (v1.5+). Old libbpf will do + * nothing with those links and won't attempt to auto-attach maps. + * + * To maintain compatibility with older libbpf while avoiding trying to attach + * twice, disable the autoattach feature on newer libbpf. + */ +#if LIBBPF_MAJOR_VERSION > 1 || \ + (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5) +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \ + bpf_map__set_autoattach((__skel)->maps.__ops_name, false) +#else +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0) +#endif + +#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ + struct bpf_link *__link; \ + __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \ + SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ + __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ + SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ + __link; \ +}) + +#endif /* __SCX_COMPAT_H */ diff --git a/src/45-scx-nest/include/scx/user_exit_info.h b/src/45-scx-nest/include/scx/user_exit_info.h new file mode 100644 index 00000000..8ce27344 --- /dev/null +++ b/src/45-scx-nest/include/scx/user_exit_info.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#ifdef __bpf__ + +#ifdef LSP +#include "../vmlinux/vmlinux.h" +#else +#include "vmlinux.h" +#endif +#include + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#else /* !__bpf__ */ + +#include +#include + +/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ +#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ + u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ + (__skel)->rodata->__uei_name##_dump_len = __len; \ + RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ +}) + +#define UEI_EXITED(__skel, __uei_name) ({ \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ +}) + +#define UEI_REPORT(__skel, __uei_name) ({ \ + struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ + if (__uei_dump[0] != '\0') { \ + fputs("\nDEBUG DUMP\n", stderr); \ + fputs("================================================================================\n\n", stderr); \ + fputs(__uei_dump, stderr); \ + fputs("\n================================================================================\n\n", stderr); \ + } \ + fprintf(stderr, "EXIT: %s", __uei->reason); \ + if (__uei->msg[0] != '\0') \ + fprintf(stderr, " (%s)", __uei->msg); \ + fputs("\n", stderr); \ + __uei->exit_code; \ +}) + +/* + * We can't import vmlinux.h while compiling user C code. Let's duplicate + * scx_exit_code definition. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +enum uei_ecode_mask { + UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), + UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, + UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, +}; + +/* + * These macro interpret the ecode returned from UEI_REPORT(). 
+ */ +#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) +#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) +#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) + +#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ diff --git a/src/45-scx-nest/scx_nest.bpf.c b/src/45-scx-nest/scx_nest.bpf.c new file mode 100644 index 00000000..9ce113a9 --- /dev/null +++ b/src/45-scx-nest/scx_nest.bpf.c @@ -0,0 +1,654 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * As described in [0], a Nest scheduler which encourages task placement on + * cores that are likely to be running at higher frequency, based upon recent usage. + * + * [0]: https://hal.inria.fr/hal-03612592/file/paper.pdf + * + * It operates as a global weighted vtime scheduler (similarly to CFS), while + * using the Nest algorithm to choose idle cores at wakup time. + * + * It also demonstrates the following niceties. + * + * - More robust task placement policies. + * - Termination notification for userspace. + * + * While rather simple, this scheduler should work reasonably well on CPUs with + * a uniform L3 cache topology. While preemption is not implemented, the fact + * that the scheduling queue is shared across all CPUs means that whatever is + * at the front of the queue is likely to be executed fairly quickly given + * enough number of CPUs. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include + +#include "scx_nest.h" + +#define TASK_DEAD 0x00000080 + +char _license[] SEC("license") = "GPL"; + +enum { + FALLBACK_DSQ_ID = 0, + MSEC_PER_SEC = 1000LLU, + USEC_PER_MSEC = 1000LLU, + NSEC_PER_USEC = 1000LLU, + NSEC_PER_MSEC = USEC_PER_MSEC * NSEC_PER_USEC, + USEC_PER_SEC = USEC_PER_MSEC * MSEC_PER_SEC, + NSEC_PER_SEC = NSEC_PER_USEC * USEC_PER_SEC, +}; + +#define CLOCK_BOOTTIME 7 +#define NUMA_NO_NODE -1 + +const volatile u64 p_remove_ns = 2 * NSEC_PER_MSEC; +const volatile u64 r_max = 5; +const volatile u64 r_impatient = 2; +const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile bool find_fully_idle = false; +const volatile u64 sampling_cadence_ns = 1 * NSEC_PER_SEC; +const volatile u64 r_depth = 5; + +// Used for stats tracking. May be stale at any given time. +u64 stats_primary_mask, stats_reserved_mask, stats_other_mask, stats_idle_mask; + +// Used for internal tracking. +static s32 nr_reserved; + +static u64 vtime_now; +UEI_DEFINE(uei); + +extern unsigned long CONFIG_HZ __kconfig; + +/* Per-task scheduling context */ +struct task_ctx { + /* + * A temporary cpumask for calculating a task's primary and reserve + * mask. + */ + struct bpf_cpumask __kptr *tmp_mask; + + /* + * The number of times that a task observes that its previous core is + * not idle. If this occurs r_impatient times in a row, a core is + * attempted to be retrieved from either the reserve nest, or the + * fallback nest. + */ + u32 prev_misses; + + /* + * A core that the task is "attached" to, meaning the last core that it + * executed on at least twice in a row, and the core that it first + * tries to migrate to on wakeup. The task only migrates to the + * attached core if it is idle and in the primary nest. + */ + s32 attached_core; + + /* + * The last core that the task executed on. This is used to determine + * if the task should attach to the core that it will execute on next. 
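+ * A task only becomes "attached" to a core after running on it twice in a
+ * row; see update_attached() below, which compares prev_cpu against the
+ * newly selected CPU before updating attached_core.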
+ */ + s32 prev_cpu; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +struct pcpu_ctx { + /* The timer used to compact the core from the primary nest. */ + struct bpf_timer timer; + + /* Whether the current core has been scheduled for compaction. */ + bool scheduled_compaction; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1024); + __type(key, s32); + __type(value, struct pcpu_ctx); +} pcpu_ctxs SEC(".maps"); + +struct stats_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct stats_timer); +} stats_timer SEC(".maps"); + +const volatile u32 nr_cpus = 1; /* !0 for veristat, set during init. */ + +private(NESTS) struct bpf_cpumask __kptr *primary_cpumask; +private(NESTS) struct bpf_cpumask __kptr *reserve_cpumask; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, NEST_STAT(NR)); +} stats SEC(".maps"); + + +static __always_inline void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static __always_inline void +try_make_core_reserved(s32 cpu, struct bpf_cpumask * reserved, bool promotion) +{ + s32 tmp_nr_reserved; + + /* + * This check is racy, but that's OK. If we incorrectly fail to promote + * a core to reserve, it's because another context added or removed a + * core from reserved in this small window. It will balance out over + * subsequent wakeups. + */ + tmp_nr_reserved = nr_reserved; + if (tmp_nr_reserved < r_max) { + /* + * It's possible that we could exceed r_max for a time here, + * but that should balance out as more cores are either demoted + * or fail to be promoted into the reserve nest. + */ + __sync_fetch_and_add(&nr_reserved, 1); + bpf_cpumask_set_cpu(cpu, reserved); + if (promotion) + stat_inc(NEST_STAT(PROMOTED_TO_RESERVED)); + else + stat_inc(NEST_STAT(DEMOTED_TO_RESERVED)); + } else { + bpf_cpumask_clear_cpu(cpu, reserved); + stat_inc(NEST_STAT(RESERVED_AT_CAPACITY)); + } +} + +static void update_attached(struct task_ctx *tctx, s32 prev_cpu, s32 new_cpu) +{ + if (tctx->prev_cpu == new_cpu) + tctx->attached_core = new_cpu; + tctx->prev_cpu = prev_cpu; +} + +static int compact_primary_core(void *map, int *key, struct bpf_timer *timer) +{ + struct bpf_cpumask *primary, *reserve; + s32 cpu = bpf_get_smp_processor_id(); + struct pcpu_ctx *pcpu_ctx; + + stat_inc(NEST_STAT(CALLBACK_COMPACTED)); + /* + * If we made it to this callback, it means that the timer callback was + * never cancelled, and so the core needs to be demoted from the + * primary nest. 
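+ * Demotion simply clears this CPU from the primary cpumask and then tries
+ * to place it in the reserve nest via try_make_core_reserved(), which
+ * leaves the core outside both nests if the reserve nest is already at
+ * r_max capacity.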
+ */ + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); + if (!pcpu_ctx) { + scx_bpf_error("Couldn't lookup pcpu ctx"); + return 0; + } + bpf_rcu_read_lock(); + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + scx_bpf_error("Couldn't find primary or reserve"); + bpf_rcu_read_unlock(); + return 0; + } + + bpf_cpumask_clear_cpu(cpu, primary); + try_make_core_reserved(cpu, reserve, false); + bpf_rcu_read_unlock(); + pcpu_ctx->scheduled_compaction = false; + return 0; +} + +s32 BPF_STRUCT_OPS(nest_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) +{ + struct bpf_cpumask *p_mask, *primary, *reserve; + s32 cpu; + struct task_ctx *tctx; + struct pcpu_ctx *pcpu_ctx; + bool direct_to_primary = false, reset_impatient = true; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) + return -ENOENT; + + bpf_rcu_read_lock(); + p_mask = tctx->tmp_mask; + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!p_mask || !primary || !reserve) { + bpf_rcu_read_unlock(); + return -ENOENT; + } + + tctx->prev_cpu = prev_cpu; + + bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary)); + + /* First try to wake the task on its attached core. */ + if (bpf_cpumask_test_cpu(tctx->attached_core, cast_mask(p_mask)) && + scx_bpf_test_and_clear_cpu_idle(tctx->attached_core)) { + cpu = tctx->attached_core; + stat_inc(NEST_STAT(WAKEUP_ATTACHED)); + goto migrate_primary; + } + + /* + * Try to stay on the previous core if it's in the primary set, and + * there's no hypertwin. If the previous core is the core the task is + * attached to, don't bother as we already just tried that above. + */ + if (prev_cpu != tctx->attached_core && + bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + stat_inc(NEST_STAT(WAKEUP_PREV_PRIMARY)); + goto migrate_primary; + } + + if (find_fully_idle) { + /* Then try any fully idle core in primary. */ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_PRIMARY)); + goto migrate_primary; + } + } + + /* Then try _any_ idle core in primary, even if its hypertwin is active. */ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_PRIMARY)); + goto migrate_primary; + } + + if (r_impatient > 0 && ++tctx->prev_misses >= r_impatient) { + direct_to_primary = true; + tctx->prev_misses = 0; + stat_inc(NEST_STAT(TASK_IMPATIENT)); + } + + reset_impatient = false; + + /* Then try any fully idle core in reserve. */ + bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(reserve)); + if (find_fully_idle) { + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_RESERVE)); + goto promote_to_primary; + } + } + + /* Then try _any_ idle core in reserve, even if its hypertwin is active. */ + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); + if (cpu >= 0) { + stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_RESERVE)); + goto promote_to_primary; + } + + /* Then try _any_ idle core in the task's cpumask. */ + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) { + /* + * We found a core that (we didn't _think_) is in any nest. + * This means that we need to either promote the core to the + * reserve nest, or if we're going direct to primary due to + * r_impatient being exceeded, promote directly to primary. 
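+ * The direct_to_primary flag computed above determines which of those two
+ * promotions the checks below perform.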
+ * + * We have to do one final check here to see if the core is in + * the primary or reserved cpumask because we could potentially + * race with the core changing states between AND'ing the + * primary and reserve masks with p->cpus_ptr above, and + * atomically reserving it from the idle mask with + * scx_bpf_pick_idle_cpu(). This is also technically true of + * the checks above, but in all of those cases we just put the + * core directly into the primary mask so it's not really that + * big of a problem. Here, we want to make sure that we don't + * accidentally put a core into the reserve nest that was e.g. + * already in the primary nest. This is unlikely, but we check + * for it on what should be a relatively cold path regardless. + */ + stat_inc(NEST_STAT(WAKEUP_IDLE_OTHER)); + if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) + goto migrate_primary; + else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) + goto promote_to_primary; + else if (direct_to_primary) + goto promote_to_primary; + else + try_make_core_reserved(cpu, reserve, true); + bpf_rcu_read_unlock(); + return cpu; + } + + bpf_rcu_read_unlock(); + return prev_cpu; + +promote_to_primary: + stat_inc(NEST_STAT(PROMOTED_TO_PRIMARY)); +migrate_primary: + if (reset_impatient) + tctx->prev_misses = 0; + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); + if (pcpu_ctx) { + if (pcpu_ctx->scheduled_compaction) { + if (bpf_timer_cancel(&pcpu_ctx->timer) < 0) + scx_bpf_error("Failed to cancel pcpu timer"); + if (bpf_timer_set_callback(&pcpu_ctx->timer, compact_primary_core)) + scx_bpf_error("Failed to re-arm pcpu timer"); + pcpu_ctx->scheduled_compaction = false; + stat_inc(NEST_STAT(CANCELLED_COMPACTION)); + } + } else { + scx_bpf_error("Failed to lookup pcpu ctx"); + } + bpf_cpumask_set_cpu(cpu, primary); + /* + * Check to see whether the CPU is in the reserved nest. This can + * happen if the core is compacted concurrently with us trying to place + * the currently-waking task onto it. Similarly, this is the expected + * state of the core if we found the core in the reserve nest and are + * promoting it. + * + * We don't have to worry about racing with any other waking task here + * because we've atomically reserved the core with (some variant of) + * scx_bpf_pick_idle_cpu(). + */ + if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) { + __sync_sub_and_fetch(&nr_reserved, 1); + bpf_cpumask_clear_cpu(cpu, reserve); + } + bpf_rcu_read_unlock(); + update_attached(tctx, prev_cpu, cpu); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); + return cpu; +} + +void BPF_STRUCT_OPS(nest_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *tctx; + u64 vtime = p->scx.dsq_vtime; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Unable to find task ctx"); + return; + } + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. 
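+ * That is, a task's vtime may lag vtime_now by at most slice_ns; without
+ * this clamp, a task that slept for a long time would wake up with a large
+ * vtime credit and be unfairly favored over already-queued tasks.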
+ */ + if (vtime_before(vtime, vtime_now - slice_ns)) + vtime = vtime_now - slice_ns; + + scx_bpf_dispatch_vtime(p, FALLBACK_DSQ_ID, slice_ns, vtime, + enq_flags); +} + +void BPF_STRUCT_OPS(nest_dispatch, s32 cpu, struct task_struct *prev) +{ + struct pcpu_ctx *pcpu_ctx; + struct bpf_cpumask *primary, *reserve; + s32 key = cpu; + bool in_primary; + + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + scx_bpf_error("No primary or reserve cpumask"); + return; + } + + pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); + if (!pcpu_ctx) { + scx_bpf_error("Failed to lookup pcpu ctx"); + return; + } + + if (!scx_bpf_consume(FALLBACK_DSQ_ID)) { + in_primary = bpf_cpumask_test_cpu(cpu, cast_mask(primary)); + + if (prev && (prev->scx.flags & SCX_TASK_QUEUED) && in_primary) { + scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, slice_ns, 0); + return; + } + + stat_inc(NEST_STAT(NOT_CONSUMED)); + if (in_primary) { + /* + * Immediately demote a primary core if the previous + * task on it is dying + * + * Note that we elect to not compact the "first" CPU in + * the mask so as to encourage at least one core to + * remain in the nest. It would be better to check for + * whether there is only one core remaining in the + * nest, but BPF doesn't yet have a kfunc for querying + * cpumask weight. + */ + if ((prev && prev->__state == TASK_DEAD) && + (cpu != bpf_cpumask_first(cast_mask(primary)))) { + stat_inc(NEST_STAT(EAGERLY_COMPACTED)); + bpf_cpumask_clear_cpu(cpu, primary); + try_make_core_reserved(cpu, reserve, false); + } else { + pcpu_ctx->scheduled_compaction = true; + /* + * The core isn't being used anymore. Set a + * timer to remove the core from the nest in + * p_remove if it's still unused by that point. + */ + bpf_timer_start(&pcpu_ctx->timer, p_remove_ns, + BPF_F_TIMER_CPU_PIN); + stat_inc(NEST_STAT(SCHEDULED_COMPACTION)); + } + } + return; + } + stat_inc(NEST_STAT(CONSUMED)); +} + +void BPF_STRUCT_OPS(nest_running, struct task_struct *p) +{ + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(nest_stopping, struct task_struct *p, bool runnable) +{ + /* scale the execution time by the inverse of the weight and charge */ + p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight; +} + +s32 BPF_STRUCT_OPS(nest_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct task_ctx *tctx; + struct bpf_cpumask *cpumask; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. 
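+ * BPF_LOCAL_STORAGE_GET_F_CREATE below allocates the task-local storage on
+ * first use. We also allocate the per-task tmp_mask kptr here and start the
+ * task with attached_core and prev_cpu set to -1 (not attached anywhere yet).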
+ */ + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!tctx) + return -ENOMEM; + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + cpumask = bpf_kptr_xchg(&tctx->tmp_mask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + tctx->attached_core = -1; + tctx->prev_cpu = -1; + + return 0; +} + +void BPF_STRUCT_OPS(nest_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +static int stats_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + s32 cpu; + struct bpf_cpumask *primary, *reserve; + const struct cpumask *idle; + stats_primary_mask = 0; + stats_reserved_mask = 0; + stats_other_mask = 0; + stats_idle_mask = 0; + long err; + + bpf_rcu_read_lock(); + primary = primary_cpumask; + reserve = reserve_cpumask; + if (!primary || !reserve) { + bpf_rcu_read_unlock(); + scx_bpf_error("Failed to lookup primary or reserve"); + return 0; + } + + idle = scx_bpf_get_idle_cpumask(); + bpf_for(cpu, 0, nr_cpus) { + if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) + stats_primary_mask |= (1ULL << cpu); + else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) + stats_reserved_mask |= (1ULL << cpu); + else + stats_other_mask |= (1ULL << cpu); + + if (bpf_cpumask_test_cpu(cpu, idle)) + stats_idle_mask |= (1ULL << cpu); + } + bpf_rcu_read_unlock(); + scx_bpf_put_idle_cpumask(idle); + + err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); + if (err) + scx_bpf_error("Failed to arm stats timer"); + + return 0; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(nest_init) +{ + struct bpf_cpumask *cpumask; + s32 cpu; + int err; + struct bpf_timer *timer; + u32 key = 0; + + err = scx_bpf_create_dsq(FALLBACK_DSQ_ID, NUMA_NO_NODE); + if (err) { + scx_bpf_error("Failed to create fallback DSQ"); + return err; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + bpf_cpumask_clear(cpumask); + cpumask = bpf_kptr_xchg(&primary_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + bpf_cpumask_clear(cpumask); + cpumask = bpf_kptr_xchg(&reserve_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + bpf_for(cpu, 0, nr_cpus) { + s32 key = cpu; + struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); + + if (!ctx) { + scx_bpf_error("Failed to lookup pcpu_ctx"); + return -ENOENT; + } + ctx->scheduled_compaction = false; + if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME)) { + scx_bpf_error("Failed to initialize pcpu timer"); + return -EINVAL; + } + err = bpf_timer_set_callback(&ctx->timer, compact_primary_core); + if (err) { + scx_bpf_error("Failed to set pcpu timer callback"); + return -EINVAL; + } + } + + timer = bpf_map_lookup_elem(&stats_timer, &key); + if (!timer) { + scx_bpf_error("Failed to lookup central timer"); + return -ESRCH; + } + bpf_timer_init(timer, &stats_timer, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, stats_timerfn); + err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); + if (err) + scx_bpf_error("Failed to arm stats timer"); + + return err; +} + +void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(nest_ops, + .select_cpu = (void *)nest_select_cpu, + .enqueue = (void *)nest_enqueue, + .dispatch = (void *)nest_dispatch, + .running = (void *)nest_running, + .stopping = (void *)nest_stopping, + .init_task = (void *)nest_init_task, + .enable = (void *)nest_enable, + .init = (void *)nest_init, + .exit = (void 
*)nest_exit, + .flags = 0, + .name = "nest"); + diff --git a/src/45-scx-nest/scx_nest.c b/src/45-scx-nest/scx_nest.c new file mode 100644 index 00000000..90be55ad --- /dev/null +++ b/src/45-scx-nest/scx_nest.c @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include + +#include "scx_nest.skel.h" +#include "scx_nest.h" + +#define SAMPLING_CADENCE_S 2 + +const char help_fmt[] = +"A Nest sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-p] [-d DELAY] [-m ] [-i ITERS]\n" +"\n" +" -d DELAY_US Delay (us), before removing an idle core from the primary nest (default 2000us / 2ms)\n" +" -m R_MAX Maximum number of cores in the reserve nest (default 5)\n" +" -i ITERS Number of successive placement failures tolerated before trying to aggressively expand primary nest (default 2), or 0 to disable\n" +" -s SLICE_US Override slice duration in us (default 20000us / 20ms)\n" +" -I First try to find a fully idle core, and then any idle core, when searching nests. Default behavior is to ignore hypertwins and check for any idle core.\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int nest) +{ + exit_req = 1; +} + +struct nest_stat { + const char *label; + enum nest_stat_group group; + enum nest_stat_idx idx; +}; + +#define NEST_ST(__stat, __grp, __desc) { \ + .label = #__stat, \ + .group = __grp, \ + .idx = NEST_STAT(__stat) \ +}, +static struct nest_stat nest_stats[NEST_STAT(NR)] = { +#include "scx_nest_stats_table.h" +}; +#undef NEST_ST + +static void read_stats(struct scx_nest *skel, u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + u64 cnts[NEST_STAT(NR)][nr_cpus]; + u32 idx; + + memset(stats, 0, sizeof(stats[0]) * NEST_STAT(NR)); + + for (idx = 0; idx < NEST_STAT(NR); idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +static void print_underline(const char *str) +{ + char buf[64]; + size_t len; + + len = strlen(str); + memset(buf, '-', len); + buf[len] = '\0'; + printf("\n\n%s\n%s\n", str, buf); +} + +static void print_stat_grp(enum nest_stat_group grp) +{ + const char *group; + + switch (grp) { + case STAT_GRP_WAKEUP: + group = "Wakeup stats"; + break; + case STAT_GRP_NEST: + group = "Nest stats"; + break; + case STAT_GRP_CONSUME: + group = "Consume stats"; + break; + default: + group = "Unknown stats"; + break; + } + + print_underline(group); +} + +static void print_active_nests(const struct scx_nest *skel) +{ + u64 primary = skel->bss->stats_primary_mask; + u64 reserved = skel->bss->stats_reserved_mask; + u64 other = skel->bss->stats_other_mask; + u64 idle = skel->bss->stats_idle_mask; + u32 nr_cpus = skel->rodata->nr_cpus, cpu; + int idx; + char cpus[nr_cpus + 1]; + + memset(cpus, 0, nr_cpus + 1); + print_underline("Masks"); + for (idx = 0; idx < 4; idx++) { + const char *mask_str; + u64 mask, total = 0; + + memset(cpus, '-', nr_cpus); + if (idx == 0) { + mask_str 
= "PRIMARY"; + mask = primary; + } else if (idx == 1) { + mask_str = "RESERVED"; + mask = reserved; + } else if (idx == 2) { + mask_str = "OTHER"; + mask = other; + } else { + mask_str = "IDLE"; + mask = idle; + } + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (mask & (1ULL << cpu)) { + cpus[cpu] = '*'; + total++; + } + } + printf("%-9s(%2" PRIu64 "): | %s |\n", mask_str, total, cpus); + } +} + +int main(int argc, char **argv) +{ + struct scx_nest *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(nest_ops, scx_nest); + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + skel->rodata->sampling_cadence_ns = SAMPLING_CADENCE_S * 1000 * 1000 * 1000; + + while ((opt = getopt(argc, argv, "d:m:i:Is:vh")) != -1) { + switch (opt) { + case 'd': + skel->rodata->p_remove_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'm': + skel->rodata->r_max = strtoull(optarg, NULL, 0); + break; + case 'i': + skel->rodata->r_impatient = strtoull(optarg, NULL, 0); + break; + case 'I': + skel->rodata->find_fully_idle = true; + break; + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, nest_ops, scx_nest, uei); + link = SCX_OPS_ATTACH(skel, nest_ops, scx_nest); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + u64 stats[NEST_STAT(NR)]; + enum nest_stat_idx i; + enum nest_stat_group last_grp = -1; + + read_stats(skel, stats); + for (i = 0; i < NEST_STAT(NR); i++) { + struct nest_stat *nest_stat; + + nest_stat = &nest_stats[i]; + if (nest_stat->group != last_grp) { + print_stat_grp(nest_stat->group); + last_grp = nest_stat->group; + } + printf("%s=%" PRIu64 "\n", nest_stat->label, stats[nest_stat->idx]); + } + printf("\n"); + print_active_nests(skel); + printf("\n"); + printf("\n"); + printf("\n"); + fflush(stdout); + sleep(SAMPLING_CADENCE_S); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_nest__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/src/45-scx-nest/scx_nest.h b/src/45-scx-nest/scx_nest.h new file mode 100644 index 00000000..060444f8 --- /dev/null +++ b/src/45-scx-nest/scx_nest.h @@ -0,0 +1,18 @@ +#ifndef __SCX_NEST_H +#define __SCX_NEST_H + +enum nest_stat_group { + STAT_GRP_WAKEUP, + STAT_GRP_NEST, + STAT_GRP_CONSUME, +}; + +#define NEST_STAT(__stat) BPFSTAT_##__stat +#define NEST_ST(__stat, __grp, __desc) NEST_STAT(__stat), +enum nest_stat_idx { +#include "scx_nest_stats_table.h" + NEST_ST(NR, 0, 0) +}; +#undef NEST_ST + +#endif /* __SCX_NEST_H */ diff --git a/src/45-scx-nest/scx_nest_stats_table.h b/src/45-scx-nest/scx_nest_stats_table.h new file mode 100644 index 00000000..6625f705 --- /dev/null +++ b/src/45-scx-nest/scx_nest_stats_table.h @@ -0,0 +1,20 @@ +NEST_ST(WAKEUP_ATTACHED, STAT_GRP_WAKEUP, "Attached CPU was idle, and in primary nest") +NEST_ST(WAKEUP_PREV_PRIMARY, STAT_GRP_WAKEUP, "Previous CPU was idle, and in primary nest") +NEST_ST(WAKEUP_FULLY_IDLE_PRIMARY, STAT_GRP_WAKEUP, "Woken up to fully idle primary nest core") +NEST_ST(WAKEUP_ANY_IDLE_PRIMARY, STAT_GRP_WAKEUP, "Woken up to idle logical primary nest core") +NEST_ST(WAKEUP_FULLY_IDLE_RESERVE, STAT_GRP_WAKEUP, "Woken up to fully idle reserve nest core") +NEST_ST(WAKEUP_ANY_IDLE_RESERVE, STAT_GRP_WAKEUP, "Woken up to idle logical 
reserve nest core") +NEST_ST(WAKEUP_IDLE_OTHER, STAT_GRP_WAKEUP, "Woken to any idle logical core in p->cpus_ptr") + +NEST_ST(TASK_IMPATIENT, STAT_GRP_NEST, "A task was found to be impatient") +NEST_ST(PROMOTED_TO_PRIMARY, STAT_GRP_NEST, "A core was promoted into the primary nest") +NEST_ST(PROMOTED_TO_RESERVED, STAT_GRP_NEST, "A core was promoted into the reserve nest") +NEST_ST(DEMOTED_TO_RESERVED, STAT_GRP_NEST, "A core was demoted into the reserve nest") +NEST_ST(RESERVED_AT_CAPACITY, STAT_GRP_NEST, "Reserved nest was at capacity") +NEST_ST(SCHEDULED_COMPACTION, STAT_GRP_NEST, "Scheduled a primary core to be compacted") +NEST_ST(CANCELLED_COMPACTION, STAT_GRP_NEST, "Cancelled a primary core from being compacted at task wakeup time") +NEST_ST(EAGERLY_COMPACTED, STAT_GRP_NEST, "A core was compacted in ops.dispatch()") +NEST_ST(CALLBACK_COMPACTED, STAT_GRP_NEST, "A core was compacted in the scheduled timer callback") + +NEST_ST(CONSUMED, STAT_GRP_CONSUME, "A task was consumed from the global DSQ") +NEST_ST(NOT_CONSUMED, STAT_GRP_CONSUME, "There was no task in the global DSQ") diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 61c9ebec..91652a96 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -67,6 +67,7 @@ Security: Scheduler: - [lesson 44-scx-simple](44-scx-simple/README.md) Introduction to the BPF Scheduler +- [lesson 45-scx-nest](45-scx-nest/README.md) Implementing the `scx_nest` Scheduler Other: diff --git a/src/SUMMARY.zh.md b/src/SUMMARY.zh.md index cad39d3e..47bce76d 100644 --- a/src/SUMMARY.zh.md +++ b/src/SUMMARY.zh.md @@ -61,7 +61,9 @@ Android: - [lesson 34-syscall](34-syscall/README.zh.md) eBPF 开发实践:使用 eBPF 修改系统调用参数 调度器: -- [lesson 44-scx-simple](44-scx-simple/README.zh.md) None +- [lesson 44-scx-simple](44-scx-simple/README.zh.md) eBPF 教程:BPF 调度器入门 +- [lesson 45-scx-nest](45-scx-nest/README.zh.md) eBPF 示例教程:实现 `scx_nest` 调度器 + 其他: - [lesson 35-user-ringbuf](35-user-ringbuf/README.zh.md) eBPF开发实践:使用 user ring buffer 向内核异步发送信息 diff --git a/src/scripts/generate_toc.py b/src/scripts/generate_toc.py index 6f0435f6..53c5a4d3 100644 --- a/src/scripts/generate_toc.py +++ b/src/scripts/generate_toc.py @@ -212,8 +212,8 @@ def sort_key(directory_name): # Example usage base_directory = "/root/bpf-developer-tutorial/src/" # Replace with the actual base directory -project_root = "/root/bpf-developer-tutorial/src/" # The root of the project -# toc_output = generate_toc(base_directory, project_root) -toc_output = generate_toc_cn(base_directory, project_root) +project_root = "/root/bpf-developer-tutorial/" # The root of the project +toc_output = generate_toc(base_directory, project_root) +# toc_output = generate_toc_cn(base_directory, project_root) # Output the TOC print(toc_output)