  * This software may be used and distributed according to the terms of the
  * GNU General Public License version 2.
  *
- * This header adds L3 cache awareness to scx_mitosis by defining BPF
- * maps for CPU-to-L3 domain mappings. It provides functions to
+ * This header helps add L3 cache awareness to scx_mitosis by defining
+ * maps and functions for managing CPU-to-L3 domain mappings. It provides code to
  * recalculate per-L3 CPU counts within cells and implements weighted
  * random L3 selection for tasks. It also tracks work-stealing
  * statistics for cross-L3 task migrations.
 #include "mitosis.bpf.h"
 #include "intf.h"
 
-// It's also an option to just compute this from the cpu_to_l3 map.
-struct l3_cpu_mask {
-	unsigned long cpumask[CPUMASK_LONG_ENTRIES];
-};
+typedef u32 l3_id_t;
+#define L3_INVALID ((l3_id_t)~0u)
+
+// Configure how aggressively we steal work.
+// When a task is detected as a steal candidate, skip it this many times.
+// On a web-server workload, a value of 100 reduced the steal count by ~90%.
+#ifdef MITOSIS_ENABLE_STEALING
+#define PREVENT_N_STEALS 0
+#endif
 
 /* Work stealing statistics map - accessible from both BPF and userspace */
 struct steal_stats_map {
@@ -38,27 +43,46 @@ struct cpu_to_l3_map {
 struct l3_to_cpus_map {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__type(key, u32);
-	__type(value, struct l3_cpu_mask);
+	__type(value, struct cpumask);
 	__uint(max_entries, MAX_L3S);
 };
 
-extern struct cpu_to_l3_map cpu_to_l3 SEC(".maps");
-extern struct l3_to_cpus_map l3_to_cpus SEC(".maps");
-extern struct steal_stats_map steal_stats SEC(".maps");
+extern struct cpu_to_l3_map cpu_to_l3;
+extern struct l3_to_cpus_map l3_to_cpus;
+extern struct steal_stats_map steal_stats;
+
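The maps themselves are defined in the main BPF object; this header now only declares them extern, so the SEC(".maps") annotations presumably live with the definitions. Userspace is expected to fill cpu_to_l3 from the machine's cache topology before the scheduler runs. As a rough illustration only (the C loader, the skeleton name "mitosis_bpf", and this helper are assumptions for the sketch, not part of the commit), the mapping could be derived from sysfs:

/* Illustrative only: one way a C loader could populate cpu_to_l3 from
 * sysfs cache topology. Skeleton and helper names are hypothetical. */
#include <stdio.h>
#include <bpf/libbpf.h>
#include "mitosis.bpf.skel.h"

static int fill_cpu_to_l3(struct mitosis_bpf *skel, unsigned int nr_cpus)
{
	for (unsigned int cpu = 0; cpu < nr_cpus; cpu++) {
		char path[128];
		unsigned int l3;

		/* cache/index3 is the L3 on typical x86 systems; a robust
		 * loader would check the "level" file instead of assuming. */
		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%u/cache/index3/id", cpu);
		FILE *f = fopen(path, "r");
		if (!f)
			return -1;
		if (fscanf(f, "%u", &l3) != 1) {
			fclose(f);
			return -1;
		}
		fclose(f);

		if (bpf_map__update_elem(skel->maps.cpu_to_l3, &cpu, sizeof(cpu),
					 &l3, sizeof(l3), BPF_ANY))
			return -1;
	}
	return 0;
}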
+static inline bool l3_is_valid(u32 l3_id) {
+	if (l3_id == L3_INVALID)
+		return false;
+
+	return l3_id < MAX_L3S;
+}
+
+static inline void init_task_l3(struct task_ctx *tctx) {
+	tctx->l3 = L3_INVALID;
+
+#ifdef MITOSIS_ENABLE_STEALING
+	tctx->pending_l3 = L3_INVALID;
+	tctx->steal_count = 0;
+	tctx->last_stolen_at = 0;
+	tctx->steals_prevented = 0;
+#endif
+
+}
 
 static inline const struct cpumask *lookup_l3_cpumask(u32 l3)
 {
-	struct l3_cpu_mask *mask;
+	struct cpumask *mask;
 
 	if (!(mask = bpf_map_lookup_elem(&l3_to_cpus, &l3))) {
 		scx_bpf_error("no l3 cpumask, l3: %d, %p", l3, &l3_to_cpus);
 		return NULL;
 	}
 
-	return (const struct cpumask *)mask;
+	return mask;
 }
 
-/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes (no persistent kptrs). */
+/* Recompute cell->l3_cpu_cnt[] after cell cpumask changes */
 static __always_inline void recalc_cell_l3_counts(u32 cell_idx)
 {
 	struct cell *cell = lookup_cell(cell_idx);
@@ -89,7 +113,6 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx)
 			continue;
 		}
 
-		/* ok: dst is bpf_cpumask*, sources are (RCU cpumask*, plain cpumask*) */
 		bpf_cpumask_and(tmp, cell_mask, l3_mask);
 
 		u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp);
@@ -113,24 +136,24 @@ static __always_inline void recalc_cell_l3_counts(u32 cell_idx)
  * have higher probability of being selected.
  *
  * @cell_id: The cell ID to select an L3 from
- * @return: L3 ID on success, INVALID_L3_ID on error, or 0 as fallback
+ * @return: L3 ID on success, L3_INVALID on error
  */
 static inline s32 pick_l3_for_task(u32 cell_id)
 {
 	struct cell *cell;
 	u32 l3, target, cur = 0;
-	s32 ret = INVALID_L3_ID;
+	s32 ret = L3_INVALID;
 
 	/* Look up the cell structure */
 	if (!(cell = lookup_cell(cell_id)))
-		return INVALID_L3_ID;
+		return L3_INVALID;
 
 	/* Handle case where cell has no CPUs assigned yet */
 	if (!cell->cpu_cnt) {
 		scx_bpf_error(
 			"pick_l3_for_task: cell %d has no CPUs accounted yet",
 			cell_id);
-		return INVALID_L3_ID;
+		return L3_INVALID;
 	}
 
 	/* Generate random target value in range [0, cpu_cnt) */
@@ -148,3 +171,100 @@ static inline s32 pick_l3_for_task(u32 cell_id)
 	}
 	return ret;
 }
+
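The selection loop itself is unchanged by this commit, so the hunk above ends just before it. For orientation only, a weighted pick over cell->l3_cpu_cnt[] can be done roughly as follows; this is an illustrative sketch, not the elided code:

/* Illustrative sketch of the weighted pick; not the code elided above. */
target = bpf_get_prandom_u32() % cell->cpu_cnt;

bpf_for(l3, 0, nr_l3)
{
	if (l3 >= MAX_L3S)
		break;
	cur += cell->l3_cpu_cnt[l3];
	/* The first L3 whose cumulative CPU count passes the random target
	 * wins, so L3s contributing more CPUs to the cell are chosen
	 * proportionally more often. */
	if (target < cur) {
		ret = l3;
		break;
	}
}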
+#ifdef MITOSIS_ENABLE_STEALING
+
+static inline bool try_stealing_this_task(struct task_ctx *task_ctx,
+					  s32 local_l3, u64 candidate_dsq)
+{
+	// Attempt the steal; it can fail because we are racing other CPUs.
+	if (!scx_bpf_dsq_move_to_local(candidate_dsq))
+		return false;
+
+	// We got the task!
+	task_ctx->steal_count++;
+	task_ctx->last_stolen_at = scx_bpf_now();
+	/* Retag to the thief L3 (the one for this CPU) */
+	task_ctx->pending_l3 = local_l3;
+	task_ctx->steals_prevented = 0;
+
+	/* Increment the steal counter in the shared map */
+	u32 key = 0;
+	u64 *count = bpf_map_lookup_elem(&steal_stats, &key);
+	// NOTE: A shared counter could get expensive, but steals should be rare; use a per-CPU map if it matters.
+	if (count)
+		__sync_fetch_and_add(count, 1);
+
+	return true;
+}
+
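The counter bumped above lands in steal_stats, which the map comment earlier notes is shared with userspace. Assuming the map holds a single u64 at key 0 (as the lookup above implies) and a libbpf skeleton handle named skel (an assumption for this sketch), userspace could report it like so:

/* Illustrative only: userspace side reading the aggregate steal counter. */
__u32 key = 0;
__u64 steals = 0;

if (!bpf_map__lookup_elem(skel->maps.steal_stats, &key, sizeof(key),
			  &steals, sizeof(steals), 0))
	printf("tasks stolen: %llu\n", (unsigned long long)steals);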
+/* Work stealing:
+ * Scan sibling (cell, L3) DSQs in the same cell and steal the first queued task if it can run on this CPU.
+ */
+static inline bool try_stealing_work(u32 cell, s32 local_l3)
+{
+	if (!l3_is_valid(local_l3))
+		scx_bpf_error("try_stealing_work: invalid local_l3");
+
+	struct cell *cell_ptr = lookup_cell(cell);
+	if (!cell_ptr)
+		scx_bpf_error("try_stealing_work: invalid cell");
+
+	// Loop over all other L3s, looking for a queued task to steal
+	u32 i;
+	bpf_for(i, 1, nr_l3)
+	{
+		// Start with the next L3 to spread out the load
+		u32 candidate_l3 = (local_l3 + i) % nr_l3;
+
+		// Prevent the compiler from optimizing away the bounds check
+		// below, so the verifier knows the array read will be safe
+		barrier_var(candidate_l3);
+
+		if (candidate_l3 >= MAX_L3S)
+			continue;
+
+		// Skip L3s that are not present in this cell
+		// Note: recheck cell_ptr for the verifier
+		if (cell_ptr && cell_ptr->l3_cpu_cnt[candidate_l3] == 0)
+			continue;
+
+		u64 candidate_dsq = get_cell_l3_dsq_id(cell, candidate_l3);
+
+		struct task_struct *task = NULL;
+		struct task_ctx *task_ctx;
+		// Only needed to satisfy the verifier
+		bool found_task = false;
+
+		// Optimization: skipping empty DSQs here is cheaper than constructing an iterator
+		// Not redundant with the found_task check below (we can race with consumers)
+		if (!scx_bpf_dsq_nr_queued(candidate_dsq))
+			continue;
+
+		// Just a trick for peeking the head element
+		bpf_for_each(scx_dsq, task, candidate_dsq, 0)
+		{
+			task_ctx = lookup_task_ctx(task);
+			found_task = (task_ctx != NULL);
+			break;
+		}
+
+		// No task? Try the next L3
+		if (!found_task)
+			continue;
+
+		// This knob throttles stealing.
+		// TODO: make it runtime configurable
+		if (task_ctx->steals_prevented++ < PREVENT_N_STEALS) {
+			continue;
+		}
+
+		if (!try_stealing_this_task(task_ctx, local_l3, candidate_dsq))
+			continue;
+
+		// Success, we got a task (though not necessarily the one we peeked at... race)
+		return true;
+	}
+	return false;
+}
+#endif
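The call site for try_stealing_work() is not part of this header. Presumably it sits in the dispatch path and runs when a CPU's own (cell, L3) DSQ is empty; the sketch below shows the general shape, with the callback name and the two cpu-lookup helpers being assumptions rather than code from this commit:

/* Illustrative sketch of a caller; helper names below are hypothetical. */
void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)
{
	u32 cell = lookup_cpu_cell(cpu);	/* hypothetical helper */
	s32 local_l3 = lookup_cpu_l3(cpu);	/* hypothetical helper */

	/* Run work from this CPU's own (cell, L3) queue first. */
	if (scx_bpf_dsq_move_to_local(get_cell_l3_dsq_id(cell, local_l3)))
		return;

#ifdef MITOSIS_ENABLE_STEALING
	/* Nothing local: try to pull a queued task from a sibling L3. */
	try_stealing_work(cell, local_l3);
#endif
}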